From f6cff3427fe7af62a688d3f2a8219720379b8a3a Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 19 Dec 2024 23:29:29 -0800 Subject: [PATCH 01/35] Rename EP instance context as session_context --- .../providers/openvino/backend_manager.cc | 82 ++++----- .../core/providers/openvino/backend_manager.h | 7 +- .../core/providers/openvino/backend_utils.cc | 16 +- .../core/providers/openvino/backend_utils.h | 4 +- .../openvino/backends/backend_factory.cc | 6 +- .../openvino/backends/basic_backend.cc | 164 +++++++++--------- .../openvino/backends/basic_backend.h | 4 +- .../core/providers/openvino/contexts.h | 2 +- .../core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 64 +++---- .../openvino/openvino_execution_provider.h | 2 +- 11 files changed, 176 insertions(+), 177 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index b079e3794c4cc..1796256a23441 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -21,8 +21,8 @@ namespace onnxruntime { namespace openvino_ep { -GlobalContext& BackendManager::GetGlobalContext() { - return global_context_; +SessionContext& BackendManager::GetSessionContext() { + return session_context_; } ov::CompiledModel& BackendManager::GetOVCompiledModel() { @@ -30,17 +30,17 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() { return (ov_ptr); } -BackendManager::BackendManager(const GlobalContext& global_context, +BackendManager::BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ep_ctx_handle_) { - global_context_ = global_context; + session_context_ = session_context; - openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_.OpenVINO_Version.at(1)); + openvino_sdk_version_ = std::to_string(session_context_.OpenVINO_Version.at(0)) + "." + + std::to_string(session_context_.OpenVINO_Version.at(1)); if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) { - if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, global_context_.ep_context_embed_mode) != Status::OK()) + if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, session_context_.ep_context_embed_mode) != Status::OK()) ORT_THROW("Import blob from model failed"); } @@ -74,19 +74,19 @@ BackendManager::BackendManager(const GlobalContext& global_context, if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } - std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; + std::string device_type = session_context_.device_type; if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; - if ((GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos) && - !GetGlobalContext().disable_dynamic_shapes) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos) && + !session_context_.disable_dynamic_shapes) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. 
" << "Creating backend Dynamic Shapes"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { @@ -95,7 +95,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; } else { - // Only cache model_proto in global to rewrite the model with input shapes at runtime. + // Only cache model_proto in session context to rewrite the model with input shapes at runtime. // For dynamic backend creation model_proto_ = std::move(model_proto); } @@ -109,13 +109,13 @@ BackendManager::BackendManager(const GlobalContext& global_context, // OV NPU plugin is supported with fallback to OV CPU upon compilation failures. try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback && + !session_context_.disable_cpu_fallback && !ep_ctx_handle_.IsValidOVEPCtxGraph(); #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; @@ -124,11 +124,11 @@ BackendManager::BackendManager(const GlobalContext& global_context, LOGS_DEFAULT(VERBOSE) << exception_str; LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; + session_context_.device_type = "CPU"; + session_context_.precision_str = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { @@ -162,7 +162,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } } } - if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (session_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -177,7 +177,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, // the EPContext node. Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, const logging::Logger& logger) { - if (GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { + if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. " "Cannot export blobs of dynamic models that request static shape inference. 
" @@ -189,19 +189,19 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie auto compiled_model = concrete_backend_->GetOVCompiledModel(); std::string graph_name = ""; // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!global_context_.cache_dir.empty()) { - graph_name = global_context_.cache_dir; + if (!session_context_.cache_dir.empty()) { + graph_name = session_context_.cache_dir; } else { - graph_name = global_context_.onnx_model_path_name; + graph_name = session_context_.onnx_model_path_name; // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = global_context_.onnx_model_path_name.find_last_of("."); + size_t dot = session_context_.onnx_model_path_name.find_last_of("."); graph_name = graph_name.substr(0, dot); if (dot != std::string::npos) graph_name += "_ctx.onnx"; } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob - if (global_context_.ep_context_embed_mode) { + if (session_context_.ep_context_embed_mode) { std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); @@ -223,7 +223,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, graph_name, logger, - global_context_.ep_context_embed_mode, + session_context_.ep_context_embed_mode, std::move(model_blob_str), openvino_sdk_version_)); @@ -342,8 +342,8 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, }; // QDQ stripping enabled only for the NPU - if (global_context_.device_type.find("NPU") != std::string::npos && - global_context_.enable_qdq_optimizer && + if (session_context_.device_type.find("NPU") != std::string::npos && + session_context_.enable_qdq_optimizer && IsQDQGraph(subgraph)) { 
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; @@ -351,7 +351,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -361,7 +361,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(global_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -453,13 +453,13 @@ void BackendManager::Compute(OrtKernelContext* context) { // by rewriting the model to static shaped model at runtime based on input shape. // disable_dynamic_shapes is always set to true for OV NPU plugin. 
if (subgraph_context_.has_dynamic_input_shape && - !GetGlobalContext().disable_dynamic_shapes && - (GetGlobalContext().device_type.find("CPU") != std::string::npos || - GetGlobalContext().device_type.find("GPU") != std::string::npos)) { + !session_context_.disable_dynamic_shapes && + (session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { concrete_backend_->Infer(context); } else if (subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); - auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type); std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { @@ -470,7 +470,7 @@ void BackendManager::Compute(OrtKernelContext* context) { auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (const OnnxRuntimeException& ex) { @@ -479,17 +479,17 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."; ORT_THROW(ex.what()); #else - if (GetGlobalContext().device_type.find("NPU") != std::string::npos && - !GetGlobalContext().disable_cpu_fallback) { + if (session_context_.device_type.find("NPU") != std::string::npos && + !session_context_.disable_cpu_fallback) { LOGS_DEFAULT(WARNING) << ex.what(); LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; - GetGlobalContext().device_type = "CPU"; - GetGlobalContext().precision_str = "FP32"; - key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + session_context_.device_type = "CPU"; + session_context_.precision_str = "FP32"; + key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, - GetGlobalContext(), + session_context_, subgraph_context_, ep_ctx_handle_); } catch (std::string const& msg) { diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 5ec462afd9d01..7ae647188976d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,15 +19,14 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const GlobalContext& global_context, + BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, EPCtxHandler& ctx_handle); void Compute(OrtKernelContext* context); void ShutdownBackendManager(); - void SetGlobalCotext(const GlobalContext& global_context); - GlobalContext& GetGlobalContext(); + SessionContext& GetSessionContext(); Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger); ov::CompiledModel& GetOVCompiledModel(); @@ -52,7 +51,7 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; - GlobalContext global_context_; + SessionContext session_context_; EPCtxHandler ep_ctx_handle_{}; std::string openvino_sdk_version_{}; }; diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 
b97736f2e124d..6c28db5803cb1 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,17 +40,17 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, +CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContext& session_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } const std::string model = model_proto.SerializeAsString(); try { - auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); + auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); // Check for Constant Folding - if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); @@ -129,13 +129,13 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape.get(), num_dims); } -int GetFirstAvailableDevice(GlobalContext& global_context) { +int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy while (i < 8) { - bool device = global_context.deviceAvailableList[i]; + bool device = session_context.deviceAvailableList[i]; if (device) { - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; break; } i++; @@ -144,9 +144,9 @@ int GetFirstAvailableDevice(GlobalContext& global_context) { // make all remaining devices free if (i == 8) { i = 0; - global_context.deviceAvailableList[i] = false; + session_context.deviceAvailableList[i] = false; for (int j = 1; j < 8; j++) { - 
global_context.deviceAvailableList[j] = true; + session_context.deviceAvailableList[j] = true; } } return i; diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 9d58e1ca73abb..4a500a3f146f7 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -34,7 +34,7 @@ bool IsDebugEnabled(); // Internal diagnostic function. bool IsCILogEnabled(); -int GetFirstAvailableDevice(GlobalContext& global_context); +int GetFirstAvailableDevice(SessionContext& session_context); void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor); @@ -62,7 +62,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, - const GlobalContext& global_context, + const SessionContext& session_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index b7e4aed6e7e18..4b3e57d087381 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -12,10 +12,10 @@ namespace openvino_ep { std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) { - std::string type = global_context.device_type; + std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || type.find("NPU") != std::string::npos || type.find("HETERO") != std::string::npos || @@ -23,7 +23,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr 
concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, global_context, subgraph_context, ep_ctx_handle); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, ep_ctx_handle); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 435ca83ff69d4..7dbd8bd5e979b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -21,11 +21,11 @@ namespace openvino_ep { using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle) - : global_context_(global_context), subgraph_context_(subgraph_context) { - std::string& hw_target = global_context_.device_type; + : session_context_(session_context), subgraph_context_(subgraph_context) { + std::string& hw_target = session_context_.device_type; is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph(); @@ -59,77 +59,77 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } try { - std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str; + std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - if (global_context.is_wholly_supported_graph) { // Full graph is supported + if (session_context.is_wholly_supported_graph) { // Full graph is supported #if defined(IO_BUFFER_ENABLED) if (is_ep_ctx_graph_) { std::istringstream model_stream(ep_ctx_handle.GetModelBlobString()); - exe_network_ = global_context_.ie_core.ImportModel(model_stream, + exe_network_ = session_context_.ie_core.ImportModel(model_stream, remote_context_, subgraph_context_.subgraph_name); - } else if ((global_context.device_type.find("GPU") != std::string::npos) && 
- (global_context_.context != nullptr)) { + } else if ((session_context.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr)) { LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - cl_context ctx = static_cast(global_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(global_context_.ie_core.Get(), ctx); - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + cl_context ctx = static_cast(session_context_.context); + remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name); } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision; + std::string prec_str = (session_context_.precision_str != "ACCURACY") ? 
session_context_.precision_str : session_context_.model_precision; if (is_ep_ctx_graph_) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), + exe_network_ = session_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), hw_target, device_config, - global_context_.ep_context_embed_mode, + session_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); - } else if (global_context_.export_ep_ctx_blob && + } else if (session_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && - !global_context_.has_external_weights) { + !session_context_.has_external_weights) { std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = global_context_.ie_core.Get().read_model(model, ov::Tensor()); + ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); } - exe_network_ = OVExeNetwork(global_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); - } else if (!global_context_.has_external_weights && + exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); + } else if (!session_context_.has_external_weights && (!subgraph_context_.has_dynamic_input_shape) && ((hw_target.find("AUTO") == std::string::npos) || - (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) { + (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimenstions const std::string model = model_proto->SerializeAsString(); - exe_network_ = global_context_.ie_core.CompileModel(model, + 
exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, global_context_, const_outputs_map_); - exe_network_ = global_context_.ie_core.CompileModel( + auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { ORT_THROW(msg); } - int num_infer_req = (global_context_.num_of_threads > 0) ? global_context_.num_of_threads : 1; + int num_infer_req = (session_context_.num_of_threads > 0) ? 
session_context_.num_of_threads : 1; inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req)); } @@ -146,21 +146,21 @@ bool BasicBackend::ValidateSubgraph(std::map= 2024) { + if (session_context_.precision_str.find("ACCURACY") != std::string::npos && + session_context_.device_type.find("GPU") != std::string::npos) { + if (session_context_.OpenVINO_Version.at(0) >= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { - if (global_context_.model_precision != "") - device_config.emplace(ov::hint::inference_precision(global_context_.model_precision)); + if (session_context_.model_precision != "") + device_config.emplace(ov::hint::inference_precision(session_context_.model_precision)); } } #ifndef NDEBUG @@ -171,10 +171,10 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { // Set a priority level for the current workload for preemption; default priority is "DEFAULT" // CPU Plugin doesn't support workload priority - if (global_context_.device_type.find("CPU") == std::string::npos) - device_config.emplace(ov::hint::model_priority(global_context_.model_priority)); + if (session_context_.device_type.find("CPU") == std::string::npos) + device_config.emplace(ov::hint::model_priority(session_context_.model_priority)); - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { std::pair device_property; device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER"); @@ -184,16 +184,16 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) - if (global_context_.export_ep_ctx_blob) { - 
global_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); + if (session_context_.export_ep_ctx_blob) { + session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif } - if (!global_context_.load_config.empty()) { - const std::map& target_config = global_context_.load_config; + if (!session_context_.load_config.empty()) { + const std::map& target_config = session_context_.load_config; - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { auto npuw_config = target_config.at("NPU"); // Check if "NPU_USE_NPUW" exists and is set to "YES" @@ -253,7 +253,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { continue; } if (is_supported_and_mutable(key, supported_properties)) { - global_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); + session_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); } else { LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version" @@ -264,26 +264,26 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { }; // Check if the device type is AUTO, HETERO, or MULTI - if (global_context_.device_type.find("AUTO") == 0 || - global_context_.device_type.find("HETERO") == 0 || - global_context_.device_type.find("MULTI") == 0) { + if (session_context_.device_type.find("AUTO") == 0 || + session_context_.device_type.find("HETERO") == 0 || + session_context_.device_type.find("MULTI") == 0) { // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"]) - auto individual_devices = parse_individual_devices(global_context_.device_type); + auto individual_devices = parse_individual_devices(session_context_.device_type); // Set properties only for individual devices (e.g., "CPU", "GPU") for (const std::string& device : individual_devices) { if 
(target_config.count(device)) { // Get supported properties for each individual device - auto device_properties = global_context_.ie_core.Get().get_property(device, ov::supported_properties); + auto device_properties = session_context_.ie_core.Get().get_property(device, ov::supported_properties); // Set properties for the device set_target_properties(device, target_config.at(device), device_properties); } } } else { - if (target_config.count(global_context_.device_type)) { - auto supported_properties = global_context_.ie_core.Get().get_property(global_context_.device_type, + if (target_config.count(session_context_.device_type)) { + auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, ov::supported_properties); - set_target_properties(global_context_.device_type, - target_config.at(global_context_.device_type), supported_properties); + set_target_properties(session_context_.device_type, + target_config.at(session_context_.device_type), supported_properties); } } } @@ -293,21 +293,21 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (is_ep_ctx_graph_) return; - if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) { + if (!session_context_.cache_dir.empty() && !session_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) { + if (session_context_.device_type.find("AUTO:GPU") != std::string::npos) { std::pair device_property; - device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir); + device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - global_context_.ie_core.SetCache(global_context_.cache_dir); + session_context_.ie_core.SetCache(session_context_.cache_dir); } } } void 
BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { - if (global_context_.enable_opencl_throttling == true && - global_context_.device_type.find("GPU") != std::string::npos) { + if (session_context_.enable_opencl_throttling == true && + session_context_.device_type.find("GPU") != std::string::npos) { LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device"; std::pair device_property; device_property = std::make_pair("PLUGIN_THROTTLE", "1"); @@ -318,28 +318,28 @@ void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) { void BasicBackend::EnableStreams() { // Return silently for NPU as it's currently treated as a read-only flag by the NPU plugin // and throws an exception for the same - if (global_context_.device_type.find("NPU") != std::string::npos) + if (session_context_.device_type.find("NPU") != std::string::npos) return; // Streams can be set only if the device is not one of AUTO, MULTI, or HETERO // Throw an exception if the user tries to set num_streams for these devices - if ((global_context_.device_type.find("MULTI") != std::string::npos) || - (global_context_.device_type.find("HETERO") != std::string::npos) || - (global_context_.device_type.find("AUTO") != std::string::npos)) { - if (global_context_.num_streams != 1) { + if ((session_context_.device_type.find("MULTI") != std::string::npos) || + (session_context_.device_type.find("HETERO") != std::string::npos) || + (session_context_.device_type.find("AUTO") != std::string::npos)) { + if (session_context_.num_streams != 1) { ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " + - std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); + std::to_string(session_context_.num_streams) + " for device " + session_context_.device_type); } // Do nothing } else { - global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams); + session_context_.ie_core.SetStreams(session_context_.device_type, 
session_context_.num_streams); } } void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { // inference_num_threads is applicable only for the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) - device_config.emplace(ov::inference_num_threads(global_context_.num_of_threads)); + if (session_context_.device_type.find("CPU") != std::string::npos) + device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads)); } // Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on @@ -370,9 +370,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && - !global_context_.disable_dynamic_shapes && - (global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + !session_context_.disable_dynamic_shapes && + (session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name)); auto tensor_info = tensor.GetTensorTypeAndShapeInfo(); auto tensor_shape = tensor_info.GetShape(); @@ -387,7 +387,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque const auto& input = graph_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device - if (global_context_.device_type.find("CPU") != std::string::npos) { + if (session_context_.device_type.find("CPU") != std::string::npos) { tensor_ptr = std::make_shared(input.get_element_type(), input_tensor_shape, (void*)tensor_data); } else { @@ -401,8 +401,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque ORT_THROW(msg); } } else { - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != 
std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { OVTensorPtr graph_input_blob; try { graph_input_blob = infer_request->GetTensor(input_name); @@ -434,7 +434,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } input_idx++; } - if (global_context_.device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); auto output_idx = 0; @@ -628,8 +628,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe " doesn't exist in the " "list of OpenVINO output tensor names"); } - if ((global_context_.device_type.find("CPU") != std::string::npos || - global_context_.device_type.find("GPU") != std::string::npos)) { + if ((session_context_.device_type.find("CPU") != std::string::npos || + session_context_.device_type.find("GPU") != std::string::npos)) { try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { @@ -703,8 +703,8 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { OVInferRequestPtr infer_request; infer_request = inferRequestsQueue_->getIdleRequest(); #ifdef IO_BUFFER_ENABLED - if ((global_context_.device_type.find("GPU") != std::string::npos) && - (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { + if ((session_context_.device_type.find("GPU") != std::string::npos) && + (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { @@ -748,7 +748,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { #ifndef IO_BUFFER_ENABLED // Printing performance counts is disabled when IO_BUFFER_ENABLED if (openvino_ep::backend_utils::IsDebugEnabled()) { 
inferRequestsQueue_->printstatus(); // Printing the elements of infer_requests_ vector pool only in debug mode - std::string& hw_target = global_context_.device_type; + std::string& hw_target = session_context_.device_type; printPerformanceCounts(std::move(infer_request_), std::cout, hw_target); } #endif diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 3fcf6e4384d52..0aab336ce909f 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -30,7 +30,7 @@ class InferRequestsQueue; class BasicBackend : public IBackend { public: BasicBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ep_ctx_handle); @@ -55,7 +55,7 @@ class BasicBackend : public IBackend { void CompleteAsyncInference(Ort::KernelContext& context, std::shared_ptr infer_request); - GlobalContext& global_context_; + SessionContext& session_context_; SubGraphContext subgraph_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f970bc7bc287..23256b8df6fd0 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -13,7 +13,7 @@ namespace onnxruntime { namespace openvino_ep { // Holds context applicable to the entire EP instance. 
-struct GlobalContext { +struct SessionContext { OVCore ie_core; bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 7a2d6f4e8cd69..6d4aad3aec919 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -21,7 +21,7 @@ class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, - GlobalContext& global_context, + SessionContext& session_context, const SubGraphContext& subgraph_context, EPCtxHandler& ctx_handle); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 72a188108adef..e5ffde62eeedb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -25,28 +25,28 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { InitProviderOrtApi(); - global_context_ = std::make_unique(); - global_context_->device_type = info.device_type_; - global_context_->precision_str = info.precision_; - global_context_->cache_dir = info.cache_dir_; - global_context_->load_config = info.load_config_; - global_context_->model_priority = info.model_priority_; - global_context_->num_streams = info.num_streams_; - global_context_->context = info.context_; - global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - global_context_->num_of_threads = info.num_of_threads_; - global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - global_context_->enable_qdq_optimizer = 
info.enable_qdq_optimizer_; - global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; + session_context_ = std::make_unique(); + session_context_->device_type = info.device_type_; + session_context_->precision_str = info.precision_; + session_context_->cache_dir = info.cache_dir_; + session_context_->load_config = info.load_config_; + session_context_->model_priority = info.model_priority_; + session_context_->num_streams = info.num_streams_; + session_context_->context = info.context_; + session_context_->enable_opencl_throttling = info.enable_opencl_throttling_; + session_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; + session_context_->num_of_threads = info.num_of_threads_; + session_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + session_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; + session_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; + session_context_->disable_cpu_fallback = info.disable_cpu_fallback_; + session_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in if (info.cache_dir_.empty()) { bool device_found = false; - std::vector available_devices = global_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = session_context_->ie_core.GetAvailableDevices(); // Checking for device_type configuration if (info.device_type_ != "") { if (info.device_type_.find("HETERO") != std::string::npos || @@ -85,8 +85,8 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(global_context_->OpenVINO_Version.at(0)) + "." 
+ - std::to_string(global_context_->OpenVINO_Version.at(1)); + std::string openvino_sdk_version = std::to_string(session_context_->OpenVINO_Version.at(0)) + "." + + std::to_string(session_context_->OpenVINO_Version.at(1)); // Check for valid ctx node and maintain state for validity if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) @@ -97,20 +97,20 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - global_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); + session_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - global_context_->onnx_opset_version = + session_context_->onnx_opset_version = graph_viewer.DomainToVersionMap().at(kOnnxDomain); - global_context_->model_precision = [&](const GraphViewer& graph_viewer) { + session_context_->model_precision = [&](const GraphViewer& graph_viewer) { // return empty if graph has no inputs or if types are not one of FP32/FP16 // else assume the type of the first input if (graph_viewer.GetInputs().empty()) { return ""; } else { auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (global_context_->precision_str == "ACCURACY" && - global_context_->device_type.find("GPU") != std::string::npos) { + if (session_context_->precision_str == "ACCURACY" && + session_context_->device_type.find("GPU") != std::string::npos) { if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { return "FP32"; } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { @@ -122,12 +122,12 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, }(graph_viewer); openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->enable_qdq_optimizer); + session_context_->device_type, + 
session_context_->enable_qdq_optimizer); result = obj.Execute(); - global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - global_context_->has_external_weights = obj.HasExternalWeights(); + session_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_->has_external_weights = obj.HasExternalWeights(); return result; } @@ -141,14 +141,14 @@ common::Status OpenVINOExecutionProvider::Compile( NodeComputeInfo compute_info; - global_context_->use_api_2 = true; + session_context_->use_api_2 = true; // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob std::shared_ptr backend_manager = - std::make_shared(*global_context_, + std::make_shared(*session_context_, fused_node, graph_body_viewer, *GetLogger(), @@ -189,11 +189,11 @@ common::Status OpenVINOExecutionProvider::Compile( #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (global_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_->device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - global_context_->ie_core.Get(), + session_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d5c22a4e2a9e4..26a67ba04756b 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -198,7 +198,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; #endif private: - 
std::unique_ptr global_context_; + std::unique_ptr session_context_; std::shared_ptr backend_manager_; openvino_ep::EPCtxHandler ep_ctx_handle_{}; }; From f170c88bbea5b0461251c5714d5f89225ce8217a Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Wed, 8 Jan 2025 00:56:07 -0800 Subject: [PATCH 02/35] Add support for GetEpContextNodes --- .../providers/openvino/backend_manager.cc | 132 +++++++++-------- .../core/providers/openvino/backend_manager.h | 3 +- .../core/providers/openvino/backend_utils.cc | 6 +- .../core/providers/openvino/backend_utils.h | 1 + .../openvino/backends/backend_factory.cc | 4 +- .../openvino/backends/basic_backend.cc | 55 ++++--- .../openvino/backends/basic_backend.h | 3 +- .../core/providers/openvino/contexts.h | 11 +- .../core/providers/openvino/ibackend.h | 5 +- .../openvino/onnx_ctx_model_helper.cc | 132 +++++++++-------- .../openvino/onnx_ctx_model_helper.h | 28 ++-- .../openvino/openvino_execution_provider.cc | 137 ++++++++---------- .../openvino/openvino_execution_provider.h | 13 +- .../core/providers/openvino/ov_interface.cc | 13 +- .../core/providers/openvino/ov_interface.h | 3 +- .../openvino/ov_versions/capability.cc | 56 +++++-- .../openvino/ov_versions/capability.h | 5 +- .../shared_library/provider_wrappedtypes.h | 3 +- .../core/session/provider_bridge_ort.cc | 3 +- 19 files changed, 334 insertions(+), 279 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 1796256a23441..04c1ffebb838d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -10,8 +10,10 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" +#include "core/providers/openvino/ov_versions/capability.h" #include "core/providers/openvino/contexts.h" #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/ibackend.h" @@ 
-34,15 +36,35 @@ BackendManager::BackendManager(const SessionContext& session_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ep_ctx_handle_) { - session_context_ = session_context; - - openvino_sdk_version_ = std::to_string(session_context_.OpenVINO_Version.at(0)) + "." + - std::to_string(session_context_.OpenVINO_Version.at(1)); - if (ep_ctx_handle_.CheckForOVEPCtxNode(subgraph, openvino_sdk_version_)) { - if (ep_ctx_handle_.ImportBlobFromEPCtxModel(subgraph, session_context_.ep_context_embed_mode) != Status::OK()) - ORT_THROW("Import blob from model failed"); - } + EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle), session_context_(session_context) { + subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); + + subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { + // return empty if graph has no inputs or if types are not one of FP32/FP16 + // else assume the type of the first input + if (graph_viewer.GetInputs().empty()) { + return ""; + } else { + auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (session_context_.precision_str == "ACCURACY" && + session_context_.device_type.find("GPU") != std::string::npos) { + if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { + return "FP32"; + } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { + return "FP16"; + } + } + } + return ""; + }(subgraph); + + openvino_ep::GetCapability obj(ep_ctx_handle_, + subgraph, + session_context_.device_type, + session_context_.enable_qdq_optimizer); + std::ignore = obj.Execute(); + subgraph_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + subgraph_context_.has_external_weights = obj.HasExternalWeights(); // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains 
initializers). @@ -70,8 +92,11 @@ BackendManager::BackendManager(const SessionContext& session_context, i++; } subgraph_context_.subgraph_name = fused_node.Name(); + ptr_stream_t model_stream; std::unique_ptr model_proto; - if (!ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (subgraph_context_.is_ep_ctx_graph) { + model_stream = ep_ctx_handle_.GetModelBlobStream(subgraph); + } else { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } std::string device_type = session_context_.device_type; @@ -88,7 +113,7 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -111,12 +136,12 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && !session_context_.disable_cpu_fallback && - !ep_ctx_handle_.IsValidOVEPCtxGraph(); + !subgraph_context_.is_ep_ctx_graph; #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; #else @@ -130,7 +155,7 @@ BackendManager::BackendManager(const SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } @@ -162,7 +187,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } } - if (session_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + if (session_context_.export_ep_ctx_blob && !subgraph_context_.is_ep_ctx_graph) { auto status = 
onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -185,23 +210,12 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW(exception_str); } - std::string model_blob_str; - auto compiled_model = concrete_backend_->GetOVCompiledModel(); - std::string graph_name = ""; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!session_context_.cache_dir.empty()) { - graph_name = session_context_.cache_dir; - } else { - graph_name = session_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - size_t dot = session_context_.onnx_model_path_name.find_last_of("."); - graph_name = graph_name.substr(0, dot); - if (dot != std::string::npos) graph_name += "_ctx.onnx"; - } - // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob + std::string model_blob_str; + auto compiled_model = concrete_backend_->GetOVCompiledModel(); if (session_context_.ep_context_embed_mode) { + // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); model_blob_str = std::move(model_blob_stream).str(); @@ -209,23 +223,30 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW("Model blob stream is empty after exporting the compiled model."); } } else { - // Remove extension so we can append suffix to form the complete name of output graph - auto blob_name = graph_name.substr(0, graph_name.find_last_of(".")); - std::ofstream blob_file(blob_name + ".blob", + // External blob + std::filesystem::path blob_filename; + // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability + if (!session_context_.cache_dir.empty()) { + blob_filename = session_context_.cache_dir; + } else { + blob_filename = graph_body_viewer.ModelPath(); 
+ } + const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + blob_filename = blob_filename.parent_path() / name; + blob_filename.replace_extension("blob"); + std::ofstream blob_file(blob_filename, std::ios::out | std::ios::trunc | std::ios::binary); if (!blob_file) { ORT_THROW("Unable to open file for epctx model dump."); } compiled_model.export_model(blob_file); - model_blob_str = blob_name + ".blob"; + model_blob_str = blob_filename.string(); } - ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, - graph_name, - logger, - session_context_.ep_context_embed_mode, - std::move(model_blob_str), - openvino_sdk_version_)); + ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, + subgraph_context_.subgraph_name, + session_context_.ep_context_embed_mode, + std::move(model_blob_str))); return Status::OK(); } @@ -296,27 +317,20 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } -static void DumpOpenVINOEPModel(std::string onnx_model_path_name, +static void DumpOpenVINOEPModel(const std::filesystem::path& onnx_model_path_name, ONNX_NAMESPACE::ModelProto* model_proto, const onnxruntime::Node& fused_node) { if (openvino_ep::backend_utils::IsDebugEnabled()) { - auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : std::move(onnx_model_path_name); -#ifdef _WIN32 - size_t slash = model_name.find_last_of("\\"); -#else - size_t slash = model_name.find_last_of("/"); -#endif - model_name = model_name.substr(slash + 1, std::string::npos); - size_t dot = model_name.find_last_of("."); - model_name = model_name.substr(0, dot); + auto model_name = onnx_model_path_name.empty() ? 
"unknown.onnx" : onnx_model_path_name.filename(); - std::string subgraph_name = fused_node.Name(); + const auto& subgraph_name = fused_node.Name(); size_t dash = subgraph_name.find_last_of("-"); - subgraph_name = subgraph_name.substr(dash, std::string::npos); - - const std::string name = model_name + subgraph_name + ".onnx"; + if (dash != std::string::npos) { + auto new_name = model_name.stem().string() + subgraph_name.substr(dash, std::string::npos); + model_name.replace_filename(new_name); + } - std::fstream dump(name, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream dump(model_name, std::ios::out | std::ios::trunc | std::ios::binary); model_proto->SerializeToOstream(dump); } } @@ -341,6 +355,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } }; + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU if (session_context_.device_type.find("NPU") != std::string::npos && session_context_.enable_qdq_optimizer && @@ -351,7 +366,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); - DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else { @@ -361,7 +376,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); subgraph.ToProto(*model_proto->mutable_graph(), true, true); print_model_proto_duration(); - DumpOpenVINOEPModel(session_context_.onnx_model_path_name, model_proto.get(), fused_node); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); return model_proto; } } @@ -463,6 +478,7 @@ void 
BackendManager::Compute(OrtKernelContext* context) { std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { + ptr_stream_t model_stream; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Creating dynamic backend for key: " << key; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " @@ -472,7 +488,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (const OnnxRuntimeException& ex) { // Build option disables fallback to CPU on compilation failures with NPU. #if defined(OPENVINO_DISABLE_NPU_FALLBACK) @@ -491,7 +507,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, - ep_ctx_handle_); + model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 7ae647188976d..f77f303c70991 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -51,9 +51,8 @@ class BackendManager { std::shared_ptr concrete_backend_; std::map> backend_map_; SubGraphContext subgraph_context_; + EPCtxHandler& ep_ctx_handle_; SessionContext session_context_; - EPCtxHandler ep_ctx_handle_{}; - std::string openvino_sdk_version_{}; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 6c28db5803cb1..84de5eb4f16f9 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,7 +40,9 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const 
SessionContext& session_context, +CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, + const SessionContext& session_context, + const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; @@ -50,7 +52,7 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContex auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); // Check for Constant Folding - if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 4a500a3f146f7..2765fe0e9b1c7 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -63,6 +63,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const SessionContext& session_context, + const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 4b3e57d087381..2fd9a7fa0a537 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,7 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle) { + 
ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || type.find("NPU") != std::string::npos || @@ -23,7 +23,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, ep_ctx_handle); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 7dbd8bd5e979b..bacf25effb0f3 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -23,12 +23,10 @@ using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle) + ptr_stream_t& model_stream) : session_context_(session_context), subgraph_context_(subgraph_context) { std::string& hw_target = session_context_.device_type; - is_ep_ctx_graph_ = ep_ctx_handle.IsValidOVEPCtxGraph(); - if (ValidateSubgraph(const_outputs_map_)) return; @@ -61,13 +59,12 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr try { std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - if (session_context.is_wholly_supported_graph) { // Full graph is supported + if (subgraph_context_.is_wholly_supported_graph) { // Full graph is supported #if defined(IO_BUFFER_ENABLED) - if (is_ep_ctx_graph_) { - std::istringstream model_stream(ep_ctx_handle.GetModelBlobString()); - exe_network_ = session_context_.ie_core.ImportModel(model_stream, - remote_context_, - subgraph_context_.subgraph_name); + if 
(subgraph_context_.is_ep_ctx_graph) { + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + remote_context_, + subgraph_context_.subgraph_name); } else if ((session_context.device_type.find("GPU") != std::string::npos) && (session_context_.context != nullptr)) { LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; @@ -82,28 +79,28 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : session_context_.model_precision; - if (is_ep_ctx_graph_) { + std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : subgraph_context_.model_precision; + if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), - hw_target, - device_config, - session_context_.ep_context_embed_mode, - subgraph_context_.subgraph_name); + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed } else if (session_context_.export_ep_ctx_blob && hw_target.find("NPU") != std::string::npos && - !session_context_.has_external_weights) { + !subgraph_context_.has_external_weights) { std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context.has_dynamic_input_shape) { + if (!subgraph_context_.has_dynamic_input_shape) { delete model_proto.release(); } ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); } exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, 
device_config)); - } else if (!session_context_.has_external_weights && + } else if (!subgraph_context_.has_external_weights && (!subgraph_context_.has_dynamic_input_shape) && ((hw_target.find("AUTO") == std::string::npos) || (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { @@ -111,17 +108,17 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Inputs with static dimenstions const std::string model = model_proto->SerializeAsString(); exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif } else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, session_context_, const_outputs_map_); + auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } @@ -159,8 +156,8 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY)); } else { - if (session_context_.model_precision != "") - device_config.emplace(ov::hint::inference_precision(session_context_.model_precision)); + if (!subgraph_context_.model_precision.empty()) + device_config.emplace(ov::hint::inference_precision(subgraph_context_.model_precision)); } } #ifndef NDEBUG @@ -281,7 +278,7 @@ void 
BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } else { if (target_config.count(session_context_.device_type)) { auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, - ov::supported_properties); + ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); } @@ -291,7 +288,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph - if (is_ep_ctx_graph_) return; + if (subgraph_context_.is_ep_ctx_graph) return; if (!session_context_.cache_dir.empty() && !session_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; @@ -300,7 +297,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - session_context_.ie_core.SetCache(session_context_.cache_dir); + session_context_.ie_core.SetCache(session_context_.cache_dir.string()); } } } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 0aab336ce909f..177784a71f575 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -32,7 +32,7 @@ class BasicBackend : public IBackend { BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ep_ctx_handle); + ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; ov::CompiledModel& GetOVCompiledModel() override { @@ -61,7 +61,6 @@ class BasicBackend : public IBackend { OVExeNetwork exe_network_; std::map> 
const_outputs_map_; std::unique_ptr inferRequestsQueue_; - bool is_ep_ctx_graph_{false}; #if defined IO_BUFFER_ENABLED OVRemoteContextPtr remote_context_; #endif diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 23256b8df6fd0..e9405b5ac5142 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { @@ -15,19 +16,16 @@ namespace openvino_ep { // Holds context applicable to the entire EP instance. struct SessionContext { OVCore ie_core; - bool is_wholly_supported_graph = false; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; bool ep_context_embed_mode = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; - bool has_external_weights = false; size_t num_of_threads; std::string device_type; std::string precision_str; - std::string model_precision; - std::string cache_dir; + std::filesystem::path cache_dir; std::map load_config; std::string model_priority = "DEFAULT"; int num_streams; @@ -38,6 +36,7 @@ struct SessionContext { void* context = 0; bool use_api_2; std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers + std::string openvino_sdk_version; }; // Holds context specific to subgraph. 
@@ -51,6 +50,10 @@ struct SubGraphContext { std::vector input_indexes; std::unordered_map input_names; std::unordered_map output_names; + bool is_wholly_supported_graph = false; + bool has_external_weights = false; + std::string model_precision; + bool is_ep_ctx_graph = false; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 6d4aad3aec919..0d440eee598d3 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -4,6 +4,7 @@ #pragma once #include +#include #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" @@ -16,14 +17,14 @@ class IBackend { virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; }; - +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - EPCtxHandler& ctx_handle); + ptr_stream_t& model_stream); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 6d159db3b390d..907650257c3f2 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -11,25 +11,45 @@ namespace onnxruntime { namespace openvino_ep { +EPCtxHandler::EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger) : openvino_sdk_version_(ov_sdk_version), logger_(logger) { + epctx_model_ = Model::Create("ovep_context_model", false, logger_); +} + /* Export the serialized blob string embedded onto an EPContext Node * along with other metadata necessary to validate the graph on import */ -Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& 
graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const { - auto& metadata = graph_viewer.GetGraph().GetModel().MetaData(); - auto model_build = graph_viewer.CreateModel(logger, metadata); - auto& graph_build = model_build->MainGraph(); +Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { + // Serialize modelproto to string + auto model_proto = epctx_model_->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + // Finally, dump the model + std::ofstream epctx_onnx_model(model_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); + } + + if (!model_proto->SerializeToOstream(epctx_onnx_model)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); + } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + + return Status::OK(); +} + +Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool ep_context_embed_mode, + std::string&& model_blob_str) const { + auto& graph = epctx_model_->MainGraph(); // Get graph inputs and outputs const auto& viewer_inputs = graph_viewer.GetInputs(); const auto& viewer_outputs = graph_viewer.GetOutputs(); std::vector inputs(viewer_inputs.size()), outputs(viewer_outputs.size()); - auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph_build.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; + auto transform_f = [&](const onnxruntime::NodeArg* iter) { return &graph.GetOrCreateNodeArg(iter->Name(), iter->TypeAsProto()); }; auto fill_vectors = [transform_f](auto& src, auto& dst) { std::transform(src.begin(), src.end(), dst.begin(), transform_f); }; @@ -60,7 +80,7 @@ Status 
EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, auto sdk_version_attr = ONNX_NAMESPACE::AttributeProto::Create(); sdk_version_attr->set_name(EP_SDK_VER); sdk_version_attr->set_type(onnx::AttributeProto_AttributeType_STRING); - sdk_version_attr->set_s(openvino_sdk_version); + sdk_version_attr->set_s(openvino_sdk_version_); node_attributes->emplace(EP_SDK_VER, std::move(*sdk_version_attr)); // source @@ -70,73 +90,65 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, source_attr->set_s(kOpenVINOExecutionProvider); node_attributes->emplace(SOURCE, std::move(*source_attr)); } + // Create EP context node - graph_build.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - ORT_ENFORCE(graph_build.Resolve().IsOK()); + graph.AddNode(graph_name, EPCONTEXT_OP, "", inputs, outputs, std::move(*node_attributes), kMSDomain); - { - // Serialize modelproto to string - auto model_proto = model_build->ToProto(); - model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); - - // Finally, dump the model - std::ofstream epctx_onnx_model(graph_name, - std::ios::out | std::ios::trunc | std::ios::binary); - if (!epctx_onnx_model) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to create epctx onnx model file"); - } - - if (!model_proto->SerializeToOstream(epctx_onnx_model)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize model to file"); - } - } - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; + ORT_ENFORCE(graph.Resolve().IsOK()); return Status::OK(); } -Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode) { - auto node = graph_viewer.GetNode(0); +std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer& graph_viewer) const { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + auto node = graph_viewer.GetNode(first_index); + ORT_ENFORCE(node != nullptr); auto& 
attrs = node->GetAttributes(); - ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - ep_cache_context_attribute_ = &attrs.at(EP_CACHE_CONTEXT); + ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 1); + const auto& ep_cache_context_attribute = attrs.at(EP_CACHE_CONTEXT); + const auto& cache_context = ep_cache_context_attribute.s(); - ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; - - is_valid_ep_ctx_graph_ = true; - return Status::OK(); -} + ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); + bool ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); -const std::string& EPCtxHandler::GetModelBlobStream() const { - static std::string empty; - if (ep_cache_context_attribute_ != nullptr) { - return ep_cache_context_attribute_->s(); + std::unique_ptr result; + if (ep_context_embed_mode) { + result.reset((std::istream*)new std::istringstream(cache_context)); } else { - return empty; + result.reset((std::istream*)new std::ifstream(cache_context, std::ios_base::binary | std::ios_base::in)); } + LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; + return result; } -bool EPCtxHandler::CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const { - for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) { - auto node = graph_viewer.GetNode(i); - auto& attrs = node->GetAttributes(); - - // Check for correct Op Type, EP SOURCE, and SDK version - if (node != nullptr && node->OpType() == EPCONTEXT_OP) { - if (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider) { - if (attrs.at(EP_SDK_VER).s() == openvino_sdk_version) { - return true; - } else { - ORT_THROW("[Invalid Graph] Versions of OpenVINO used to export blob (" + attrs.at(EP_SDK_VER).s() + - ") and current runtime (" + openvino_sdk_version + ") don't match."); - } - } +bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { + if (graph_viewer.NumberOfNodes() 
== 1) { + auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); + if (auto node = graph_viewer.GetNode(first_index); (node != nullptr) && CheckForOVEPCtxNode(*node)) { + return true; } } return false; } +bool EPCtxHandler::CheckForOVEPCtxNode(const Node& node) const { + // Check for correct Op Type, EP SOURCE, and SDK version + if (node.OpType() == EPCONTEXT_OP) { + auto& attrs = node.GetAttributes(); + bool result = (attrs.count(SOURCE) == 1) && (attrs.at(SOURCE).s() == kOpenVINOExecutionProvider); + result &= (attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_); + result &= attrs.count(EMBED_MODE) == 1; + result &= attrs.count(EP_CACHE_CONTEXT) == 1; + return result; + } + return false; +} + +InlinedVector EPCtxHandler::GetEPCtxNodes() const { + const auto& epctx_nodes{epctx_model_->MainGraph().Nodes()}; + return InlinedVector(epctx_nodes.begin(), epctx_nodes.end()); +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index caab33b7db775..7e5d5180b363b 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -22,22 +22,22 @@ static const char SOURCE[] = "source"; class EPCtxHandler { public: - EPCtxHandler() = default; - EPCtxHandler(const EPCtxHandler&) = delete; - Status ExportEPCtxModel(const GraphViewer& graph_viewer, - const std::string& graph_name, - const logging::Logger& logger, - const bool& ep_context_embed_mode, - std::string&& model_blob_str, - const std::string& openvino_sdk_version) const; - Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer, bool& ep_context_embed_mode); - bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; - bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } - const std::string& 
GetModelBlobStream() const; + EPCtxHandler(std::string ov_sdk_version, const logging::Logger& logger); + EPCtxHandler(const EPCtxHandler&) = delete; // No copy constructor + Status ExportEPCtxModel(const std::string& model_name); + bool CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const; + bool CheckForOVEPCtxNode(const Node& node) const; + Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, + const std::string& graph_name, + const bool ep_context_embed_mode, + std::string&& model_blob_str) const; + std::unique_ptr GetModelBlobStream(const GraphViewer& graph_viewer) const; + InlinedVector GetEPCtxNodes() const; private: - bool is_valid_ep_ctx_graph_{false}; - const onnx::AttributeProto* ep_cache_context_attribute_; + const std::string openvino_sdk_version_; + std::unique_ptr epctx_model_; + const logging::Logger& logger_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index e5ffde62eeedb..0da0813e9143e 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -20,33 +21,39 @@ #define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) namespace onnxruntime { +openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInfo& info) { + openvino_ep::SessionContext result = { + .enable_opencl_throttling = info.enable_opencl_throttling_, + .disable_dynamic_shapes = info.disable_dynamic_shapes_, + .ep_context_embed_mode = info.so_epctx_embed_mode_, + .export_ep_ctx_blob = info.export_ep_ctx_blob_, + .enable_qdq_optimizer = info.enable_qdq_optimizer_, + 
.disable_cpu_fallback = info.disable_cpu_fallback_, + .num_of_threads = info.num_of_threads_, + .device_type = info.device_type_, + .precision_str = info.precision_, + .cache_dir = info.cache_dir_, + .load_config = info.load_config_, + .model_priority = info.model_priority_, + .num_streams = info.num_streams_, + .context = info.context_, + .OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}, + .openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR), + }; + return result; +} OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider} { + : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, + session_context_{GetSessionContext(info)}, + ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} { InitProviderOrtApi(); - session_context_ = std::make_unique(); - session_context_->device_type = info.device_type_; - session_context_->precision_str = info.precision_; - session_context_->cache_dir = info.cache_dir_; - session_context_->load_config = info.load_config_; - session_context_->model_priority = info.model_priority_; - session_context_->num_streams = info.num_streams_; - session_context_->context = info.context_; - session_context_->enable_opencl_throttling = info.enable_opencl_throttling_; - session_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; - session_context_->num_of_threads = info.num_of_threads_; - session_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - session_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; - session_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; - session_context_->disable_cpu_fallback = info.disable_cpu_fallback_; - session_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; - // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of 
devices plugged in if (info.cache_dir_.empty()) { bool device_found = false; - std::vector available_devices = session_context_->ie_core.GetAvailableDevices(); + std::vector available_devices = session_context_.ie_core.GetAvailableDevices(); // Checking for device_type configuration if (info.device_type_ != "") { if (info.device_type_.find("HETERO") != std::string::npos || @@ -85,89 +92,62 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { std::vector> result; - std::string openvino_sdk_version = std::to_string(session_context_->OpenVINO_Version.at(0)) + "." + - std::to_string(session_context_->OpenVINO_Version.at(1)); - - // Check for valid ctx node and maintain state for validity - if (ep_ctx_handle_.CheckForOVEPCtxNode(graph_viewer, std::move(openvino_sdk_version))) - ORT_ENFORCE(graph_viewer.NumberOfNodes() == 1, - "[Invalid Graph] EPContext Model with OpenVINO compiled blob should not have more than one node."); - // Enable CI Logs if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - session_context_->onnx_model_path_name = graph_viewer.ModelPath().string(); - - session_context_->onnx_opset_version = - graph_viewer.DomainToVersionMap().at(kOnnxDomain); - - session_context_->model_precision = [&](const GraphViewer& graph_viewer) { - // return empty if graph has no inputs or if types are not one of FP32/FP16 - // else assume the type of the first input - if (graph_viewer.GetInputs().empty()) { - return ""; - } else { - auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (session_context_->precision_str == "ACCURACY" && - session_context_->device_type.find("GPU") != std::string::npos) { - if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { - return "FP32"; - } else if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16) { 
- return "FP16"; - } - } - } - return ""; - }(graph_viewer); - openvino_ep::GetCapability obj(graph_viewer, - session_context_->device_type, - session_context_->enable_qdq_optimizer); + openvino_ep::GetCapability obj(ep_ctx_handle_, + graph_viewer, + session_context_.device_type, + session_context_.enable_qdq_optimizer); result = obj.Execute(); - session_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - session_context_->has_external_weights = obj.HasExternalWeights(); - return result; } common::Status OpenVINOExecutionProvider::Compile( const std::vector& fused_nodes, std::vector& node_compute_funcs) { + auto& logger = *GetLogger(); + Status status = Status::OK(); + + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + session_context_.onnx_model_path_name = fused_nodes[0].filtered_graph.get().ModelPath().string(); + session_context_.onnx_opset_version = + fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; NodeComputeInfo compute_info; - session_context_->use_api_2 = true; + session_context_.use_api_2 = true; // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob - std::shared_ptr backend_manager = - std::make_shared(*session_context_, - fused_node, - graph_body_viewer, - *GetLogger(), - ep_ctx_handle_); - backend_manager_ = backend_manager; + auto& backend_manager = backend_managers_.emplace_back(session_context_, + fused_node, + graph_body_viewer, + logger, + ep_ctx_handle_); + compute_info.create_state_func = - [backend_manager](ComputeContext* context, 
FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); + [&backend_manager](ComputeContext* context, FunctionState* state) { + OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(backend_manager); p->allocate_func = context->allocate_func; p->destroy_func = context->release_func; p->allocator_handle = context->allocator_handle; - p->backend_manager = backend_manager; *state = static_cast(p); return 0; }; compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { - function_state->backend_manager->Compute(context); + function_state->backend_manager.Compute(context); } catch (const std::exception& ex) { return common::Status(common::ONNXRUNTIME, common::FAIL, ex.what()); } @@ -182,18 +162,22 @@ common::Status OpenVINOExecutionProvider::Compile( } }; node_compute_funcs.push_back(compute_info); + + if (!status.IsOK()) { + break; + } } - return Status::OK(); + return status; } #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - if (session_context_->device_type.find("NPU") != std::string::npos) { + if (session_context_.device_type.find("NPU") != std::string::npos) { AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - session_context_->ie_core.Get(), + session_context_.ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); @@ -232,8 +216,10 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::spanGetOVCompiledModel(); - ov_compiled_model.set_property(ov::workload_type(workload_type)); + for (auto& backend : backend_managers_) { + ov::CompiledModel& ov_compiled_model = backend.GetOVCompiledModel(); + ov_compiled_model.set_property(ov::workload_type(workload_type)); + } } } else { // Handle unknown options @@ -242,4 +228,9 @@ common::Status OpenVINOExecutionProvider::SetEpDynamicOptions(gsl::span 
OpenVINOExecutionProvider::GetEpContextNodes() const { + return ep_ctx_handle_.GetEPCtxNodes(); +} + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 26a67ba04756b..9b48d9e5ce3a3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -169,10 +169,11 @@ struct OpenVINOExecutionProviderInfo { }; struct OpenVINOEPFunctionState { + OpenVINOEPFunctionState(openvino_ep::BackendManager& bm) : backend_manager(bm) {} AllocateFunc allocate_func = nullptr; DestroyFunc destroy_func = nullptr; AllocatorHandle allocator_handle = nullptr; - std::shared_ptr backend_manager; + openvino_ep::BackendManager& backend_manager; }; // Logical device representation. @@ -194,13 +195,17 @@ class OpenVINOExecutionProvider : public IExecutionProvider { const void* GetExecutionHandle() const noexcept override { return nullptr; } + + const InlinedVector GetEpContextNodes() const override; + #ifdef USE_OVEP_NPU_MEMORY std::vector CreatePreferredAllocators() override; #endif private: - std::unique_ptr session_context_; - std::shared_ptr backend_manager_; - openvino_ep::EPCtxHandler ep_ctx_handle_{}; + openvino_ep::SessionContext session_context_; + std::list backend_managers_; // EP session owns the backend objects + + openvino_ep::EPCtxHandler ep_ctx_handle_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 12ab7ecede031..804db5b726fc5 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -109,22 +109,13 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, } } -OVExeNetwork OVCore::ImportModel(const std::string& model_string, +OVExeNetwork OVCore::ImportModel(std::istream& 
model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name) { try { ov::CompiledModel obj; - if (embed_mode) { - std::istringstream model_stream(model_string); - obj = oe.import_model(model_stream, hw_target, device_config); - } else { - std::ifstream modelStream(model_string, std::ios_base::binary | std::ios_base::in); - obj = oe.import_model(modelStream, - hw_target, - {}); - } + obj = oe.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index c3417003f8e1f..550c7962cca13 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -54,10 +54,9 @@ class OVCore { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(const std::string& model_string, + OVExeNetwork ImportModel(std::istream& model_stream, std::string hw_target, const ov::AnyMap& device_config, - bool embed_mode, std::string name); #ifdef IO_BUFFER_ENABLED OVExeNetwork CompileModel(std::shared_ptr& model, diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 9e62076ca8777..b9f01cc261f52 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" @@ -26,10 +27,12 @@ namespace onnxruntime { namespace openvino_ep { // Constructor -GetCapability::GetCapability(const GraphViewer& graph_viewer_param, +GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler, + const GraphViewer& graph_viewer_param, const 
std::string device_type_param, - const bool enable_qdq_optimizer) - : graph_viewer_(graph_viewer_param), device_type_(device_type_param) { + const bool enable_qdq_optimizer) : ep_ctx_handler_(ep_ctx_handler), + graph_viewer_(graph_viewer_param), + device_type_(device_type_param) { bool npu_qdq_optimizer_enabled = false; if (device_type_.find("NPU") != std::string::npos) { device_type_ = "CPU"; @@ -56,6 +59,42 @@ std::vector> GetCapability::Execute() { return result; } + auto Iterable2String = [](U& strings, const V& node_args) { + constexpr bool has_name = requires(V v) { + (*v.begin())->Name(); + }; + for (const auto& arg : node_args) { + if constexpr (has_name) { + strings.push_back(arg->Name()); + } else { + strings.push_back(arg); + } + } + }; + + // Check for EpContext nodes + const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder(); + for (const auto node_index : nodes) { + const auto& node = *graph_viewer_.GetNode(node_index); + if (ep_ctx_handler_.CheckForOVEPCtxNode(node)) { + std::vector inputs; + std::vector outputs; + + Iterable2String(inputs, node.InputDefs()); + Iterable2String(outputs, node.OutputDefs()); + + auto sub_graph = IndexedSubGraph::Create(); + sub_graph->Nodes().push_back(node_index); + auto meta_def = IndexedSubGraph_MetaDef::Create(); + meta_def->name() = node.Name(); + meta_def->domain() = kMSDomain; + meta_def->inputs() = inputs; + meta_def->outputs() = outputs; + sub_graph->SetMetaDef(std::move(meta_def)); + result.push_back(ComputeCapability::Create(std::move(sub_graph))); + } + } + // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc. 
std::unordered_set ng_required_initializers; @@ -75,8 +114,7 @@ std::vector> GetCapability::Execute() { std::vector inputs; std::vector outputs; // Fill inputs with names - std::for_each(graph_viewer_.GetInputs().begin(), graph_viewer_.GetInputs().end(), - [&inputs](const NodeArg* node_arg) { inputs.push_back(node_arg->Name()); }); + Iterable2String(inputs, graph_viewer_.GetInputs()); /* In scenarios, when there are no inputs or all inputs being initializers, ConstantFolding optimization in onnxruntime pre-computes the value.*/ @@ -84,8 +122,6 @@ std::vector> GetCapability::Execute() { return result; } - const std::vector& nodes = graph_viewer_.GetNodesInTopologicalOrder(); - const Node* node = graph_viewer_.GetNode(nodes[0]); // Handle cases where lone, reoccuring Ops in smaller models cannot be supported in OpenVINO @@ -105,12 +141,10 @@ std::vector> GetCapability::Execute() { } // Initializers need to be part of meta_def->inputs - std::for_each(ng_required_initializers.begin(), ng_required_initializers.end(), - [&inputs](const std::string& initializer) { inputs.push_back(initializer); }); + Iterable2String(inputs, ng_required_initializers); // Fill outputs with names - std::for_each(graph_viewer_.GetOutputs().begin(), graph_viewer_.GetOutputs().end(), - [&outputs](const NodeArg* node_arg) { outputs.push_back(node_arg->Name()); }); + Iterable2String(outputs, graph_viewer_.GetOutputs()); // Create and add this graph to result. 
AppendClusterToSubGraph(graph_viewer_.GetNodesInTopologicalOrder(), inputs, outputs, result); diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 2f87c4c73d892..364e79a76f154 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h @@ -6,12 +6,14 @@ #include #include #include "core/providers/openvino/ov_versions/data_ops.h" +#include "core/providers/openvino/onnx_ctx_model_helper.h" namespace onnxruntime { namespace openvino_ep { class GetCapability { private: + const EPCtxHandler& ep_ctx_handler_; const GraphViewer& graph_viewer_; std::string device_type_; DataOps* data_ops_; @@ -19,7 +21,8 @@ class GetCapability { bool has_external_weights_ = false; public: - GetCapability(const GraphViewer& graph_viewer_param, + GetCapability(const EPCtxHandler& ep_ctx_handler, + const GraphViewer& graph_viewer_param, const std::string device_type_param, const bool enable_qdq_optimizer); virtual std::vector> Execute(); diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index e434935343663..4feedd75f8004 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -991,7 +991,8 @@ struct Model final { const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger); } - static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) { + static std::unique_ptr Create(const std::string& graph_name, bool is_onnx_domain_only, + const logging::Logger& logger) { return g_host->Model__construct(graph_name, is_onnx_domain_only, logger); } static void operator 
delete(void* p) { g_host->Model__operator_delete(reinterpret_cast(p)); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d7c6dab72fde8..2644c8f6ffb36 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1179,7 +1179,8 @@ struct ProviderHostImpl : ProviderHost { const logging::Logger& logger) override { return std::make_unique(model_proto, model_path, local_registries, logger); } - std::unique_ptr Model__construct(const std::string& graph_name, bool is_onnx_domain_only, + std::unique_ptr Model__construct(const std::string& graph_name, + bool is_onnx_domain_only, const logging::Logger& logger) override { return std::make_unique(graph_name, is_onnx_domain_only, logger); } From 37cee3f04a48ce2589ed8828bd81001dea708fd3 Mon Sep 17 00:00:00 2001 From: saurabhkale117 Date: Thu, 9 Jan 2025 12:40:27 +0530 Subject: [PATCH 03/35] enable config option for ovep weight sharing --- onnxruntime/core/providers/openvino/contexts.h | 1 + .../core/providers/openvino/openvino_execution_provider.cc | 1 + .../core/providers/openvino/openvino_execution_provider.h | 6 ++++-- .../core/providers/openvino/openvino_provider_factory.cc | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index e9405b5ac5142..95954ae204047 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -19,6 +19,7 @@ struct SessionContext { bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; bool ep_context_embed_mode = false; + bool enable_ovep_weight_sharing = false; bool export_ep_ctx_blob = false; bool enable_qdq_optimizer = false; bool disable_cpu_fallback = false; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 0da0813e9143e..43924386eeee1 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -26,6 +26,7 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, .ep_context_embed_mode = info.so_epctx_embed_mode_, + .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_; .export_ep_ctx_blob = info.export_ep_ctx_blob_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, .disable_cpu_fallback = info.disable_cpu_fallback_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 9b48d9e5ce3a3..b5206c196cb60 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -91,6 +91,7 @@ struct OpenVINOExecutionProviderInfo { bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; bool so_epctx_embed_mode_{false}; + bool so_enable_ovep_weight_sharing_{false}; OpenVINOExecutionProviderInfo() = delete; @@ -102,7 +103,7 @@ struct OpenVINOExecutionProviderInfo { void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode) + bool so_epctx_embed_mode, , bool so_enable_ovep_weight_sharing) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), @@ -115,7 +116,8 @@ struct OpenVINOExecutionProviderInfo { export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode} { + 
so_epctx_embed_mode_{so_epctx_embed_mode}, + so_enable_ovep_weight_sharing_{so_enable_ovep_weight_sharing} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 5855cb594a08e..951b8223b4dbc 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -55,6 +55,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); + bool so_enable_ovep_weight_sharing = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; if (so_export_ep_ctx_blob && !so_cache_path.empty()) { cache_dir_ = std::move(so_cache_path); @@ -76,7 +77,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode); + so_disable_cpu_fallback, so_epctx_embed_mode, so_enable_ovep_weight_sharing); return std::make_unique(info); } From 409cb476af405db18d76a049f534e82f3ae06236 Mon Sep 17 00:00:00 2001 From: saurabhkale117 Date: Thu, 9 Jan 2025 13:45:11 +0530 Subject: [PATCH 04/35] add config option for ovep weight sharing --- .../core/providers/openvino/openvino_execution_provider.cc | 2 +- .../core/providers/openvino/openvino_execution_provider.h | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 43924386eeee1..817d0817cbfc6 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -26,7 +26,7 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, .ep_context_embed_mode = info.so_epctx_embed_mode_, - .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_; + .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_, .export_ep_ctx_blob = info.export_ep_ctx_blob_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, .disable_cpu_fallback = info.disable_cpu_fallback_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index b5206c196cb60..1b3990310fc61 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -103,7 +103,7 @@ struct OpenVINOExecutionProviderInfo { void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode, , bool so_enable_ovep_weight_sharing) + bool so_epctx_embed_mode, bool so_enable_ovep_weight_sharing) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), From 3b2b7e9a89b02ee868068867c12d89a887777c46 Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Wed, 8 Jan 2025 20:52:19 -0800 Subject: [PATCH 05/35] Refactor the conditional blocks in OVEP for compilation --- cmake/onnxruntime_providers_openvino.cmake | 2 +- 
.../core/providers/openvino/backend_utils.cc | 3 +- .../core/providers/openvino/backend_utils.h | 2 +- .../openvino/backends/basic_backend.cc | 89 +++++++++---------- .../openvino/openvino_provider_factory.cc | 8 ++ 5 files changed, 54 insertions(+), 50 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 01a0d3ce3badc..ff2d4b388e82e 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -30,7 +30,7 @@ endif() list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) - if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS})) + if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS}) AND onnxruntime_USE_OPENVINO_GPU) add_definitions(-DIO_BUFFER_ENABLED=1) list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS}) endif() diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 84de5eb4f16f9..c447a7847434a 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -40,14 +40,13 @@ struct static_cast_int64 { }; std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, +CreateOVModel(const std::string model, const SessionContext& session_context, const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; } - const std::string model = model_proto.SerializeAsString(); try { auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 2765fe0e9b1c7..0d7378072cb1b 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -61,7 +61,7 @@ void 
FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, size_t batch_slice_idx); std::shared_ptr -CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, +CreateOVModel(const std::string model, const SessionContext& session_context, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index bacf25effb0f3..e9882caa1372b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -57,71 +57,68 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } try { - std::string dev_prec = session_context.device_type + "_" + session_context_.precision_str; - - if (subgraph_context_.is_wholly_supported_graph) { // Full graph is supported + // IO_BUFFER is enabled on GPU HW. + // Pre-requisite is provider_option "context" must be set #if defined(IO_BUFFER_ENABLED) - if (subgraph_context_.is_ep_ctx_graph) { - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); - } else if ((session_context.device_type.find("GPU") != std::string::npos) && - (session_context_.context != nullptr)) { - LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - cl_context ctx = static_cast(session_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); - ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ie_cnn_network_, remote_context_, subgraph_context_.subgraph_name); - } else { - ie_cnn_network_ = CreateOVModel(model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ie_cnn_network_, hw_target, device_config, 
subgraph_context_.subgraph_name); + cl_context ctx = static_cast(session_context_.context); + remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + if (subgraph_context_.is_ep_ctx_graph) { + exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + remote_context_, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else { + std::shared_ptr ov_model; + { + const std::string model = model_proto->SerializeAsString(); + if (!subgraph_context.has_dynamic_input_shape) { + delete model_proto.release(); + } + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + } + LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; + exe_network_ = session_context_.ie_core.CompileModel( + ov_model, remote_context_, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED - std::string prec_str = (session_context_.precision_str != "ACCURACY") ? session_context_.precision_str : subgraph_context_.model_precision; + auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || + (session_context_.OpenVINO_Version.at(0) >= 2024 && + session_context_.OpenVINO_Version.at(1) > 2)); if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else if (!subgraph_context_.has_external_weights && + !subgraph_context_.has_dynamic_input_shape && + !session_context_.export_ep_ctx_blob && + auto_unified_compile){ + // Unified OV compile_model is efficient when ov model caching is enabled + // Unified OV compile_model API is supported with AUTO from version 2024.3 and above + // Inputs with static dimenstions + // 
Not enabled for models with external weights and when ep context is set. + const std::string model = model_proto->SerializeAsString(); + exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed - } else if (session_context_.export_ep_ctx_blob && - hw_target.find("NPU") != std::string::npos && - !subgraph_context_.has_external_weights) { - std::shared_ptr ov_model; + } else { // For all other types use ov::core read_model() to generate OV IR + // followed by ov::core compile_model() + std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context_.has_dynamic_input_shape) { + if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = session_context_.ie_core.Get().read_model(model, ov::Tensor()); + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } - exe_network_ = OVExeNetwork(session_context_.ie_core.Get().compile_model(ov_model, hw_target, device_config)); - } else if (!subgraph_context_.has_external_weights && - (!subgraph_context_.has_dynamic_input_shape) && - ((hw_target.find("AUTO") == std::string::npos) || - (session_context_.OpenVINO_Version.at(0) >= 2024 && session_context_.OpenVINO_Version.at(1) > 2))) { - // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above - // Inputs with static dimenstions - const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); - } else { // For all other types use ov::Model Type - auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); exe_network_ = session_context_.ie_core.CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif - } 
else { // Full graph is not supported - auto ov_model = CreateOVModel(*model_proto, session_context_, subgraph_context_, const_outputs_map_); - exe_network_ = session_context_.ie_core.CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); - } LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { ORT_THROW(msg); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 951b8223b4dbc..09ee83d3a7cc4 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -271,6 +271,14 @@ struct OpenVINO_Provider : Provider { uint64_t number = std::strtoull(str.c_str(), nullptr, 16); context = reinterpret_cast(number); } +#if defined(IO_BUFFER_ENABLED) + // a valid context must be provided to enable IO Buffer optimizations + if(context==nullptr){ + #undef IO_BUFFER_ENABLED + #define IO_BUFFER_ENABLED=0 + LOGS_DEFAULT(WARNING) << "Context is not set. 
Disabling IO Buffer optimization"; + } +#endif if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { if (!std::all_of(provider_options_map.at("num_of_threads").begin(), From 3949bf51464897ade294a2359409d57a4be7fb01 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 30 Dec 2024 16:17:58 +0530 Subject: [PATCH 06/35] Convert initializers with external data to graph inputs --- .../qdq_transformations/qdq_stripping.cc | 109 +++++++++++++++--- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 387aaf9985b4c..894f418b93482 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -625,6 +625,50 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } +// Keep track of inputs across multiple calls +static std::vector accumulated_inputs; +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, + const onnxruntime::GraphViewer& src_graph, + const std::string& initializer_name) { + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); + + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } + + const ONNX_NAMESPACE::TensorProto* tensor_proto = init_iter->second; + + // Create TypeProto for the initializer + std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); + + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } + + // Create 
NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; + } + } + + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } +} + + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -665,7 +709,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, dst_graph_outputs.push_back(&ep_graph_output_arg); } - dst_graph.SetInputs(dst_graph_inputs); + // Will set inputs after deciding fate oif all internal and external initializers + // dst_graph.SetInputs(dst_graph_inputs); dst_graph.SetOutputs(dst_graph_outputs); // TODO(sspintel): add Graph::SetName() provider api @@ -723,9 +768,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, seen_node_units.insert(node_unit); } - // - // Copy initializers to dst graph. - // + + // Copy initializers to dst graph. 
std::unordered_set current_scope_initializer_set; @@ -739,25 +783,54 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, std::sort(const_inits.begin(), const_inits.end()); for (auto& it : const_inits) { - if (initializers_to_keep.count(it)) - dst_graph.AddInitializedTensor(*(initializers.at(it))); - current_scope_initializer_set.insert(it); + const auto* initializer_tensor = initializers.at(it); + + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, src_graph, it); + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(it)) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } + + current_scope_initializer_set.insert(it); } - // handle outer scope value which is a constant initializer + // Handle outer-scope constant initializers for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { - const auto& node = src_graph.GetNode(node_idx); - for (const auto& input : node->InputDefs()) { - if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { - continue; + const auto& node = src_graph.GetNode(node_idx); + for (const auto& input : node->InputDefs()) { + if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { + continue; + } + + if (src_graph.IsConstantInitializer(input->Name(), true)) { + const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + // Add initializer as input if it has external data + AddInitializerAsInput(dst_graph, src_graph, input->Name()); + 
} else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } + + current_scope_initializer_set.insert(input->Name()); + } } - if (src_graph.IsConstantInitializer(input->Name(), true)) { - if (initializers_to_keep.count(input->Name())) - dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); - current_scope_initializer_set.insert(input->Name()); - } - } } + accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); + + // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph + dst_graph.SetInputs(accumulated_inputs); // Validate graph, remove unnecessary initializers, and run type/shape inference. ORT_RETURN_IF_ERROR(dst_graph.Resolve()); From 28c928a50c6c04d50745fb3c3490e8d87927c069 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Thu, 2 Jan 2025 15:54:31 +0530 Subject: [PATCH 07/35] create, store and export metadata for ovep weight sharing --- .../qdq_transformations/qdq_stripping.cc | 112 +++++++++++++++++- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 894f418b93482..cca49311c7b15 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -668,6 +668,55 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } +bool writeString(std::ofstream& outfile, const std::string& str) { + size_t size = str.size(); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; + + outfile.write(str.c_str(), size); + return outfile.good(); +} + +bool writeStringVector(std::ofstream& outfile, const std::vector& vec) { + 
size_t size = vec.size(); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; + + for (const auto& str : vec) { + if (!writeString(outfile, str)) { + return false; + } + } + return true; +} + +// Main function to dump the map to a binary file +bool dumpMetaDataMapToBinary(const std::unordered_map>& map, const std::string& filename) { + + std::ofstream outfile(filename, std::ios::binary); + if (!outfile.is_open()) { + ORT_THROW("Error: Could not open file for writing metadata."); + return false; + } + + // Write the size of the map + size_t map_size = map.size(); + outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); + if (!outfile.good()) { + ORT_THROW("Error: Failed to write map size."); + return false; + } + + // Write each key-value pair + for (const auto& pair : map) { + if (!writeString(outfile, pair.first) || !writeStringVector(outfile, pair.second)) { + ORT_THROW("Error: Failed to write map data."); + return false; + } + } + + return true; +} // Creates a new model without the DQ/Q operators in the src graph. 
Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, @@ -782,14 +831,40 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } std::sort(const_inits.begin(), const_inits.end()); + // initialize map for creating metadata for initilizers with external weights + std::unordered_map> metadata_map; + + // metadata structure: initializer_name as key + // and [location, offset, length] as value + for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); // Check if the initializer has external data if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, src_graph, it); + if (enable_ovep_weight_sharing) { + + // Cast away const to access mutable_external_data + struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + + // get meta data about the initilizers with external data + struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + + std::vector init_info; + // init_info structure: [location, offset, length] + + for (int i = 0 ; i < external_data->size() ; i++) { + init_info.push_back(*external_data->at(i).mutable_value()); + } + + metadata_map.emplace(initializer_tensor->name(), init_info); + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, src_graph, it); + } else if (initializers_to_keep.count(it)) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) { @@ -810,12 +885,30 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (src_graph.IsConstantInitializer(input->Name(), true)) { const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); 
- // Check if the initializer has external data if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, src_graph, input->Name()); + if (enable_ovep_weight_sharing) { + + // Cast away const to access mutable_external_data + struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + + // get meta data about the initilizers with external data + struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + + std::vector init_info; + for (int i = 0 ; i < external_data->size() ; i++) { + init_info.push_back(*external_data->at(i).mutable_value()); + } + + metadata_map.emplace(initializer_tensor->name(), init_info); + + // Add initializer as input if it has external data + AddInitializerAsInput(dst_graph, src_graph, input->Name()); + } else if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*initializer_tensor); + } + } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(input->Name())) { @@ -827,6 +920,15 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } } } + + if (enable_ovep_weight_sharing) { + // creating bin file of metadata_map and dumping the bin file + dumpMetaDataMapToBinary(metadata_map, "metadata.bin"); + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; + } else{ + ORT_THROW("Unable to write metadata to file."); + } + accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph From 7a89c5a0c92871629fdd8d7b096736ef25b2d11e Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Fri, 3 Jan 2025 15:11:01 +0530 Subject: [PATCH 08/35] 
fix error handling in weight sharing --- .../openvino/qdq_transformations/qdq_stripping.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index cca49311c7b15..54cc15fa1ed5b 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -920,13 +920,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } } } - if (enable_ovep_weight_sharing) { // creating bin file of metadata_map and dumping the bin file - dumpMetaDataMapToBinary(metadata_map, "metadata.bin"); - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; - } else{ - ORT_THROW("Unable to write metadata to file."); + if (dumpMetaDataMapToBinary(metadata_map, "metadata.bin")) { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; + } else { + ORT_THROW("Error: Unable to write metadat to file."); + } } accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); From ad66ae09c2251bbfebeecd05c401e0443ae064a0 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Thu, 9 Jan 2025 23:49:10 -0800 Subject: [PATCH 09/35] fix crash issue while setting up inputs for wai model --- .../qdq_transformations/qdq_stripping.cc | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 54cc15fa1ed5b..def51f34fe2ed 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -625,9 +625,8 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, 
KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } -// Keep track of inputs across multiple calls -static std::vector accumulated_inputs; -static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, +static void AddInitializerAsInput (onnxruntime::Graph& dst_graph, + InlinedVector& accumulated_inputs, const onnxruntime::GraphViewer& src_graph, const std::string& initializer_name) { // Get the initializer from source graph @@ -759,6 +758,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } // Will set inputs after deciding fate oif all internal and external initializers + // accumulated_inputs container will store input of the original graph and initializer with ext data + InlinedVector accumulated_inputs; // dst_graph.SetInputs(dst_graph_inputs); dst_graph.SetOutputs(dst_graph_outputs); @@ -842,8 +843,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Check if the initializer has external data if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - if (enable_ovep_weight_sharing) { + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { // Cast away const to access mutable_external_data struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -860,16 +861,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, metadata_map.emplace(initializer_tensor->name(), init_info); // Add initializer with external data as input - AddInitializerAsInput(dst_graph, src_graph, it); - } else if (initializers_to_keep.count(it)) { - dst_graph.AddInitializedTensor(*initializer_tensor); - } + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); } else { // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(it)) { - 
dst_graph.AddInitializedTensor(*initializer_tensor); - } + if (initializers_to_keep.count(it)) + dst_graph.AddInitializedTensor(*(initializers.at(it))); + } current_scope_initializer_set.insert(it); @@ -887,8 +885,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); // Check if the initializer has external data if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { - if (enable_ovep_weight_sharing) { + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { // Cast away const to access mutable_external_data struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -904,15 +902,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, metadata_map.emplace(initializer_tensor->name(), init_info); // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, src_graph, input->Name()); - } else if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*initializer_tensor); - } + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*initializer_tensor); + dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); } } From c17b27647141cb53d8a9e9f7f5df8aecc95e1a31 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Fri, 10 Jan 2025 00:54:31 -0800 Subject: [PATCH 10/35] pass weight sharing option to OVEP qdq stripping pass --- onnxruntime/core/providers/openvino/backend_manager.cc | 2 +- .../providers/openvino/qdq_transformations/qdq_stripping.cc | 1 + 
.../core/providers/openvino/qdq_transformations/qdq_stripping.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 04c1ffebb838d..cbba5aa7152ba 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -362,7 +362,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.enable_ovep_weight_sharing, model); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index def51f34fe2ed..7c1e850b0b7a0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -720,6 +720,7 @@ bool dumpMetaDataMapToBinary(const std::unordered_map& model) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 94a8eb4d5da17..5b777a388adda 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -12,6 +12,7 @@ namespace openvino_ep { // Creates a new model without the DQ/Q operators in 
the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, + bool enable_ovep_weight_sharing, /*out*/ std::unique_ptr& model); } // namespace openvino_ep From 5e734f173ab72ed8cc93e7b045e1415b3139cc93 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 9 Jan 2025 14:52:52 -0800 Subject: [PATCH 11/35] Aligning OVEP variable names to match the session option value they hold --- .../providers/openvino/backend_manager.cc | 10 +-- .../openvino/backends/basic_backend.cc | 74 +++++++++---------- .../core/providers/openvino/contexts.h | 8 +- .../openvino/onnx_ctx_model_helper.cc | 15 ++-- .../openvino/onnx_ctx_model_helper.h | 2 +- .../openvino/openvino_execution_provider.cc | 8 +- .../openvino/openvino_execution_provider.h | 22 +++--- .../openvino/openvino_provider_factory.cc | 32 ++++---- 8 files changed, 85 insertions(+), 86 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index cbba5aa7152ba..790e5f9c4e445 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -140,7 +140,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos && - !session_context_.disable_cpu_fallback && + !session_context_.so_disable_cpu_ep_fallback && !subgraph_context_.is_ep_ctx_graph; #if defined(OPENVINO_DISABLE_NPU_FALLBACK) eligible_for_cpu_fallback = false; @@ -187,7 +187,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } } - if (session_context_.export_ep_ctx_blob && !subgraph_context_.is_ep_ctx_graph) { + if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { auto status = 
onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, logger); if ((!status.IsOK())) { @@ -214,7 +214,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie // If not embed_mode, dump the blob here and only pass on the path to the blob std::string model_blob_str; auto compiled_model = concrete_backend_->GetOVCompiledModel(); - if (session_context_.ep_context_embed_mode) { + if (session_context_.so_context_embed_mode) { // Internal blob std::ostringstream model_blob_stream; compiled_model.export_model(model_blob_stream); @@ -245,7 +245,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, subgraph_context_.subgraph_name, - session_context_.ep_context_embed_mode, + session_context_.so_context_embed_mode, std::move(model_blob_str))); return Status::OK(); @@ -496,7 +496,7 @@ void BackendManager::Compute(OrtKernelContext* context) { ORT_THROW(ex.what()); #else if (session_context_.device_type.find("NPU") != std::string::npos && - !session_context_.disable_cpu_fallback) { + !session_context_.so_disable_cpu_ep_fallback) { LOGS_DEFAULT(WARNING) << ex.what(); LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
<< "Falling back to OV CPU for execution"; diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index e9882caa1372b..1c3a3f9e425d4 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -64,9 +64,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { exe_network_ = session_context_.ie_core.ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed + remote_context_, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed } else { std::shared_ptr ov_model; { @@ -79,45 +79,45 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; exe_network_ = session_context_.ie_core.CompileModel( ov_model, remote_context_, subgraph_context_.subgraph_name); - } + } #else // !IO_BUFFER_ENABLED - auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || - (session_context_.OpenVINO_Version.at(0) >= 2024 && - session_context_.OpenVINO_Version.at(1) > 2)); - if (subgraph_context_.is_ep_ctx_graph) { - // If the blob is held in an EPContext node, then skip FE+Compile - // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) || + (session_context_.OpenVINO_Version.at(0) >= 2024 && + session_context_.OpenVINO_Version.at(1) > 2)); + if (subgraph_context_.is_ep_ctx_graph) { + // If the blob is held in an EPContext node, then skip FE+Compile + // and directly move on to creating a backend with the executable blob 
+ exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + hw_target, + device_config, + subgraph_context_.subgraph_name); + model_stream.reset(); // Delete stream after it is no longer needed + } else if (!subgraph_context_.has_external_weights && + !subgraph_context_.has_dynamic_input_shape && + !session_context_.so_context_enable && + auto_unified_compile) { + // Unified OV compile_model is efficient when ov model caching is enabled + // Unified OV compile_model API is supported with AUTO from version 2024.3 and above + // Inputs with static dimenstions + // Not enabled for models with external weights and when ep context is set. + const std::string model = model_proto->SerializeAsString(); + exe_network_ = session_context_.ie_core.CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - model_stream.reset(); // Delete stream after it is no longer needed - } else if (!subgraph_context_.has_external_weights && - !subgraph_context_.has_dynamic_input_shape && - !session_context_.export_ep_ctx_blob && - auto_unified_compile){ - // Unified OV compile_model is efficient when ov model caching is enabled - // Unified OV compile_model API is supported with AUTO from version 2024.3 and above - // Inputs with static dimenstions - // Not enabled for models with external weights and when ep context is set. 
+ } else { // For all other types use ov::core read_model() to generate OV IR + // followed by ov::core compile_model() + std::shared_ptr ov_model; + { const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); - } else { // For all other types use ov::core read_model() to generate OV IR - // followed by ov::core compile_model() - std::shared_ptr ov_model; - { - const std::string model = model_proto->SerializeAsString(); - if (!subgraph_context.has_dynamic_input_shape) { - delete model_proto.release(); - } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + if (!subgraph_context.has_dynamic_input_shape) { + delete model_proto.release(); } - exe_network_ = session_context_.ie_core.CompileModel( - ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } + exe_network_ = session_context_.ie_core.CompileModel( + ov_model, hw_target, device_config, subgraph_context_.subgraph_name); + } #endif LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } catch (const char* msg) { @@ -178,7 +178,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) - if (session_context_.export_ep_ctx_blob) { + if (session_context_.so_context_enable) { session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif @@ -287,7 +287,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (subgraph_context_.is_ep_ctx_graph) return; - if (!session_context_.cache_dir.empty() && 
!session_context_.export_ep_ctx_blob) { + if (!session_context_.cache_dir.empty() && !session_context_.so_context_enable) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; if (session_context_.device_type.find("AUTO:GPU") != std::string::npos) { std::pair device_property; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 95954ae204047..62e2cfcaa9d98 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -18,11 +18,11 @@ struct SessionContext { OVCore ie_core; bool enable_opencl_throttling = false; bool disable_dynamic_shapes = false; - bool ep_context_embed_mode = false; - bool enable_ovep_weight_sharing = false; - bool export_ep_ctx_blob = false; + bool so_context_embed_mode = false; + bool so_share_ep_contexts = false; + bool so_context_enable = false; bool enable_qdq_optimizer = false; - bool disable_cpu_fallback = false; + bool so_disable_cpu_ep_fallback = false; size_t num_of_threads; std::string device_type; std::string precision_str; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 907650257c3f2..1c6b0a0467836 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -41,7 +41,7 @@ Status EPCtxHandler::ExportEPCtxModel(const std::string& model_name) { Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, const std::string& graph_name, - const bool ep_context_embed_mode, + const bool embed_mode, std::string&& model_blob_str) const { auto& graph = epctx_model_->MainGraph(); @@ -66,7 +66,7 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, auto embed_mode_attr = ONNX_NAMESPACE::AttributeProto::Create(); embed_mode_attr->set_name(EMBED_MODE); embed_mode_attr->set_type(onnx::AttributeProto_AttributeType_INT); 
- embed_mode_attr->set_i(ep_context_embed_mode); + embed_mode_attr->set_i(embed_mode); node_attributes->emplace(EMBED_MODE, std::move(*embed_mode_attr)); // ep context @@ -106,17 +106,16 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) == 1); - const auto& ep_cache_context_attribute = attrs.at(EP_CACHE_CONTEXT); - const auto& cache_context = ep_cache_context_attribute.s(); + const auto& ep_cache_context = attrs.at(EP_CACHE_CONTEXT).s(); ORT_ENFORCE(attrs.count(EMBED_MODE) == 1); - bool ep_context_embed_mode = static_cast(attrs.at(EMBED_MODE).i()); + bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); std::unique_ptr result; - if (ep_context_embed_mode) { - result.reset((std::istream*)new std::istringstream(cache_context)); + if (embed_mode) { + result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - result.reset((std::istream*)new std::ifstream(cache_context, std::ios_base::binary | std::ios_base::in)); + result.reset((std::istream*)new std::ifstream(ep_cache_context, std::ios_base::binary | std::ios_base::in)); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; return result; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index 7e5d5180b363b..f644e2607904d 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -29,7 +29,7 @@ class EPCtxHandler { bool CheckForOVEPCtxNode(const Node& node) const; Status AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, const std::string& graph_name, - const bool ep_context_embed_mode, + const bool embed_mode, std::string&& model_blob_str) const; std::unique_ptr GetModelBlobStream(const GraphViewer& graph_viewer) const; InlinedVector GetEPCtxNodes() const; diff --git 
a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 817d0817cbfc6..50eed5443b8df 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -25,11 +25,11 @@ openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInf openvino_ep::SessionContext result = { .enable_opencl_throttling = info.enable_opencl_throttling_, .disable_dynamic_shapes = info.disable_dynamic_shapes_, - .ep_context_embed_mode = info.so_epctx_embed_mode_, - .enable_ovep_weight_sharing = info.so_enable_ovep_weight_sharing_, - .export_ep_ctx_blob = info.export_ep_ctx_blob_, + .so_context_embed_mode = info.so_context_embed_mode_, + .so_share_ep_contexts = info.so_share_ep_contexts_, + .so_context_enable = info.so_context_enable_, .enable_qdq_optimizer = info.enable_qdq_optimizer_, - .disable_cpu_fallback = info.disable_cpu_fallback_, + .so_disable_cpu_ep_fallback = info.so_disable_cpu_ep_fallback_, .num_of_threads = info.num_of_threads_, .device_type = info.device_type_, .precision_str = info.precision_, diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 1b3990310fc61..5644639c705f8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -87,11 +87,11 @@ struct OpenVINOExecutionProviderInfo { void* context_{NULL}; bool enable_opencl_throttling_{false}; bool disable_dynamic_shapes_{false}; - bool export_ep_ctx_blob_{false}; + bool so_context_enable_{false}; bool enable_qdq_optimizer_{false}; - bool disable_cpu_fallback_{false}; - bool so_epctx_embed_mode_{false}; - bool so_enable_ovep_weight_sharing_{false}; + bool so_disable_cpu_ep_fallback_{false}; + bool so_context_embed_mode_{false}; + bool 
so_share_ep_contexts_{false}; OpenVINOExecutionProviderInfo() = delete; @@ -101,9 +101,9 @@ struct OpenVINOExecutionProviderInfo { const std::string& cache_dir, const std::string& model_priority, int num_streams, void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback, - bool so_epctx_embed_mode, bool so_enable_ovep_weight_sharing) + bool disable_dynamic_shapes, bool so_context_enable, + bool enable_qdq_optimizer, bool so_disable_cpu_ep_fallback, + bool so_context_embed_mode, bool so_share_ep_contexts) : precision_(std::move(precision)), num_of_threads_(num_of_threads), load_config_(std::move(load_config)), @@ -113,11 +113,11 @@ struct OpenVINOExecutionProviderInfo { context_(context), enable_opencl_throttling_(enable_opencl_throttling), disable_dynamic_shapes_(disable_dynamic_shapes), - export_ep_ctx_blob_(export_ep_ctx_blob), + so_context_enable_(so_context_enable), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback), - so_epctx_embed_mode_{so_epctx_embed_mode}, - so_enable_ovep_weight_sharing_{so_enable_ovep_weight_sharing} { + so_disable_cpu_ep_fallback_(so_disable_cpu_ep_fallback), + so_context_embed_mode_{so_context_embed_mode}, + so_share_ep_contexts_{so_share_ep_contexts} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 09ee83d3a7cc4..92c4948565a1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -51,14 +51,14 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, 
"0") == "1"; - bool so_export_ep_ctx_blob = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_epctx_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_cache_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); - bool so_enable_ovep_weight_sharing = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - - if (so_export_ep_ctx_blob && !so_cache_path.empty()) { - cache_dir_ = std::move(so_cache_path); + bool so_disable_cpu_ep_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + bool so_context_enable = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + bool so_context_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + std::string so_context_file_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); + bool so_share_ep_contexts = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + + if (so_context_enable && !so_context_file_path.empty()) { + cache_dir_ = std::move(so_context_file_path); auto file_path = std::filesystem::path(cache_dir_); // ep_context_file_path_ file extension must be .onnx if (file_path.extension().generic_string() == ".onnx") { @@ -76,8 +76,8 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_export_ep_ctx_blob, enable_qdq_optimizer_, - so_disable_cpu_fallback, so_epctx_embed_mode, so_enable_ovep_weight_sharing); + disable_dynamic_shapes_, so_context_enable, enable_qdq_optimizer_, + so_disable_cpu_ep_fallback, so_context_embed_mode, so_share_ep_contexts); return 
std::make_unique(info); } @@ -272,12 +272,12 @@ struct OpenVINO_Provider : Provider { context = reinterpret_cast(number); } #if defined(IO_BUFFER_ENABLED) - // a valid context must be provided to enable IO Buffer optimizations - if(context==nullptr){ - #undef IO_BUFFER_ENABLED - #define IO_BUFFER_ENABLED=0 - LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; - } + // a valid context must be provided to enable IO Buffer optimizations + if (context == nullptr) { +#undef IO_BUFFER_ENABLED +#define IO_BUFFER_ENABLED = 0 + LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; + } #endif if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { From c9fb7577ffe8cc9fa7e05feed438f665fc309a80 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Fri, 10 Jan 2025 12:08:07 -0800 Subject: [PATCH 12/35] Add plumbing for context sharing plus refactoring around option handling --- .../providers/openvino/backend_manager.cc | 14 +- .../core/providers/openvino/backend_manager.h | 3 +- .../openvino/backends/basic_backend.cc | 6 +- .../core/providers/openvino/contexts.h | 76 ++++-- .../openvino/openvino_execution_provider.cc | 153 ++++++++---- .../openvino/openvino_execution_provider.h | 144 +---------- .../openvino/openvino_provider_factory.cc | 228 ++++++------------ 7 files changed, 263 insertions(+), 361 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 790e5f9c4e445..6e308d78ca066 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -46,7 +46,7 @@ BackendManager::BackendManager(const SessionContext& session_context, return ""; } else { auto input_type = graph_viewer.GetInputs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (session_context_.precision_str == "ACCURACY" && + if (session_context_.precision == "ACCURACY" && 
session_context_.device_type.find("GPU") != std::string::npos) { if (input_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) { return "FP32"; @@ -150,7 +150,7 @@ BackendManager::BackendManager(const SessionContext& session_context, LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." << "Falling back to OV CPU for execution"; session_context_.device_type = "CPU"; - session_context_.precision_str = "FP32"; + session_context_.precision = "FP32"; try { concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, @@ -188,8 +188,7 @@ BackendManager::BackendManager(const SessionContext& session_context, } } if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) { - auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, - logger); + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph); if ((!status.IsOK())) { ORT_THROW(status); } @@ -200,8 +199,7 @@ BackendManager::BackendManager(const SessionContext& session_context, // precompiled blob is set. If that's the case: // By default, create model in embed mode where the blob stream is exported as data within // the EPContext node. -Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer, - const logging::Logger& logger) { +Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& graph_body_viewer) { if (session_context_.disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape) { std::string exception_str = "Exporting dynamically compiled models at runtime is not supported. 
" @@ -362,7 +360,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.enable_ovep_weight_sharing, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); @@ -501,7 +499,7 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." << "Falling back to OV CPU for execution"; session_context_.device_type = "CPU"; - session_context_.precision_str = "FP32"; + session_context_.precision = "FP32"; key = MakeMapKeyString(tensor_shapes, session_context_.device_type); try { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index f77f303c70991..43dc9ceaa558e 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -27,8 +27,7 @@ class BackendManager { void Compute(OrtKernelContext* context); void ShutdownBackendManager(); SessionContext& GetSessionContext(); - Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph, - const logging::Logger& logger); + Status ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphViewer& subgraph); ov::CompiledModel& GetOVCompiledModel(); private: diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 1c3a3f9e425d4..fb0fdc9b5e85b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ 
b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -140,14 +140,14 @@ bool BasicBackend::ValidateSubgraph(std::map= 2024) { device_config.emplace(ov::hint::inference_precision(ov::element::undefined)); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 62e2cfcaa9d98..2947d43b4600b 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -13,31 +13,71 @@ namespace onnxruntime { namespace openvino_ep { +namespace fs = std::filesystem; + +struct SharedContext { + struct shared_weight_key { + std::string_view name; + std::string location; + }; + struct shared_weight_value { + unsigned int data_offset; + unsigned int size; + ov::Tensor* tensor; + }; + std::map shared_weight_map; + fs::path bin_pathname; +}; + +using config_t = std::map; + +struct ProviderInfo { + std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and + // precision with these values at runtime. + std::string precision{""}; // [precision]: Sets the inference precision for execution. + // Supported precision for devices are + // CPU=FP32, GPU=FP32,FP16, NPU=FP16. + // Not setting precision will execute with optimized precision for + // best inference latency. set Precision=ACCURACY for executing + // models with input precision for best accuracy. + uint32_t num_of_threads{0}; // [num_of_threads]: Overrides the accelerator default value of + // number of threads with this value at runtime. + config_t load_config{}; // JSON config map to load custom OV parameters. + fs::path cache_dir{""}; // [cache_dir]: specify the path to + // dump and load the blobs for the model caching/kernel caching + // (GPU) feature. If blob files are already present, + // it will be directly loaded. 
+ std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint + // Defines what model should be provided with more performant + // bounded resource first + uint32_t num_streams{1}; // [num_streams]: Option that specifies the number of parallel + // inference requests to be processed on a given `device_type`. + // Overrides the accelerator default value of number of streams + // with this value at runtime. + void* context{nullptr}; // OpenCL context + bool enable_opencl_throttling{false}; // [enable_opencl_throttling]: Enables OpenCL queue throttling for + // GPU device (Reduces CPU Utilization when using GPU) + bool disable_dynamic_shapes{false}; // [disable_dynamic_shapes]: Rewrite dynamic shaped models to + // static shape at runtime and execute. + bool enable_qdq_optimizer{false}; // Enables QDQ pruning for efficient inference latency with NPU + bool so_context_enable{false}; // ORT session option + bool so_disable_cpu_ep_fallback{false}; // ORT session option + bool so_context_embed_mode{false}; // ORT session option + bool so_share_ep_contexts{false}; // ORT session option +}; + // Holds context applicable to the entire EP instance. 
-struct SessionContext { +struct SessionContext : ProviderInfo { + SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} + OVCore ie_core; - bool enable_opencl_throttling = false; - bool disable_dynamic_shapes = false; - bool so_context_embed_mode = false; - bool so_share_ep_contexts = false; - bool so_context_enable = false; - bool enable_qdq_optimizer = false; - bool so_disable_cpu_ep_fallback = false; - size_t num_of_threads; - std::string device_type; - std::string precision_str; - std::filesystem::path cache_dir; - std::map load_config; - std::string model_priority = "DEFAULT"; - int num_streams; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::string onnx_model_name; std::string onnx_model_path_name; int onnx_opset_version; - void* context = 0; bool use_api_2; - std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers - std::string openvino_sdk_version; + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. 
diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 50eed5443b8df..ab7604e1344f2 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -18,63 +18,123 @@ #include "core/providers/openvino/ov_allocator.h" #endif -#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) - namespace onnxruntime { -openvino_ep::SessionContext GetSessionContext(const OpenVINOExecutionProviderInfo& info) { - openvino_ep::SessionContext result = { - .enable_opencl_throttling = info.enable_opencl_throttling_, - .disable_dynamic_shapes = info.disable_dynamic_shapes_, - .so_context_embed_mode = info.so_context_embed_mode_, - .so_share_ep_contexts = info.so_share_ep_contexts_, - .so_context_enable = info.so_context_enable_, - .enable_qdq_optimizer = info.enable_qdq_optimizer_, - .so_disable_cpu_ep_fallback = info.so_disable_cpu_ep_fallback_, - .num_of_threads = info.num_of_threads_, - .device_type = info.device_type_, - .precision_str = info.precision_, - .cache_dir = info.cache_dir_, - .load_config = info.load_config_, - .model_priority = info.model_priority_, - .num_streams = info.num_streams_, - .context = info.context_, - .OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}, - .openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR), - }; - return result; +namespace openvino_ep { + +// Parking this code here for now before it's moved to the factory +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { + std::string comma_separated_devices = device_string; + if (comma_separated_devices.find(":") != std::string::npos) { + comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); + } + auto devices = 
split(comma_separated_devices, ',');
+  if (devices.size() < 2) {
+    print_build_options();
+    ORT_THROW("Invalid device string: " + device_string);
+  }
+  std::set<std::string> dev_options = {"CPU", "GPU", "NPU"};
+
+  for (auto& device : available_devices) {
+    if (dev_options.find(device) == dev_options.end()) {
+      dev_options.emplace(device);
+    }
+  }
+
+  for (const std::string& dev : devices) {
+    if (!std::count(dev_options.begin(), dev_options.end(), dev)) {
+      print_build_options();
+      ORT_THROW("Invalid device string: " + device_string);
+    }
+  }
+  return devices;
+}
+
+// Parking this code here for now before it's moved to the factory
+void AdjustProviderInfo(ProviderInfo& info) {
+  std::set<std::string> ov_supported_device_types = {"CPU", "GPU",
+                                                     "GPU.0", "GPU.1", "NPU"};
+
+  OVDevices devices;
+  std::vector<std::string> available_devices = devices.get_ov_devices();
+
+  for (auto& device : available_devices) {
+    if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) {
+      ov_supported_device_types.emplace(device);
+    }
+  }
+
+  if (info.device_type == "") {
+    LOGS_DEFAULT(INFO) << "[OpenVINO-EP]"
+                       << "No runtime device selection option provided.";
+#if defined OPENVINO_CONFIG_CPU
+    info.device_type = "CPU";
+    info.precision = "FP32";
+#elif defined OPENVINO_CONFIG_GPU
+    info.device_type = "GPU";
+    info.precision = "FP16";
+#elif defined OPENVINO_CONFIG_NPU
+    info.device_type = "NPU";
+    info.precision = "FP16";
+#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO
+#ifdef DEVICE_NAME
+#define DEVICE DEVICE_NAME
+#endif
+    std::string dev_type = DEVICE;
+
+    if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) {
+      std::vector<std::string> devices = parseDevices(dev_type, available_devices);
+      info.precision = "FP16";
+      if (devices[0] == "CPU") {
+        info.precision = "FP32";
+      }
+      info.device_type = std::move(dev_type);
+    }
+#endif
+  } else if 
(ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) {
+    // device_type is already a supported plain device string; nothing to adjust
+  } else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) {
+    std::ignore = parseDevices(info.device_type, available_devices);  // validate combined device string
+  } else {
+    ORT_THROW("Invalid device string: " + info.device_type);
+  }
+  LOGS_DEFAULT(INFO) << "[OpenVINO-EP]"
+                     << "Choosing Device: " << info.device_type << " , Precision: " << info.precision;
 }
 
-OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info)
+OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context)
     : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider},
-      session_context_{GetSessionContext(info)},
+      session_context_(info),
+      shared_context_{shared_context},
       ep_ctx_handle_{session_context_.openvino_sdk_version, *GetLogger()} {
   InitProviderOrtApi();
   // to check if target device is available
   // using ie_core capability GetAvailableDevices to fetch list of devices plugged in
-  if (info.cache_dir_.empty()) {
+  if (info.cache_dir.empty()) {
     bool device_found = false;
     std::vector<std::string> available_devices = session_context_.ie_core.GetAvailableDevices();
     // Checking for device_type configuration
-    if (info.device_type_ != "") {
-      if (info.device_type_.find("HETERO") != std::string::npos ||
-          info.device_type_.find("MULTI") != std::string::npos ||
-          info.device_type_.find("AUTO") != std::string::npos) {
+    if (info.device_type != "") {
+      if (info.device_type.find("HETERO") != std::string::npos ||
+          info.device_type.find("MULTI") != std::string::npos ||
+          info.device_type.find("AUTO") != std::string::npos) {
         device_found = true;
       } else {
         for (const std::string& device : available_devices) {
-          if (device.rfind(info.device_type_, 0) == 0) {
-            if (info.device_type_.find("GPU") != 
std::string::npos && (info.precision_ == "FP32" || - info.precision_ == "FP16" || - info.precision_ == "ACCURACY")) { + if (device.rfind(info.device_type, 0) == 0) { + if (info.device_type.find("GPU") != std::string::npos && (info.precision == "FP32" || + info.precision == "FP16" || + info.precision == "ACCURACY")) { device_found = true; break; } - if (info.device_type_ == "CPU" && (info.precision_ == "FP32")) { + if (info.device_type == "CPU" && (info.precision == "FP32")) { device_found = true; break; } - if (info.device_type_.find("NPU") != std::string::npos) { + if (info.device_type.find("NPU") != std::string::npos) { device_found = true; break; } @@ -83,7 +143,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv } } if (!device_found) { - ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type_ + " is not available"); + ORT_THROW("[ERROR] [OpenVINO] Specified device - " + info.device_type + " is not available"); } } } @@ -118,6 +178,13 @@ common::Status OpenVINOExecutionProvider::Compile( session_context_.onnx_opset_version = fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + struct OpenVINOEPFunctionState { + AllocateFunc allocate_func = nullptr; + DestroyFunc destroy_func = nullptr; + AllocatorHandle allocator_handle = nullptr; + BackendManager& backend_manager; + }; + for (const FusedNodeAndGraph& fused_node_graph : fused_nodes) { const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; const Node& fused_node = fused_node_graph.fused_node; @@ -138,13 +205,15 @@ common::Status OpenVINOExecutionProvider::Compile( compute_info.create_state_func = [&backend_manager](ComputeContext* context, FunctionState* state) { - OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(backend_manager); - p->allocate_func = context->allocate_func; - p->destroy_func = context->release_func; - p->allocator_handle = context->allocator_handle; + OpenVINOEPFunctionState* p = new 
OpenVINOEPFunctionState{ + .allocate_func = context->allocate_func, + .destroy_func = context->release_func, + .allocator_handle = context->allocator_handle, + .backend_manager = backend_manager}; *state = static_cast(p); return 0; }; + compute_info.compute_func = [](FunctionState state, const OrtApi* /* api */, OrtKernelContext* context) { auto function_state = static_cast(state); try { @@ -162,6 +231,7 @@ common::Status OpenVINOExecutionProvider::Compile( delete function_state; } }; + node_compute_funcs.push_back(compute_info); if (!status.IsOK()) { @@ -234,4 +304,5 @@ const InlinedVector OpenVINOExecutionProvider::GetEpContextNodes() return ep_ctx_handle_.GetEPCtxNodes(); } +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 5644639c705f8..d35dc5513ed1d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -13,8 +13,10 @@ #include #include "core/providers/openvino/backend_manager.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { +namespace openvino_ep { struct OVDevices { ov::Core core; @@ -47,141 +49,10 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string, - const std::vector& available_devices) { - std::string comma_separated_devices = device_string; - if (comma_separated_devices.find(":") != std::string::npos) { - comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); - } - auto devices = split(comma_separated_devices, ','); - if (devices.size() < 2) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - std::set dev_options = {"CPU", "GPU", "NPU"}; - - for (auto& device : available_devices) { - if 
(dev_options.find(device) == dev_options.end()) { - auto dev_options_update = dev_options.emplace(device); - } - } - - for (const std::string& dev : devices) { - if (!std::count(dev_options.begin(), dev_options.end(), dev)) { - print_build_options(); - ORT_THROW("Invalid device string: " + device_string); - } - } - return devices; -} - -// Information needed to construct OpenVINO execution providers. -struct OpenVINOExecutionProviderInfo { - std::string device_type_{""}; - std::string precision_{""}; - size_t num_of_threads_{0}; - std::map load_config_{}; - std::string cache_dir_{""}; - std::string model_priority_{""}; - int num_streams_{1}; - void* context_{NULL}; - bool enable_opencl_throttling_{false}; - bool disable_dynamic_shapes_{false}; - bool so_context_enable_{false}; - bool enable_qdq_optimizer_{false}; - bool so_disable_cpu_ep_fallback_{false}; - bool so_context_embed_mode_{false}; - bool so_share_ep_contexts_{false}; - - OpenVINOExecutionProviderInfo() = delete; - - explicit OpenVINOExecutionProviderInfo(std::string dev_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, - const std::string& cache_dir, - const std::string& model_priority, int num_streams, - void* context, bool enable_opencl_throttling, - bool disable_dynamic_shapes, bool so_context_enable, - bool enable_qdq_optimizer, bool so_disable_cpu_ep_fallback, - bool so_context_embed_mode, bool so_share_ep_contexts) - : precision_(std::move(precision)), - num_of_threads_(num_of_threads), - load_config_(std::move(load_config)), - cache_dir_(std::move(cache_dir)), - model_priority_(std::move(model_priority)), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - so_context_enable_(so_context_enable), - enable_qdq_optimizer_(enable_qdq_optimizer), - so_disable_cpu_ep_fallback_(so_disable_cpu_ep_fallback), - so_context_embed_mode_{so_context_embed_mode}, - 
so_share_ep_contexts_{so_share_ep_contexts} { - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - - if (dev_type == "") { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "No runtime device selection option provided."; -#if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; -#elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU - device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO -#ifdef DEVICE_NAME -#define DEVICE DEVICE_NAME -#endif - dev_type = DEVICE; - - if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - precision_ = "FP16"; - if (devices[0] == "CPU") { - precision_ = "FP32"; - } - device_type_ = std::move(dev_type); - } -#endif - } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { - device_type_ = std::move(dev_type); - } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type, available_devices); - device_type_ = std::move(dev_type); - } else { - ORT_THROW("Invalid device string: " + dev_type); - } - LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" - << "Choosing Device: " << device_type_ << " , Precision: " << precision_; - } -}; - -struct OpenVINOEPFunctionState { - OpenVINOEPFunctionState(openvino_ep::BackendManager& bm) : backend_manager(bm) {} - AllocateFunc allocate_func = nullptr; - DestroyFunc destroy_func = nullptr; - AllocatorHandle allocator_handle = 
nullptr; - openvino_ep::BackendManager& backend_manager; -}; - // Logical device representation. class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const OpenVINOExecutionProviderInfo& info); + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context = nullptr); ~OpenVINOExecutionProvider() = default; std::vector> @@ -204,10 +75,11 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; #endif private: - openvino_ep::SessionContext session_context_; - std::list backend_managers_; // EP session owns the backend objects - - openvino_ep::EPCtxHandler ep_ctx_handle_; + SessionContext session_context_; + SharedContext* shared_context_{nullptr}; + std::list backend_managers_; // EP session owns the backend objects + EPCtxHandler ep_ctx_handle_; }; +} // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 92c4948565a1d..2028979c1c87d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -7,83 +7,29 @@ #include "core/providers/openvino/openvino_provider_factory.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" +#include "core/providers/openvino/contexts.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" namespace onnxruntime { +namespace openvino_ep { struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(const std::string& device_type, const std::string& precision, - size_t num_of_threads, - const std::map& load_config, const std::string& cache_dir, - const std::string& model_priority, int num_streams, void* context, - 
bool enable_opencl_throttling, bool disable_dynamic_shapes, - bool enable_qdq_optimizer, const ConfigOptions& config_options) - : device_type_(device_type), - precision_(precision), - num_of_threads_(num_of_threads), - load_config_(load_config), - cache_dir_(cache_dir), - model_priority_(model_priority), - num_streams_(num_streams), - context_(context), - enable_opencl_throttling_(enable_opencl_throttling), - disable_dynamic_shapes_(disable_dynamic_shapes), - enable_qdq_optimizer_(enable_qdq_optimizer), - config_options_(config_options) {} + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext* shared_context) + : provider_info_(provider_info), shared_context_(shared_context) {} ~OpenVINOProviderFactory() override {} std::unique_ptr CreateProvider() override; private: - std::string device_type_; - std::string precision_; - size_t num_of_threads_; - const std::map load_config_; - std::string cache_dir_; - std::string model_priority_; - int num_streams_; - void* context_; - bool enable_opencl_throttling_; - bool disable_dynamic_shapes_; - bool enable_qdq_optimizer_; - const ConfigOptions& config_options_; + ProviderInfo provider_info_; + SharedContext* shared_context_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - bool so_disable_cpu_ep_fallback = config_options_.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - bool so_context_enable = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - bool so_context_embed_mode = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - std::string so_context_file_path = config_options_.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); - bool so_share_ep_contexts = config_options_.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - - if (so_context_enable && !so_context_file_path.empty()) { - cache_dir_ = std::move(so_context_file_path); - auto file_path = 
std::filesystem::path(cache_dir_); - // ep_context_file_path_ file extension must be .onnx - if (file_path.extension().generic_string() == ".onnx") { - // ep_context_file_path_ must be provided as a directory, create it if doesn't exist - auto parent_path = file_path.parent_path(); - if (!parent_path.empty() && !std::filesystem::is_directory(parent_path) && - !std::filesystem::create_directory(parent_path)) { - ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + - file_path.parent_path().generic_string() + " \n"); - } - } else { - ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + cache_dir_ + " \n"); - } - } - - OpenVINOExecutionProviderInfo info(device_type_, precision_, num_of_threads_, load_config_, - cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, - disable_dynamic_shapes_, so_context_enable, enable_qdq_optimizer_, - so_disable_cpu_ep_fallback, so_context_embed_mode, so_share_ep_contexts); - return std::make_unique(info); + return std::make_unique(provider_info_, shared_context_); } -} // namespace onnxruntime - -namespace onnxruntime { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { openvino_ep::OVCore ie_core; @@ -96,43 +42,16 @@ struct OpenVINO_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions - typedef std::pair ConfigBuffer; + using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - auto& provider_options_map = *buffer->first; - const ConfigOptions& config_options = buffer->second; - - std::string device_type = ""; // [device_type]: Overrides the accelerator hardware type and - // precision with these values at runtime. - std::string precision = ""; // [precision]: Sets the inference precision for execution. - // Supported precision for devices are - // CPU=FP32, GPU=FP32,FP16, NPU=FP16. 
- // Not setting precision will execute with optimized precision for - // best inference latency. set Precision=ACCURACY for executing - // models with input precision for best accuracy. - int num_of_threads = 0; // [num_of_threads]: Overrides the accelerator default value of - // number of threads with this value at runtime. - std::map load_config; // JSON config map to load custom OV parameters. - std::string cache_dir = ""; // [cache_dir]: specify the path to - // dump and load the blobs for the model caching/kernel caching - // (GPU) feature. If blob files are already present, - // it will be directly loaded. - std::string model_priority = "DEFAULT"; // High-level OpenVINO model priority hint - // Defines what model should be provided with more performant - // bounded resource first - int num_streams = 1; // [num_streams]: Option that specifies the number of parallel - // inference requests to be processed on a given `device_type`. - // Overrides the accelerator default value of number of streams - // with this value at runtime. 
- bool enable_opencl_throttling = false; // [enable_opencl_throttling]: Enables OpenCL queue throttling for - // GPU device (Reduces CPU Utilization when using GPU) - - bool enable_qdq_optimizer = false; // Enables QDQ pruning for efficient inference latency with NPU - - void* context = nullptr; + const auto& provider_options_map = *buffer->first; + const auto& config_options = buffer->second; + + ProviderInfo pi; std::string bool_flag = ""; if (provider_options_map.find("device_type") != provider_options_map.end()) { - device_type = provider_options_map.at("device_type").c_str(); + pi.device_type = provider_options_map.at("device_type").c_str(); std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; @@ -147,20 +66,20 @@ struct OpenVINO_Provider : Provider { ov_supported_device_types.emplace(device); } } - if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) { - std::string deprecated_device = device_type; - int delimit = device_type.find("_"); - device_type = deprecated_device.substr(0, delimit); - precision = deprecated_device.substr(delimit + 1); + if (deprecated_device_types.find(pi.device_type) != deprecated_device_types.end()) { + std::string deprecated_device = pi.device_type; + int delimit = pi.device_type.find("_"); + pi.device_type = deprecated_device.substr(0, delimit); + pi.precision = deprecated_device.substr(delimit + 1); LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n" << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " << "'GPU.1', 'NPU' or from" << " HETERO/MULTI/AUTO options and set 'precision' separately. 
\n"; } - if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) || - (device_type.find("HETERO:") == 0) || - (device_type.find("MULTI:") == 0) || - (device_type.find("AUTO:") == 0))) { + if (!((ov_supported_device_types.find(pi.device_type) != ov_supported_device_types.end()) || + (pi.device_type.find("HETERO:") == 0) || + (pi.device_type.find("MULTI:") == 0) || + (pi.device_type.find("AUTO:") == 0))) { ORT_THROW( "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" @@ -172,36 +91,36 @@ struct OpenVINO_Provider : Provider { LOGS_DEFAULT(WARNING) << "[OpenVINO] The options 'device_id' is deprecated. " << "Upgrade to set deice_type and precision session options.\n"; if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") { - device_type = std::move(dev_id); + pi.device_type = std::move(dev_id); } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options."); } } if (provider_options_map.find("precision") != provider_options_map.end()) { - precision = provider_options_map.at("precision").c_str(); + pi.precision = provider_options_map.at("precision").c_str(); } - if (device_type.find("GPU") != std::string::npos) { - if (precision == "") { - precision = "FP16"; - } else if (precision != "ACCURACY" && precision != "FP16" && precision != "FP32") { + if (pi.device_type.find("GPU") != std::string::npos) { + if (pi.precision == "") { + pi.precision = "FP16"; + } else if (pi.precision != "ACCURACY" && pi.precision != "FP16" && pi.precision != "FP32") { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. 
\n"); } - } else if (device_type.find("NPU") != std::string::npos) { - if (precision == "" || precision == "ACCURACY" || precision == "FP16") { - precision = "FP16"; + } else if (pi.device_type.find("NPU") != std::string::npos) { + if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP16") { + pi.precision = "FP16"; } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n"); } - } else if (device_type.find("CPU") != std::string::npos) { - if (precision == "" || precision == "ACCURACY" || precision == "FP32") { - precision = "FP32"; + } else if (pi.device_type.find("CPU") != std::string::npos) { + if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP32") { + pi.precision = "FP32"; } else { ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . \n"); } } if (provider_options_map.find("cache_dir") != provider_options_map.end()) { - cache_dir = provider_options_map.at("cache_dir"); + pi.cache_dir = provider_options_map.at("cache_dir"); } if (provider_options_map.find("load_config") != provider_options_map.end()) { @@ -263,13 +182,13 @@ struct OpenVINO_Provider : Provider { return target_map; }; - load_config = parse_config(provider_options_map.at("load_config")); + pi.load_config = parse_config(provider_options_map.at("load_config")); } if (provider_options_map.find("context") != provider_options_map.end()) { std::string str = provider_options_map.at("context"); uint64_t number = std::strtoull(str.c_str(), nullptr, 16); - context = reinterpret_cast(number); + pi.context = reinterpret_cast(number); } #if defined(IO_BUFFER_ENABLED) // a valid context must be provided to enable IO Buffer optimizations @@ -285,20 +204,20 @@ struct OpenVINO_Provider : Provider { provider_options_map.at("num_of_threads").end(), ::isdigit)) { ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. 
\n"); } - num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); - if (num_of_threads <= 0) { - num_of_threads = 1; + pi.num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); + if (pi.num_of_threads <= 0) { + pi.num_of_threads = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_threads' should be in the positive range.\n " << "Executing with num_threads=1"; } } if (provider_options_map.find("model_priority") != provider_options_map.end()) { - model_priority = provider_options_map.at("model_priority").c_str(); + pi.model_priority = provider_options_map.at("model_priority").c_str(); std::vector supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"}); if (std::find(supported_priorities.begin(), supported_priorities.end(), - model_priority) == supported_priorities.end()) { - model_priority = "DEFAULT"; + pi.model_priority) == supported_priorities.end()) { + pi.model_priority = "DEFAULT"; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'model_priority' " << "is not one of LOW, MEDIUM, HIGH, DEFAULT. 
" << "Executing with model_priorty=DEFAULT"; @@ -306,9 +225,9 @@ struct OpenVINO_Provider : Provider { } if (provider_options_map.find("num_streams") != provider_options_map.end()) { - num_streams = std::stoi(provider_options_map.at("num_streams")); - if (num_streams <= 0) { - num_streams = 1; + pi.num_streams = std::stoi(provider_options_map.at("num_streams")); + if (pi.num_streams <= 0) { + pi.num_streams = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n " << "Executing with num_streams=1"; } @@ -316,57 +235,56 @@ struct OpenVINO_Provider : Provider { if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) { bool_flag = provider_options_map.at("enable_opencl_throttling"); if (bool_flag == "true" || bool_flag == "True") - enable_opencl_throttling = true; + pi.enable_opencl_throttling = true; else if (bool_flag == "false" || bool_flag == "False") - enable_opencl_throttling = false; + pi.enable_opencl_throttling = false; bool_flag = ""; } if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) { bool_flag = provider_options_map.at("enable_qdq_optimizer"); if (bool_flag == "true" || bool_flag == "True") - enable_qdq_optimizer = true; + pi.enable_qdq_optimizer = true; else if (bool_flag == "false" || bool_flag == "False") - enable_qdq_optimizer = false; + pi.enable_qdq_optimizer = false; else ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n"); bool_flag = ""; } - // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute. - // Always true for NPU plugin. - bool disable_dynamic_shapes = false; - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; + // Always true for NPU plugin or when passed . 
+ if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; } if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { bool_flag = provider_options_map.at("disable_dynamic_shapes"); if (bool_flag == "true" || bool_flag == "True") { - disable_dynamic_shapes = true; + pi.disable_dynamic_shapes = true; } else if (bool_flag == "false" || bool_flag == "False") { - if (device_type.find("NPU") != std::string::npos) { - disable_dynamic_shapes = true; + if (pi.device_type.find("NPU") != std::string::npos) { + pi.disable_dynamic_shapes = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " << "TRUE for NPU backend.\n "; } else { - disable_dynamic_shapes = false; + pi.disable_dynamic_shapes = false; } } bool_flag = ""; } - return std::make_shared(device_type, - precision, - num_of_threads, - load_config, - cache_dir, - model_priority, - num_streams, - context, - enable_opencl_throttling, - disable_dynamic_shapes, - enable_qdq_optimizer, - config_options); + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); + + if (pi.so_context_enable && !so_context_file_path.empty()) { + pi.cache_dir = std::move(so_context_file_path); + } + + SharedContext* shared_context = pi.so_share_ep_contexts ? 
&shared_context_ : nullptr; + + return std::make_shared(pi, shared_context); } void Initialize() override { @@ -374,13 +292,17 @@ struct OpenVINO_Provider : Provider { void Shutdown() override { } + + private: + SharedContext shared_context_; } g_provider; +} // namespace openvino_ep } // namespace onnxruntime extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::g_provider; + return &onnxruntime::openvino_ep::g_provider; } } From a78166d591532b518d1d29ff9c7c23789d7fe2d7 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Sun, 12 Jan 2025 22:06:13 -0800 Subject: [PATCH 13/35] Store metadata in shared context --- .../providers/openvino/backend_manager.cc | 9 +- .../core/providers/openvino/backend_manager.h | 6 +- .../core/providers/openvino/backend_utils.cc | 2 +- .../core/providers/openvino/contexts.h | 35 ++- .../openvino/openvino_execution_provider.cc | 14 +- .../openvino/openvino_execution_provider.h | 4 +- .../openvino/openvino_provider_factory.cc | 13 +- .../qdq_transformations/qdq_stripping.cc | 257 +++++++++--------- .../qdq_transformations/qdq_stripping.h | 7 +- 9 files changed, 191 insertions(+), 156 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 6e308d78ca066..c06e00272a8c8 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -32,11 +32,14 @@ ov::CompiledModel& BackendManager::GetOVCompiledModel() { return (ov_ptr); } -BackendManager::BackendManager(const SessionContext& session_context, +BackendManager::BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ep_ctx_handle) : ep_ctx_handle_(ep_ctx_handle), session_context_(session_context) { + EPCtxHandler& ep_ctx_handle) : 
ep_ctx_handle_(ep_ctx_handle), + session_context_(session_context), + shared_context_{shared_context} { subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph); subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) { @@ -360,7 +363,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, IsQDQGraph(subgraph)) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] QDQ optimization pass status: 1"; std::unique_ptr model; - Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model); + Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, model, shared_context_.shared_weights); auto model_proto = model->ToProto(); model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); print_model_proto_duration(); diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 43dc9ceaa558e..cdc27701ec2e6 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -19,7 +19,8 @@ namespace openvino_ep { // Singleton class that manages all the backends class BackendManager { public: - BackendManager(const SessionContext& session_context, + BackendManager(SessionContext& session_context, + SharedContext& shared_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, @@ -51,7 +52,8 @@ class BackendManager { std::map> backend_map_; SubGraphContext subgraph_context_; EPCtxHandler& ep_ctx_handle_; - SessionContext session_context_; + SessionContext& session_context_; + SharedContext& shared_context_; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index c447a7847434a..4adf9f5b89833 100644 --- 
a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -48,7 +48,7 @@ CreateOVModel(const std::string model, std::cout << "CreateNgraphFunc" << std::endl; } try { - auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name); + auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 2947d43b4600b..f96fec345eef1 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -16,17 +16,28 @@ namespace openvino_ep { namespace fs = std::filesystem; struct SharedContext { - struct shared_weight_key { - std::string_view name; - std::string location; - }; - struct shared_weight_value { - unsigned int data_offset; - unsigned int size; - ov::Tensor* tensor; - }; - std::map shared_weight_map; - fs::path bin_pathname; + struct SharedWeights { + struct Metadata { + struct Key { + std::string name; + bool operator==(const Key&) const = default; + }; + struct KeyHash { + std::size_t operator()(const Key& key) const noexcept { + return std::hash()(key.name); + } + }; + struct Value { + std::string location; + unsigned int data_offset; + unsigned int size; + ov::Tensor* tensor; + }; + using Map = std::unordered_map; + }; + Metadata::Map metadata; + fs::path external_weight_filename; + } shared_weights; }; using config_t = std::map; @@ -73,7 +84,7 @@ struct SessionContext : ProviderInfo { OVCore ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::string onnx_model_name; - std::string onnx_model_path_name; + std::filesystem::path onnx_model_path_name; int onnx_opset_version; bool use_api_2; const std::vector 
OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index ab7604e1344f2..684a2c64237b8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -12,6 +12,7 @@ #include "core/providers/openvino/backend_manager.h" #include "core/providers/openvino/onnx_ctx_model_helper.h" #include "core/providers/openvino/ov_versions/capability.h" +#include "core/providers/openvino/qdq_transformations/qdq_stripping.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "openvino/core/version.hpp" #ifdef USE_OVEP_NPU_MEMORY @@ -103,7 +104,7 @@ void AdjustProviderInfo(ProviderInfo& info) { << "Choosing Device: " << info.device_type << " , Precision: " << info.precision; } -OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context) +OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context) : IExecutionProvider{onnxruntime::kOpenVINOExecutionProvider}, session_context_(info), shared_context_{shared_context}, @@ -198,6 +199,7 @@ common::Status OpenVINOExecutionProvider::Compile( // For original model, check if the user wants to export a model with pre-compiled blob auto& backend_manager = backend_managers_.emplace_back(session_context_, + shared_context_, fused_node, graph_body_viewer, logger, @@ -239,6 +241,16 @@ common::Status OpenVINOExecutionProvider::Compile( } } + if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { + // Metadata is generated only for shared contexts + // If metadata is generated then only save it if also saving epcontext (so_context_enable) + // If saving metadata then save it to the provided path + 
std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); + metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; + metadata_name.replace_extension("bin"); + dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); + } + return status; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index d35dc5513ed1d..95d7027fd70e3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -52,7 +52,7 @@ static std::vector split(const std::string& s, char delim) { // Logical device representation. class OpenVINOExecutionProvider : public IExecutionProvider { public: - explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext* shared_context = nullptr); + explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); ~OpenVINOExecutionProvider() = default; std::vector> @@ -76,7 +76,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { #endif private: SessionContext session_context_; - SharedContext* shared_context_{nullptr}; + SharedContext& shared_context_; std::list backend_managers_; // EP session owns the backend objects EPCtxHandler ep_ctx_handle_; }; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 2028979c1c87d..7b0d6c6751120 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,7 +14,7 @@ namespace onnxruntime { namespace openvino_ep { struct OpenVINOProviderFactory : IExecutionProviderFactory { - OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext* shared_context) + OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& 
shared_context) : provider_info_(provider_info), shared_context_(shared_context) {} ~OpenVINOProviderFactory() override {} @@ -23,7 +23,7 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { private: ProviderInfo provider_info_; - SharedContext* shared_context_; + SharedContext& shared_context_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { @@ -282,9 +282,14 @@ struct OpenVINO_Provider : Provider { pi.cache_dir = std::move(so_context_file_path); } - SharedContext* shared_context = pi.so_share_ep_contexts ? &shared_context_ : nullptr; + // Append values to config to support weight-as-inputs conversion for shared contexts + if (pi.so_share_ep_contexts) { + ov::AnyMap map; + map["NPU_COMPILATION_MODE_PARAMS"] = "enable-wd-blockarg-input=true compute-layers-with-higher-precision=Sqrt,Power,ReduceSum"; + pi.load_config["NPU"] = map; + } - return std::make_shared(pi, shared_context); + return std::make_shared(pi, shared_context_); } void Initialize() override { diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 7c1e850b0b7a0..2ba6e03dd4d8e 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -56,7 +56,7 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph, std::set& initializers_to_keep, const NodeUnitIODef& io_def) { const std::string& name = io_def.node_arg.Name(); - const ONNX_NAMESPACE::TypeProto* orig_type_proto = io_def.node_arg.TypeAsProto(); + const auto* orig_type_proto = io_def.node_arg.TypeAsProto(); // Handle quantized input or output. Convert to float type. 
if (io_def.quant_param.has_value()) { @@ -68,11 +68,11 @@ static NodeArg& ProcessNodeUnitIO(onnxruntime::Graph& dst_graph, ORT_ENFORCE(tensor_proto_iter != src_initializers.end(), "Unable to find scale initializer ", scale_initializer_name); - const ONNX_NAMESPACE::TensorProto* scale_tensor_proto = tensor_proto_iter->second; + const auto* scale_tensor_proto = tensor_proto_iter->second; int32_t float_type = scale_tensor_proto->data_type(); // Noe set the arg type to the float type of scale. Could be one of float/float16/bfloat16 - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(orig_type_proto); type_proto->mutable_tensor_type()->set_elem_type(float_type); @@ -457,7 +457,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti if (duplicate_dq && GetQDQDataType(&node_unit.GetNode()) != DT_UINT16 && GetQDQDataType(&node_unit.GetNode()) != DT_INT16) { std::string orig_dq_name = node_unit.Outputs()[0].node_arg.Name(); // ex: dql_output/duplicated - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); type_proto->copy_from(node_unit.Inputs()[0].node_arg.TypeAsProto()); type_proto->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); orig_dq_name.erase(orig_dq_name.find(DuplicateDQ), std::string::npos); // ex: dql_output @@ -625,93 +625,93 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, KeepInitsInDstGraph(initializers_to_keep, src_graph, &target_node); } -static void AddInitializerAsInput (onnxruntime::Graph& dst_graph, +static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, InlinedVector& accumulated_inputs, const onnxruntime::GraphViewer& src_graph, const std::string& initializer_name) { - // Get the initializer from source graph - const auto& src_initializers = src_graph.GetAllInitializedTensors(); - auto init_iter = 
src_initializers.find(initializer_name); + // Get the initializer from source graph + const auto& src_initializers = src_graph.GetAllInitializedTensors(); + auto init_iter = src_initializers.find(initializer_name); - if (init_iter == src_initializers.end()) { - // Initializer not found - return; - } + if (init_iter == src_initializers.end()) { + // Initializer not found + return; + } - const ONNX_NAMESPACE::TensorProto* tensor_proto = init_iter->second; + const auto* tensor_proto = init_iter->second; - // Create TypeProto for the initializer - std::unique_ptr type_proto = ONNX_NAMESPACE::TypeProto::Create(); - auto* tensor_type = type_proto->mutable_tensor_type(); - tensor_type->set_elem_type(tensor_proto->data_type()); + // Create TypeProto for the initializer + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + auto* tensor_type = type_proto->mutable_tensor_type(); + tensor_type->set_elem_type(tensor_proto->data_type()); - for (int i = 0; i < tensor_proto->dims_size(); ++i) { - tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); - } + for (int i = 0; i < tensor_proto->dims_size(); ++i) { + tensor_type->mutable_shape()->add_dim()->set_dim_value(tensor_proto->dims().Get(i)); + } - // Create NodeArg for the initializer - auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); + // Create NodeArg for the initializer + auto& input_arg = dst_graph.GetOrCreateNodeArg(initializer_name, type_proto.get()); - // Check if input already exists in accumulated inputs - bool input_exists = false; - for (const auto* existing_input : accumulated_inputs) { - if (existing_input->Name() == initializer_name) { - input_exists = true; - break; - } + // Check if input already exists in accumulated inputs + bool input_exists = false; + for (const auto* existing_input : accumulated_inputs) { + if (existing_input->Name() == initializer_name) { + input_exists = true; + break; } + } - if (!input_exists) { - // Add to 
accumulated inputs - accumulated_inputs.push_back(&input_arg); - } + if (!input_exists) { + // Add to accumulated inputs + accumulated_inputs.push_back(&input_arg); + } } -bool writeString(std::ofstream& outfile, const std::string& str) { - size_t size = str.size(); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; +template +bool writeScalar(std::ofstream& outfile, const T& scalar) { + auto size = sizeof(T); + outfile.write(reinterpret_cast(&size), sizeof(size)); + if (!outfile.good()) return false; - outfile.write(str.c_str(), size); - return outfile.good(); + outfile.write(reinterpret_cast(&scalar), size); + return outfile.good(); } -bool writeStringVector(std::ofstream& outfile, const std::vector& vec) { - size_t size = vec.size(); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; +template <> +bool writeScalar(std::ofstream& outfile, const std::string& text) { + auto size = text.size() * sizeof(std::string::value_type); + outfile.write(reinterpret_cast(&size), size); + if (!outfile.good()) return false; - for (const auto& str : vec) { - if (!writeString(outfile, str)) { - return false; - } - } - return true; + outfile.write(text.data(), size); + return outfile.good(); } // Main function to dump the map to a binary file -bool dumpMetaDataMapToBinary(const std::unordered_map>& map, const std::string& filename) { - +bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::string& filename) { std::ofstream outfile(filename, std::ios::binary); if (!outfile.is_open()) { - ORT_THROW("Error: Could not open file for writing metadata."); - return false; + ORT_THROW("Error: Could not open file for writing metadata."); + return false; } // Write the size of the map - size_t map_size = map.size(); + size_t map_size = metadata.size(); outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); if (!outfile.good()) { - ORT_THROW("Error: Failed to write map size."); - 
return false; + ORT_THROW("Error: Failed to write map size."); + return false; } // Write each key-value pair - for (const auto& pair : map) { - if (!writeString(outfile, pair.first) || !writeStringVector(outfile, pair.second)) { - ORT_THROW("Error: Failed to write map data."); - return false; - } + for (const auto& [key, value] : metadata) { + bool result = true; + result &= writeScalar(outfile, key.name); + result &= writeScalar(outfile, value.location); + result &= writeScalar(outfile, value.data_offset); + result &= writeScalar(outfile, value.size); + + ORT_ENFORCE(result, "Error: Failed to write map data."); } return true; @@ -721,7 +721,8 @@ bool dumpMetaDataMapToBinary(const std::unordered_map& model) { + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights) { // NOTE: This function is a re-implementation of GraphViewerToProto() in core/graph/graph_proto_serializer.cc // with the following differences: // - Uses onnxruntime::Graph APIs instead of onnx::GraphProto APIs. @@ -819,7 +820,6 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, seen_node_units.insert(node_unit); } - // Copy initializers to dst graph. 
std::unordered_set current_scope_initializer_set; @@ -834,97 +834,94 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, std::sort(const_inits.begin(), const_inits.end()); // initialize map for creating metadata for initilizers with external weights - std::unordered_map> metadata_map; + auto& metadata = shared_weights.metadata; + + const auto& insert_metadata = [&metadata](const std::string& name, ONNX_NAMESPACE::StringStringEntryProtos* entry_protos) { + // key: [name], value: [location, offset, length] + sw::Metadata::Map::key_type key{name}; + sw::Metadata::Map::mapped_type value{}; + + for (int i = 0; i < entry_protos->size(); i++) { + auto& string_entry_proto{entry_protos->at(i)}; + const auto& pb_key{*(string_entry_proto.mutable_key())}; + const auto& pb_value{*(string_entry_proto.mutable_value())}; + if (pb_key == "location") { + value.location = pb_value; + } else if (pb_key == "offset") { + value.data_offset = std::stoul(pb_value); + } else if (pb_key == "length") { + value.size = std::stoul(pb_value); + } + } + metadata.emplace(key, value); + }; // metadata structure: initializer_name as key // and [location, offset, length] as value for (auto& it : const_inits) { - const auto* initializer_tensor = initializers.at(it); - - // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - enable_ovep_weight_sharing) { + const auto* initializer_tensor = initializers.at(it); - // Cast away const to access mutable_external_data - struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { + // Cast away const to access mutable_external_data + auto* 
non_const_initializer_tensor = const_cast(initializer_tensor); - // get meta data about the initilizers with external data - struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + // get meta data about the initilizers with external data + auto* external_data = non_const_initializer_tensor->mutable_external_data(); - std::vector init_info; - // init_info structure: [location, offset, length] + insert_metadata(initializer_tensor->name(), external_data); - for (int i = 0 ; i < external_data->size() ; i++) { - init_info.push_back(*external_data->at(i).mutable_value()); - } + // Add initializer with external data as input + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - metadata_map.emplace(initializer_tensor->name(), init_info); - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - - } else { - // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(it)) - dst_graph.AddInitializedTensor(*(initializers.at(it))); - - } + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(it)) + dst_graph.AddInitializedTensor(*(initializers.at(it))); + } - current_scope_initializer_set.insert(it); + current_scope_initializer_set.insert(it); } // Handle outer-scope constant initializers for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { - const auto& node = src_graph.GetNode(node_idx); - for (const auto& input : node->InputDefs()) { - if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { - continue; - } - - if (src_graph.IsConstantInitializer(input->Name(), true)) { - const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); - // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - 
initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && - enable_ovep_weight_sharing) { - - // Cast away const to access mutable_external_data - struct ONNX_NAMESPACE::TensorProto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - struct ONNX_NAMESPACE::StringStringEntryProtos* external_data = non_const_initializer_tensor->mutable_external_data(); + const auto& node = src_graph.GetNode(node_idx); + for (const auto& input : node->InputDefs()) { + if (current_scope_initializer_set.find(input->Name()) != current_scope_initializer_set.end()) { + continue; + } - std::vector init_info; - for (int i = 0 ; i < external_data->size() ; i++) { - init_info.push_back(*external_data->at(i).mutable_value()); - } + if (src_graph.IsConstantInitializer(input->Name(), true)) { + const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + // Check if the initializer has external data + if (initializer_tensor->has_data_location() && + initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + enable_ovep_weight_sharing) { + // Cast away const to access mutable_external_data + auto* non_const_initializer_tensor = const_cast(initializer_tensor); - metadata_map.emplace(initializer_tensor->name(), init_info); + // get meta data about the initilizers with external data + auto* external_data = non_const_initializer_tensor->mutable_external_data(); - // Add initializer as input if it has external data - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); + insert_metadata(initializer_tensor->name(), external_data); - } else { - // Add as an initialized tensor if it does not have external data - if (initializers_to_keep.count(input->Name())) { - dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); - } - } + // Add initializer as input if it has external 
data + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); - current_scope_initializer_set.insert(input->Name()); + } else { + // Add as an initialized tensor if it does not have external data + if (initializers_to_keep.count(input->Name())) { + dst_graph.AddInitializedTensor(*(src_graph.GetConstantInitializer(input->Name(), true))); } + } + + current_scope_initializer_set.insert(input->Name()); } - } - if (enable_ovep_weight_sharing) { - // creating bin file of metadata_map and dumping the bin file - if (dumpMetaDataMapToBinary(metadata_map, "metadata.bin")) { - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Metadata for external initializer dumped."; - } else { - ORT_THROW("Error: Unable to write metadat to file."); } } - accumulated_inputs.insert(accumulated_inputs.end(), dst_graph_inputs.begin(), dst_graph_inputs.end()); // Set all inputs (original inputs amnd initializers as inputs) of the destination Graph diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h index 5b777a388adda..02831525cba32 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.h @@ -5,15 +5,20 @@ #include #include "core/providers/shared_library/provider_api.h" +#include "core/providers/openvino/contexts.h" namespace onnxruntime { namespace openvino_ep { +using sw = SharedContext::SharedWeights; + // Creates a new model without the DQ/Q operators in the src graph as per pre-defined rulesets Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, bool enable_ovep_weight_sharing, - /*out*/ std::unique_ptr& model); + /*out*/ std::unique_ptr& model, + /*out*/ sw& shared_weights); +bool dumpMetaDataMapToBinary(const sw::Metadata::Map& shared_weights, const std::string& filename); } // namespace openvino_ep } // namespace 
onnxruntime From 01ac259a838732fae0cb5125fa5cbed9c61f95a2 Mon Sep 17 00:00:00 2001 From: ankitm3k Date: Mon, 13 Jan 2025 12:21:17 +0530 Subject: [PATCH 14/35] fix: fix provider options --- .../providers/openvino/openvino_execution_provider.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 684a2c64237b8..d02a642699a82 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -69,11 +69,11 @@ void AdjustProviderInfo(ProviderInfo& info) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" << "No runtime device selection option provided."; #if defined OPENVINO_CONFIG_CPU - device_type_ = "CPU"; - precision_ = "FP32"; + info.device_type = "CPU"; + info.precision = "FP32"; #elif defined OPENVINO_CONFIG_GPU - device_type_ = "GPU"; - precision_ = "FP16"; + info.device_type = "GPU"; + info.precision = "FP16"; #elif defined OPENVINO_CONFIG_NPU info.device_type = "NPU"; info.precision = "FP16"; From db075cd1475356c1e54cf37480e7ba089b80fd3d Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 13 Jan 2025 08:23:04 -0800 Subject: [PATCH 15/35] create ov tensor from meta data and external data --- .../qdq_transformations/qdq_stripping.cc | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 2ba6e03dd4d8e..79529a8586be0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -717,6 +717,68 @@ bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::strin return true; } +// Helper function to read binary data from a 
file +std::vector readBinaryData(const std::string& filePath, size_t offset, size_t length) { + std::vector data(length / sizeof(float), 0); + std::ifstream file(filePath, std::ios::binary); + if (!file) { + throw std::runtime_error("Failed to open file: " + filePath); + } + + file.seekg(offset, std::ios::beg); + file.read(reinterpret_cast(data.data()), length); + + if (!file) { + throw std::runtime_error("Error reading from file: " + filePath); + } + return data; +} + +// Function to handle tensor creation from external data +void CreateOVTensor(const ONNX_NAMESPACE::TensorProto* initializer_tensor, + onnxruntime::openvino_ep::SharedContext::SharedWeights::Metadata::Map& metadata_map) { + + for (auto itr: metadata_map) { + if (initializer_tensor->name() == itr.first.name) { + std::string filePath = itr.second.location; + std::uint32_t offset = itr.second.data_offset; + std::uint32_t length = itr.second.size; + + // Read binary data + auto rawData = readBinaryData(filePath, offset, length); + + // Get dimensions + std::vector shape; + for (auto itt = 0 ; itt < initializer_tensor->dims().size() ; itt++) { + shape.push_back(initializer_tensor->dims()[itt]); + } + + // Create OpenVINO Tensor + ov::element::Type elementType = ov::element::f32; + ov::Tensor tensor(elementType, shape, rawData.data()); + } + } +} + +ov::element::Type GetOpenVINOElementType(int onnx_data_type) { + switch (onnx_data_type) { + case 1: return ov::element::f32; // FLOAT + case 2: return ov::element::u8; // UINT8 + case 3: return ov::element::i8; // INT8 + case 4: return ov::element::u16; // UINT16 + case 5: return ov::element::i16; // INT16 + case 6: return ov::element::i32; // INT32 + case 7: return ov::element::i64; // INT64 + case 9: return ov::element::boolean; // BOOL + case 10: return ov::element::f16; // FLOAT16 + case 11: return ov::element::f64; // DOUBLE + case 12: return ov::element::u32; // UINT32 + case 13: return ov::element::u64; // UINT64 + default: + throw 
std::runtime_error("Unsupported ONNX data type: " + std::to_string(onnx_data_type)); + } +} + // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -858,7 +920,7 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, }; // metadata structure: initializer_name as key // and [location, offset, length] as value - + std::cout << typeid(metadata).name(); for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); @@ -866,6 +928,10 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (initializer_tensor->has_data_location() && initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { + + int onnx_data_type = initializer_tensor->data_type(); // Get ONNX data type + ov::element::Type elementType = GetOpenVINOElementType(onnx_data_type); // Map to OpenVINO data type + // Cast away const to access mutable_external_data auto* non_const_initializer_tensor = const_cast(initializer_tensor); @@ -877,6 +943,8 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // Add initializer with external data as input AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); + // Create OV tensor based on external data and metadata + CreateOVTensor(initializer_tensor, metadata); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) From 8209162949715e54acb672b696da61d8b0467030 Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Mon, 13 Jan 2025 08:25:48 -0800 Subject: [PATCH 16/35] create ov tensor --- .../core/providers/openvino/qdq_transformations/qdq_stripping.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 
79529a8586be0..4bdc72f643018 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -920,7 +920,6 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, }; // metadata structure: initializer_name as key // and [location, offset, length] as value - std::cout << typeid(metadata).name(); for (auto& it : const_inits) { const auto* initializer_tensor = initializers.at(it); From 89ebe8d6765ede635514d6560c0cd011496ff13f Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 14 Jan 2025 21:53:02 -0800 Subject: [PATCH 17/35] Add support for binding weight as input tensors --- .../providers/openvino/backend_manager.cc | 23 ++++ .../core/providers/openvino/backend_utils.cc | 92 +++++++++++++- .../core/providers/openvino/backend_utils.h | 4 + .../openvino/backends/backend_factory.cc | 3 +- .../openvino/backends/basic_backend.cc | 21 +++- .../openvino/backends/basic_backend.h | 6 +- .../core/providers/openvino/contexts.h | 23 +++- .../core/providers/openvino/ibackend.h | 1 + .../core/providers/openvino/ov_interface.cc | 17 ++- .../core/providers/openvino/ov_interface.h | 4 +- .../qdq_transformations/qdq_stripping.cc | 115 ++++-------------- 11 files changed, 202 insertions(+), 107 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index c06e00272a8c8..d9ef25cefbf59 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,6 +104,24 @@ BackendManager::BackendManager(SessionContext& session_context, } std::string device_type = session_context_.device_type; + auto& sw = shared_context_.shared_weights; + if (session_context_.so_share_ep_contexts) { + std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); + if (sw.external_weight_filename.empty()) + { + 
sw.external_weight_filename = sw.metadata.begin()->second.location; + } + weight_filename /= sw.external_weight_filename; + std::ifstream weight_file(weight_filename); + + if (weight_file) { + if (!sw.mapped_weights) { + sw.mapped_weights = std::make_unique(weight_filename); + } + backend_utils::CreateOVTensors(sw.metadata, sw.mapped_weights->weight_data); + } + } + if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims"; @@ -116,6 +134,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); @@ -139,6 +158,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (const OnnxRuntimeException& ex) { std::string exception_str = ex.what(); @@ -158,6 +178,7 @@ BackendManager::BackendManager(SessionContext& session_context, concrete_backend_ = BackendFactory::MakeBackend(model_proto, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); @@ -489,6 +510,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (const OnnxRuntimeException& ex) { // Build option disables fallback to CPU on compilation failures with NPU. 
@@ -508,6 +530,7 @@ void BackendManager::Compute(OrtKernelContext* context) { dynamic_backend = BackendFactory::MakeBackend(modelproto_with_concrete_shapes, session_context_, subgraph_context_, + shared_context_, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 4adf9f5b89833..440b2e9bc5019 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -12,10 +12,46 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" +#include "Windows.h" + using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { + +SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { + file_ = CreateFile(filename.string().data(), + GENERIC_READ, + FILE_SHARE_READ, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); + ORT_ENFORCE(file_ != nullptr, "Unable to open weight file at ", filename.string()); + + mapping_ = CreateFileMapping(file_, 0, PAGE_READONLY, 0, 0, 0); + ORT_ENFORCE(mapping_ != nullptr, "Unable to create mapping of weight file at ", filename.string()); + + const char* raw_data = static_cast(MapViewOfFile(mapping_, FILE_MAP_READ, 0, 0, 0)); + ORT_ENFORCE(raw_data != nullptr, "Unable to map weight file at ", filename.string()); + + weight_data = std::string_view(raw_data, std::filesystem::file_size(filename)); +} + +SharedContext::SharedWeights::MappedWeights::~MappedWeights() { + if (!weight_data.empty()) { + UnmapViewOfFile(weight_data.data()); + } + if (mapping_ != nullptr) { + CloseHandle(mapping_); + mapping_ = nullptr; + } + if (file_ != nullptr) { + CloseHandle(file_); + file_ = nullptr; + } +} + namespace backend_utils { bool IsDebugEnabled() { @@ -34,11 +70,6 @@ bool IsCILogEnabled() { return false; } -struct static_cast_int64 { - template // T1 models type 
statically convertible to T - int64_t operator()(const T1& x) const { return static_cast(x); } -}; - std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, @@ -268,6 +299,57 @@ void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std printPerformanceCounts(performanceMap, stream, std::move(deviceName)); } +ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) { + static std::unordered_map map{ + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8}, + {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16}, + {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16}, + {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32}, + {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64}, + {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string}, + {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean}, + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16}, + {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64}, + //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined}, + //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2}, + //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined}, + {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4}, + 
{ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4}, + }; + + if (auto result = map.find(dt); result != map.end()) { + return result->second; + } else { + throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt)); + } +} + +// Function to handle tensor creation from external data +void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { + for (auto& [key, value] : metadata_map) { + if (value.tensor) continue; + + // Get tensor data + const auto* tensor_data = weights.data() + value.data_offset; + + // Get element data type + auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; + ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type + + // Create OpenVINO Tensor + value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); + } +} + } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 0d7378072cb1b..a2e16f5dbbfa9 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -66,6 +67,9 @@ CreateOVModel(const std::string model, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); +void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, + std::string_view weights); + void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc 
b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 2fd9a7fa0a537..fedc3f21c8e33 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,6 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext &shared_context, ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || @@ -23,7 +24,7 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p type.find("AUTO") != std::string::npos) { std::shared_ptr concrete_backend_; try { - concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, model_stream); + concrete_backend_ = std::make_shared(model_proto, session_context, subgraph_context, shared_context, model_stream); } catch (std::string const& msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index fb0fdc9b5e85b..6202f9cd95f85 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -23,8 +23,9 @@ using namespace backend_utils; BasicBackend::BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext& shared_context, ptr_stream_t& model_stream) - : session_context_(session_context), subgraph_context_(subgraph_context) { + : session_context_{session_context}, subgraph_context_{subgraph_context}, shared_context_{shared_context} { std::string& hw_target = session_context_.device_type; if (ValidateSubgraph(const_outputs_map_)) @@ -123,8 +124,24 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } catch (const char* msg) { ORT_THROW(msg); } + int num_infer_req = 
(session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1; - inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req)); + std::function initializer = [](OVInferRequestPtr) {}; + auto metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts) { + initializer = [&metadata](OVInferRequestPtr ir_ptr) { + const auto input_count = ir_ptr->GetNumInputs(); + for (auto i = 0; i < input_count; i++) { + using Key = SharedContext::SharedWeights::Metadata::Key; + const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; + if (metadata.contains(tensor_key)) { + auto& value = metadata.at(tensor_key); + ir_ptr->SetTensor(tensor_key.name, value.tensor); + } + } + }; + } + inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req, initializer)); } bool BasicBackend::ValidateSubgraph(std::map>& const_outputs_map) { diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 177784a71f575..22bcc4c1da40e 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/openvino/contexts.h" @@ -32,6 +33,7 @@ class BasicBackend : public IBackend { BasicBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext& shared_context, ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; @@ -57,6 +59,7 @@ class BasicBackend : public IBackend { SessionContext& session_context_; SubGraphContext subgraph_context_; + SharedContext& shared_context_; mutable std::mutex compute_lock_; OVExeNetwork exe_network_; std::map> const_outputs_map_; @@ -71,10 +74,11 @@ class BasicBackend : public IBackend { class 
InferRequestsQueue { public: - InferRequestsQueue(OVExeNetwork& net, size_t nireq) { + InferRequestsQueue(OVExeNetwork& net, size_t nireq, std::function initializer) { OVInferRequestPtr infer_request; for (size_t id = 0; id < nireq; id++) { infer_request = std::make_shared(net.CreateInferRequest()); + initializer(infer_request); infer_requests_.push_back(infer_request); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index f96fec345eef1..68c0ecf87004b 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include "core/common/common.h" #include "core/providers/openvino/ov_interface.h" namespace onnxruntime { @@ -31,12 +33,29 @@ struct SharedContext { std::string location; unsigned int data_offset; unsigned int size; - ov::Tensor* tensor; + std::vector dimensions; + std::int32_t element_type; + std::shared_ptr tensor; }; using Map = std::unordered_map; }; - Metadata::Map metadata; + + struct MappedWeights { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MappedWeights); + ~MappedWeights(); + MappedWeights() = delete; + explicit MappedWeights(std::filesystem::path filename); + + std::string_view weight_data; + + private: + void* file_; + void* mapping_; + }; + fs::path external_weight_filename; + std::unique_ptr mapped_weights; + Metadata::Map metadata; } shared_weights; }; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 0d440eee598d3..dfa669aace875 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -24,6 +24,7 @@ class BackendFactory { MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, + SharedContext &shared_context, ptr_stream_t& model_stream); }; diff --git 
a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 804db5b726fc5..5b853539c31ea 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -197,7 +197,18 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { } } -void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { +std::string OVInferRequest::GetInputTensorName(uint32_t index) { + try { + const auto &model = ovInfReq.get_compiled_model(); + return *model.input(index).get_names().begin(); + } catch (const Exception& e) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); + } catch (...) { + ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index); + } +} + +void OVInferRequest::SetTensor(const std::string &name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { @@ -207,6 +218,10 @@ void OVInferRequest::SetTensor(std::string name, OVTensorPtr& blob) { } } +uint32_t OVInferRequest::GetNumInputs() { + return ovInfReq.get_compiled_model().inputs().size(); +} + void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 550c7962cca13..5d88994dbabb0 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -86,8 +86,10 @@ class OVInferRequest { ov::InferRequest ovInfReq; public: + uint32_t GetNumInputs(); OVTensorPtr GetTensor(const std::string& name); - void SetTensor(std::string name, OVTensorPtr& blob); + std::string GetInputTensorName(uint32_t index); + void SetTensor(const std::string& name, OVTensorPtr& blob); void StartAsync(); void Infer(); void WaitRequest(); diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc 
b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 4bdc72f643018..019e121b4f575 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -717,68 +717,6 @@ bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::strin return true; } -// Helper function to read binary data from a file -std::vector readBinaryData(const std::string& filePath, size_t offset, size_t length) { - std::vector data(length / sizeof(float), 0); - std::ifstream file(filePath, std::ios::binary); - if (!file) { - throw std::runtime_error("Failed to open file: " + filePath); - } - - file.seekg(offset, std::ios::beg); - file.read(reinterpret_cast(data.data()), length); - - if (!file) { - throw std::runtime_error("Error reading from file: " + filePath); - } - return data; -} - -// Function to handle tensor creation from external data -void CreateOVTensor(const ONNX_NAMESPACE::TensorProto* initializer_tensor, - onnxruntime::openvino_ep::SharedContext::SharedWeights::Metadata::Map& metadata_map) { - - for (auto itr: metadata_map) { - if (initializer_tensor->name() == itr.first.name) { - std::string filePath = itr.second.location; - std::uint32_t offset = itr.second.data_offset; - std::uint32_t length = itr.second.size; - - // Read binary data - auto rawData = readBinaryData(filePath, offset, length); - - // Get dimensions - std::vector shape; - for (auto itt = 0 ; itt < initializer_tensor->dims().size() ; itt++) { - shape.push_back(initializer_tensor->dims()[itt]); - } - - // Create OpenVINO Tensor - ov::element::Type elementType = ov::element::f32; - ov::Tensor tensor(elementType, shape, rawData.data()); - } - } -} - -ov::element::Type GetOpenVINOElementType(int onnx_data_type) { - switch (onnx_data_type) { - case 1: return ov::element::f32; // FLOAT - case 2: return ov::element::u8; // UINT8 - case 3: return ov::element::i8; // INT8 - case 4: 
return ov::element::u16; // UINT16 - case 5: return ov::element::i16; // INT16 - case 6: return ov::element::i32; // INT32 - case 7: return ov::element::i64; // INT64 - case 9: return ov::element::boolean; // BOOL - case 10: return ov::element::f16; // FLOAT16 - case 11: return ov::element::f64; // DOUBLE - case 12: return ov::element::u32; // UINT32 - case 13: return ov::element::u64; // UINT64 - default: - throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(onnx_data_type)); - } -} - // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -898,11 +836,13 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, // initialize map for creating metadata for initilizers with external weights auto& metadata = shared_weights.metadata; - const auto& insert_metadata = [&metadata](const std::string& name, ONNX_NAMESPACE::StringStringEntryProtos* entry_protos) { - // key: [name], value: [location, offset, length] - sw::Metadata::Map::key_type key{name}; + const auto& insert_metadata = [&metadata](const ONNX_NAMESPACE::TensorProto& proto) { + sw::Metadata::Map::key_type key{proto.name()}; sw::Metadata::Map::mapped_type value{}; + using mutable_proto_t = ONNX_NAMESPACE::TensorProto*; + auto& mutable_proto = *const_cast(&proto); + auto* entry_protos = mutable_proto.mutable_external_data(); for (int i = 0; i < entry_protos->size(); i++) { auto& string_entry_proto{entry_protos->at(i)}; const auto& pb_key{*(string_entry_proto.mutable_key())}; @@ -915,35 +855,28 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, value.size = std::stoul(pb_value); } } + value.element_type = proto.data_type(); + value.dimensions.resize(proto.dims_size()); + for (uint32_t index = 0; auto& dim : value.dimensions) { + dim = proto.dims()[index++]; + } - metadata.emplace(key, value); + metadata.emplace(key, std::move(value)); }; - // 
metadata structure: initializer_name as key - // and [location, offset, length] as value + + // Handle constant initializers for (auto& it : const_inits) { - const auto* initializer_tensor = initializers.at(it); + const auto& initializer_tensor = *initializers.at(it); // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + if (initializer_tensor.has_data_location() && + initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { - - int onnx_data_type = initializer_tensor->data_type(); // Get ONNX data type - ov::element::Type elementType = GetOpenVINOElementType(onnx_data_type); // Map to OpenVINO data type - - // Cast away const to access mutable_external_data - auto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - auto* external_data = non_const_initializer_tensor->mutable_external_data(); - - insert_metadata(initializer_tensor->name(), external_data); + insert_metadata(initializer_tensor); // Add initializer with external data as input AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, it); - // Create OV tensor based on external data and metadata - CreateOVTensor(initializer_tensor, metadata); } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(it)) @@ -962,18 +895,12 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, } if (src_graph.IsConstantInitializer(input->Name(), true)) { - const auto* initializer_tensor = src_graph.GetConstantInitializer(input->Name(), true); + const auto& initializer_tensor = *src_graph.GetConstantInitializer(input->Name(), true); // Check if the initializer has external data - if (initializer_tensor->has_data_location() && - initializer_tensor->data_location() == 
ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && + if (initializer_tensor.has_data_location() && + initializer_tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL && enable_ovep_weight_sharing) { - // Cast away const to access mutable_external_data - auto* non_const_initializer_tensor = const_cast(initializer_tensor); - - // get meta data about the initilizers with external data - auto* external_data = non_const_initializer_tensor->mutable_external_data(); - - insert_metadata(initializer_tensor->name(), external_data); + insert_metadata(initializer_tensor); // Add initializer as input if it has external data AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, input->Name()); From ae408afe0e09bbf341095d09442ebe8a0015fb89 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Wed, 15 Jan 2025 21:14:03 -0800 Subject: [PATCH 18/35] Fix for mapping subgraph to ov compiled network arguments --- .../providers/openvino/backend_manager.cc | 35 +++++----------- .../core/providers/openvino/backend_utils.cc | 4 +- .../core/providers/openvino/backend_utils.h | 4 +- .../openvino/backends/backend_factory.cc | 2 +- .../openvino/backends/basic_backend.cc | 41 ++++++++----------- .../core/providers/openvino/contexts.h | 6 +-- .../core/providers/openvino/ibackend.h | 2 +- .../core/providers/openvino/ov_interface.cc | 4 +- 8 files changed, 39 insertions(+), 59 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d9ef25cefbf59..d1564836cb247 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -71,30 +71,16 @@ BackendManager::BackendManager(SessionContext& session_context, // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). 
- auto node_input_defs = fused_node.InputDefs(); - int i = 0; - for (auto idef : node_input_defs) { - subgraph_context_.input_names.insert({idef->Name(), i}); - i++; + for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) { + subgraph_context_.input_names.insert({node->Name(), index++}); } - const std::vector& graph_inputs = subgraph.GetInputs(); - for (auto input : graph_inputs) { - auto it = subgraph_context_.input_names.find(input->Name()); - if (it == subgraph_context_.input_names.end()) { - ORT_THROW("Input not found in the input defs list"); - } - int index = it->second; - subgraph_context_.input_indexes.push_back(index); + for (uint32_t index = 0; const auto& node : subgraph.GetOutputs()) { + subgraph_context_.output_names.insert({node->Name(), index++}); } - auto graph_outputs_defs = fused_node.OutputDefs(); - i = 0; - for (auto output_def : graph_outputs_defs) { - subgraph_context_.output_names.insert({output_def->Name(), i}); - i++; - } subgraph_context_.subgraph_name = fused_node.Name(); + ptr_stream_t model_stream; std::unique_ptr model_proto; if (subgraph_context_.is_ep_ctx_graph) { @@ -107,8 +93,7 @@ BackendManager::BackendManager(SessionContext& session_context, auto& sw = shared_context_.shared_weights; if (session_context_.so_share_ep_contexts) { std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); - if (sw.external_weight_filename.empty()) - { + if (sw.external_weight_filename.empty()) { sw.external_weight_filename = sw.metadata.begin()->second.location; } weight_filename /= sw.external_weight_filename; @@ -276,8 +261,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& model_proto) const { bool has_batched_inputs = true; - for (int i = 0; i < static_cast(subgraph_context_.input_indexes.size()); i++) { - auto& input = model_proto.graph().input(subgraph_context_.input_indexes[i]); + for (const auto& 
[name, index] : subgraph_context_.input_names) { + auto& input = model_proto.graph().input(index); // Batch-process only raw image inputs (NCHW or NHWC layouts) auto& shape = input.type().tensor_type().shape(); @@ -291,8 +276,8 @@ bool BackendManager::ModelHasBatchedInputs(const ONNX_NAMESPACE::ModelProto& mod break; } - for (int index = 1; index < 4; index++) { - if (shape.dim(index).value_case() != shape.dim(0).kDimValue) { + for (int dim_index = 1; dim_index < 4; dim_index++) { + if (shape.dim(dim_index).value_case() != shape.dim(0).kDimValue) { has_batched_inputs = false; break; } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 440b2e9bc5019..e37254b34b9fd 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -114,7 +114,7 @@ Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names) { + const SubGraphContext::string_index_map_t& output_names) { auto graph_output_blob = infer_request->GetTensor(output_name); auto graph_output_dims = graph_output_blob->get_shape(); @@ -139,7 +139,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node) { // Find position of '/' in the output_name int pos = output_name.find("/"); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index a2e16f5dbbfa9..cfb6adc8fbd3d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -45,14 +45,14 @@ void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr n 
Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, std::string output_name, - std::unordered_map output_names, + const SubGraphContext::string_index_map_t& output_names, std::shared_ptr node); Ort::UnownedValue GetOutputTensor(Ort::KernelContext& context, size_t batch_size, OVInferRequestPtr infer_request, std::string output_name, - std::unordered_map output_names); + const SubGraphContext::string_index_map_t& output_names); void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, std::string input_name, Ort::KernelContext& context, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index fedc3f21c8e33..99955da539ae7 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -14,7 +14,7 @@ std::shared_ptr BackendFactory::MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - SharedContext &shared_context, + SharedContext& shared_context, ptr_stream_t& model_stream) { std::string type = session_context.device_type; if (type == "CPU" || type.find("GPU") != std::string::npos || diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6202f9cd95f85..a8026d710827b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -131,7 +131,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (session_context_.so_share_ep_contexts) { initializer = [&metadata](OVInferRequestPtr ir_ptr) { const auto input_count = ir_ptr->GetNumInputs(); - for (auto i = 0; i < input_count; i++) { + for (auto i = 0u; i < input_count; i++) { using Key = SharedContext::SharedWeights::Metadata::Key; const auto tensor_key = Key{ir_ptr->GetInputTensorName(i)}; 
if (metadata.contains(tensor_key)) { @@ -357,28 +357,23 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) { // an Infer Request indexed by infer_req_idx void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) { try { - auto graph_input_info = exe_network_.Get().inputs(); - int input_idx = 0; - for (auto input_info_iter = graph_input_info.begin(); - input_info_iter != graph_input_info.end(); ++input_info_iter) { - auto input_names = input_info_iter->get_names(); - std::string onnx_input_name; - std::string input_name; - // use names retrieved from original ONNX model to assign the right onnx input name for the graph - for (auto it = subgraph_context_.input_names.begin(); it != subgraph_context_.input_names.end(); ++it) { - if (it->second == input_idx) { - onnx_input_name = it->first; + auto ov_input_info = exe_network_.Get().inputs(); + + // Loop over subgraph original input names to find the correspondent OV input name + for (const auto& [onnx_input_name, onnx_input_index] : subgraph_context_.input_names) { + std::string input_name{}; + uint32_t input_idx = 0; + for (uint32_t index = 0; const auto& ov_input : ov_input_info) { + if (ov_input.get_names().contains(onnx_input_name)) { + input_name = onnx_input_name; + input_idx = index; break; } + index++; } - // using the input name retrieved from ONNX original to match with the input names returned by OV tensors - if (input_names.find(onnx_input_name) != input_names.end()) { - input_name = std::move(onnx_input_name); - } else { - ORT_THROW(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + + ORT_ENFORCE(!input_name.empty(), log_tag, + "Input names mismatch between OpenVINO and ONNX. 
", onnx_input_name, " doesn't exist in the list of OpenVINO input tensor names"); - } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && !session_context_.disable_dynamic_shapes && @@ -395,7 +390,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque input_tensor_shape[tensor_iter] = *i; tensor_iter += 1; } - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); OVTensorPtr tensor_ptr; // avoid input copies on the CPU device if (session_context_.device_type.find("CPU") != std::string::npos) { @@ -428,7 +423,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if ((it == ort_ov_tensor_map.end()) || (it != ort_ov_tensor_map.end() && (it->second.ort_ptr != tensor.GetTensorRawData()))) { ov_tensor_data_t ov_tensor_data; - const auto& input = graph_input_info.at(input_idx); + const auto& input = ov_input_info.at(input_idx); ov_tensor_data.tensor_ptr = std::make_shared(input.get_element_type(), input.get_shape(), const_cast(tensor.GetTensorRawData())); @@ -443,8 +438,8 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } } } - input_idx++; - } + } // Loop subgraph original input names + if (session_context_.device_type.find("NPU") != std::string::npos) { // Set the output blob as remote blob auto graph_output_info = exe_network_.Get().outputs(); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 68c0ecf87004b..1e8b8fb1127ce 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -112,15 +112,15 @@ struct SessionContext : ProviderInfo { // Holds context specific to subgraph. 
struct SubGraphContext { + using string_index_map_t = std::unordered_map; bool has_dynamic_input_shape = false; bool enable_batching = false; bool set_npu_config = false; bool is_constant = false; void* context = 0; std::string subgraph_name; - std::vector input_indexes; - std::unordered_map input_names; - std::unordered_map output_names; + string_index_map_t input_names; + string_index_map_t output_names; bool is_wholly_supported_graph = false; bool has_external_weights = false; std::string model_precision; diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index dfa669aace875..d2f91cacb6c4d 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -24,7 +24,7 @@ class BackendFactory { MakeBackend(std::unique_ptr& model_proto, SessionContext& session_context, const SubGraphContext& subgraph_context, - SharedContext &shared_context, + SharedContext& shared_context, ptr_stream_t& model_stream); }; diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 5b853539c31ea..9b0e9c94c0f6e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -199,7 +199,7 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { std::string OVInferRequest::GetInputTensorName(uint32_t index) { try { - const auto &model = ovInfReq.get_compiled_model(); + const auto& model = ovInfReq.get_compiled_model(); return *model.input(index).get_names().begin(); } catch (const Exception& e) { ORT_THROW(log_tag + " Cannot access IE Blob for input number: ", index, e.what()); @@ -208,7 +208,7 @@ std::string OVInferRequest::GetInputTensorName(uint32_t index) { } } -void OVInferRequest::SetTensor(const std::string &name, OVTensorPtr& blob) { +void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { 
ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { From ac9c998c2fbded89f597cb5506a52579ad251688 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 16 Jan 2025 13:51:20 -0800 Subject: [PATCH 19/35] Fix for using so_share_ep_contexts without ep.context* flags --- .../providers/openvino/backend_manager.cc | 5 +++-- .../core/providers/openvino/contexts.h | 1 - .../openvino/openvino_execution_provider.cc | 19 ++++++++++++++----- .../openvino/openvino_provider_factory.cc | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d1564836cb247..589bee61e5200 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -92,8 +92,9 @@ BackendManager::BackendManager(SessionContext& session_context, auto& sw = shared_context_.shared_weights; if (session_context_.so_share_ep_contexts) { - std::filesystem::path weight_filename = session_context_.cache_dir.parent_path(); - if (sw.external_weight_filename.empty()) { + std::filesystem::path weight_filename = session_context_.onnx_model_path_name.parent_path(); + if (sw.external_weight_filename.empty() && !sw.metadata.empty()) { + // Reasonable assumption that all metadata entries have the same external file location sw.external_weight_filename = sw.metadata.begin()->second.location; } weight_filename /= sw.external_weight_filename; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 1e8b8fb1127ce..7945f96c51138 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -102,7 +102,6 @@ struct SessionContext : ProviderInfo { OVCore ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; - std::string onnx_model_name; std::filesystem::path 
onnx_model_path_name; int onnx_opset_version; bool use_api_2; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index d02a642699a82..a53bcf5cdbf6f 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -174,10 +174,13 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); - // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext - session_context_.onnx_model_path_name = fused_nodes[0].filtered_graph.get().ModelPath().string(); - session_context_.onnx_opset_version = - fused_nodes[0].filtered_graph.get().DomainToVersionMap().at(kOnnxDomain); + if (!fused_nodes.empty()) { + // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext + const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); + session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); + session_context_.onnx_opset_version = + graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + } struct OpenVINOEPFunctionState { AllocateFunc allocate_func = nullptr; @@ -242,10 +245,16 @@ common::Status OpenVINOExecutionProvider::Compile( } if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { + std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); + + // If cache_dir hasn't been set use the model path to dump files + if (metadata_name.empty()) { + metadata_name = session_context_.onnx_model_path_name.parent_path(); + } + // Metadata is generated only for shared contexts // If metadata is generated then only save it if also saving epcontext (so_context_enable) // If saving metadata then save it to the provided path - std::filesystem::path 
metadata_name = session_context_.cache_dir.parent_path(); metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; metadata_name.replace_extension("bin"); dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 7b0d6c6751120..06187573a7346 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -279,7 +279,7 @@ struct OpenVINO_Provider : Provider { std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); if (pi.so_context_enable && !so_context_file_path.empty()) { - pi.cache_dir = std::move(so_context_file_path); + pi.cache_dir = so_context_file_path; } // Append values to config to support weight-as-inputs conversion for shared contexts From 6512ec634673ea222c41a6e7c5fec6917db91567 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 16 Jan 2025 16:41:00 -0800 Subject: [PATCH 20/35] Add remote tensor support for NPU weight sharing --- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_utils.cc | 20 +++++++++++++++++-- .../core/providers/openvino/backend_utils.h | 4 +++- .../core/providers/openvino/contexts.h | 4 ++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 589bee61e5200..d7c55ad1ac84c 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,7 +104,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (!sw.mapped_weights) { sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(sw.metadata, sw.mapped_weights->weight_data); + backend_utils::CreateOVTensors(session_context_.device_type, session_context_.ie_core, sw.metadata, sw.mapped_weights->weight_data); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index e37254b34b9fd..576718c10481a 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -8,6 +8,7 @@ #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/pass/constant_folding.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" @@ -333,7 +334,10 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt } // Function to handle tensor creation from external data -void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { +void CreateOVTensors(const std::string& 
device_name, + OVCore& ov_core, + SharedContext::SharedWeights::Metadata::Map& metadata_map, + std::string_view weights) { for (auto& [key, value] : metadata_map) { if (value.tensor) continue; @@ -342,10 +346,22 @@ void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, // Get element data type auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; + ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type // Create OpenVINO Tensor - value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + if (device_name == "NPU") { + // Use remote tensors + auto npu_context = ov_core.Get().get_default_context("NPU").as(); + auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); + + // Copy data to remote tensor + std::memcpy(remote_tensor.data(), (void*)tensor_data, value.size); + value.tensor = std::make_shared(remote_tensor); + } else { + // Use vanilla tensors + value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + } ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index cfb6adc8fbd3d..4fb54507ad31c 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -67,7 +67,9 @@ CreateOVModel(const std::string model, const SubGraphContext& subgraph_context, std::map>& const_outputs_map); -void CreateOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map, +void CreateOVTensors(const std::string& device_name, + OVCore& ov_core, + SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights); void printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/contexts.h 
b/onnxruntime/core/providers/openvino/contexts.h index 7945f96c51138..d7b76f2a9e0de 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -24,7 +24,7 @@ struct SharedContext { std::string name; bool operator==(const Key&) const = default; }; - struct KeyHash { + struct Hash { std::size_t operator()(const Key& key) const noexcept { return std::hash()(key.name); } @@ -37,7 +37,7 @@ struct SharedContext { std::int32_t element_type; std::shared_ptr tensor; }; - using Map = std::unordered_map; + using Map = std::unordered_map; }; struct MappedWeights { From 5594817209d247ed150daa176738aee8829723fb Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Thu, 16 Jan 2025 20:46:38 -0800 Subject: [PATCH 21/35] Use a single ov::Core copy across OVEP --- .../providers/openvino/backend_manager.cc | 2 +- .../core/providers/openvino/backend_utils.cc | 5 +- .../core/providers/openvino/backend_utils.h | 1 - .../openvino/backends/basic_backend.cc | 28 +++++----- .../core/providers/openvino/contexts.h | 1 - .../openvino/openvino_execution_provider.cc | 9 ++-- .../openvino/openvino_execution_provider.h | 7 --- .../openvino/openvino_provider_factory.cc | 6 +-- .../core/providers/openvino/ov_interface.cc | 21 +++++--- .../core/providers/openvino/ov_interface.h | 52 +++++++++---------- 10 files changed, 61 insertions(+), 71 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d7c55ad1ac84c..5a4bf791b4760 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -104,7 +104,7 @@ BackendManager::BackendManager(SessionContext& session_context, if (!sw.mapped_weights) { sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(session_context_.device_type, session_context_.ie_core, sw.metadata, sw.mapped_weights->weight_data); + 
backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, sw.mapped_weights->weight_data); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 576718c10481a..05084fe8f838d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -80,7 +80,7 @@ CreateOVModel(const std::string model, std::cout << "CreateNgraphFunc" << std::endl; } try { - auto ov_model = session_context.ie_core.ReadModel(model, session_context.onnx_model_path_name.string()); + auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { @@ -335,7 +335,6 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt // Function to handle tensor creation from external data void CreateOVTensors(const std::string& device_name, - OVCore& ov_core, SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights) { for (auto& [key, value] : metadata_map) { @@ -352,7 +351,7 @@ void CreateOVTensors(const std::string& device_name, // Create OpenVINO Tensor if (device_name == "NPU") { // Use remote tensors - auto npu_context = ov_core.Get().get_default_context("NPU").as(); + auto npu_context = OVCore::Get().get_default_context("NPU").as(); auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); // Copy data to remote tensor diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 4fb54507ad31c..e27a6e277a1a3 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -68,7 +68,6 @@ CreateOVModel(const std::string model, std::map>& const_outputs_map); void CreateOVTensors(const std::string& 
device_name, - OVCore& ov_core, SharedContext::SharedWeights::Metadata::Map& metadata_map, std::string_view weights); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index a8026d710827b..6f8ab00956fef 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -62,9 +62,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Pre-requisite is provider_option "context" must be set #if defined(IO_BUFFER_ENABLED) cl_context ctx = static_cast(session_context_.context); - remote_context_ = new ov::intel_gpu::ocl::ClContext(session_context_.ie_core.Get(), ctx); + remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + exe_network_ = OVCore::ImportModel(*model_stream, remote_context_, subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed @@ -78,7 +78,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; - exe_network_ = session_context_.ie_core.CompileModel( + exe_network_ = OVCore::CompileModel( ov_model, remote_context_, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED @@ -88,7 +88,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (subgraph_context_.is_ep_ctx_graph) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob - exe_network_ = session_context_.ie_core.ImportModel(*model_stream, + exe_network_ = OVCore::ImportModel(*model_stream, hw_target, device_config, subgraph_context_.subgraph_name); @@ -102,12 +102,12 @@ 
BasicBackend::BasicBackend(std::unique_ptr& model_pr // Inputs with static dimenstions // Not enabled for models with external weights and when ep context is set. const std::string model = model_proto->SerializeAsString(); - exe_network_ = session_context_.ie_core.CompileModel(model, + exe_network_ = OVCore::CompileModel(model, hw_target, device_config, subgraph_context_.subgraph_name); - } else { // For all other types use ov::core read_model() to generate OV IR - // followed by ov::core compile_model() + } else { // For all other types use ov::ov_core read_model() to generate OV IR + // followed by ov::ov_core compile_model() std::shared_ptr ov_model; { const std::string model = model_proto->SerializeAsString(); @@ -116,7 +116,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); } - exe_network_ = session_context_.ie_core.CompileModel( + exe_network_ = OVCore::CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); } #endif @@ -196,7 +196,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { device_config.emplace(ov::device::properties("NPU", device_property)); #if (((OPENVINO_VERSION_MAJOR == 2024) && (OPENVINO_VERSION_MINOR > 3)) || (OPENVINO_VERSION_MAJOR > 2024)) if (session_context_.so_context_enable) { - session_context_.ie_core.Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); + OVCore::Get().set_property("NPU", ov::intel_npu::bypass_umd_caching(true)); } #endif } @@ -264,7 +264,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { continue; } if (is_supported_and_mutable(key, supported_properties)) { - session_context_.ie_core.Get().set_property(device, ov::AnyMap{{key, value}}); + OVCore::Get().set_property(device, ov::AnyMap{{key, value}}); } else { LOGS_DEFAULT(WARNING) << "WARNING: Property \"" << key << "\" is either unsupported in current OpenVINO version" @@ -284,14 
+284,14 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { for (const std::string& device : individual_devices) { if (target_config.count(device)) { // Get supported properties for each individual device - auto device_properties = session_context_.ie_core.Get().get_property(device, ov::supported_properties); + auto device_properties = OVCore::Get().get_property(device, ov::supported_properties); // Set properties for the device set_target_properties(device, target_config.at(device), device_properties); } } } else { if (target_config.count(session_context_.device_type)) { - auto supported_properties = session_context_.ie_core.Get().get_property(session_context_.device_type, + auto supported_properties = OVCore::Get().get_property(session_context_.device_type, ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); @@ -311,7 +311,7 @@ void BasicBackend::EnableCaching(ov::AnyMap& device_config) { device_property = std::make_pair("CACHE_DIR", session_context_.cache_dir); device_config.emplace(ov::device::properties("GPU", device_property)); } else { - session_context_.ie_core.SetCache(session_context_.cache_dir.string()); + OVCore::SetCache(session_context_.cache_dir.string()); } } } @@ -343,7 +343,7 @@ void BasicBackend::EnableStreams() { } // Do nothing } else { - session_context_.ie_core.SetStreams(session_context_.device_type, session_context_.num_streams); + OVCore::SetStreams(session_context_.device_type, session_context_.num_streams); } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index d7b76f2a9e0de..dc6f87520bfd3 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -100,7 +100,6 @@ struct ProviderInfo { struct SessionContext : ProviderInfo { SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} - OVCore 
ie_core; std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; int onnx_opset_version; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index a53bcf5cdbf6f..3fd6a70e2b7aa 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -56,8 +56,7 @@ void AdjustProviderInfo(ProviderInfo& info) { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); + std::vector available_devices = OVCore::GetAvailableDevices(); for (auto& device : available_devices) { if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { @@ -112,10 +111,10 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, S InitProviderOrtApi(); // to check if target device is available - // using ie_core capability GetAvailableDevices to fetch list of devices plugged in + // using OVCore capability GetAvailableDevices to fetch list of devices plugged in if (info.cache_dir.empty()) { bool device_found = false; - std::vector available_devices = session_context_.ie_core.GetAvailableDevices(); + std::vector available_devices = OVCore::GetAvailableDevices(); // Checking for device_type configuration if (info.device_type != "") { if (info.device_type.find("HETERO") != std::string::npos || @@ -269,7 +268,7 @@ std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() AllocatorCreationInfo npu_allocator_info{ [this](OrtDevice::DeviceId device_id) { return std::make_unique( - session_context_.ie_core.Get(), + OVCore::Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 95d7027fd70e3..294f4d6db54a4 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -18,13 +18,6 @@ namespace onnxruntime { namespace openvino_ep { -struct OVDevices { - ov::Core core; - std::vector get_ov_devices() const { - return core.get_available_devices(); - } -}; - static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; std::cout << "Specify the keyword HETERO (or) MULTI (or) AUTO followed by the devices in the order of priority " diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 06187573a7346..9bf3e8b040406 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -32,8 +32,7 @@ std::unique_ptr OpenVINOProviderFactory::CreateProvider() { struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { - openvino_ep::OVCore ie_core; - return ie_core.GetAvailableDevices(); + return OVCore::GetAvailableDevices(); } } g_info; @@ -58,8 +57,7 @@ struct OpenVINO_Provider : Provider { std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; - OVDevices devices; - std::vector available_devices = devices.get_ov_devices(); + std::vector available_devices = OVCore::GetAvailableDevices(); for (auto& device : available_devices) { if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 9b0e9c94c0f6e..6ce2d506211e7 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ 
b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -13,7 +13,8 @@ using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -const std::string log_tag = "[OpenVINO-EP] "; +static const std::string log_tag = "[OpenVINO-EP] "; +static ov::Core g_core; #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -46,7 +47,7 @@ void printDebugInfo(const ov::CompiledModel& obj) { } #endif -std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) const { +std::shared_ptr OVCore::ReadModel(const std::string& model, const std::string& model_path) { try { std::istringstream modelStringStream(model); std::istream& modelStream = modelStringStream; @@ -77,7 +78,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(ie_cnn_network, hw_target, device_config); + obj = g_core.compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -96,7 +97,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = g_core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -115,7 +116,7 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string name) { try { ov::CompiledModel obj; - obj = oe.import_model(model_stream, hw_target, device_config); + obj = g_core.import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -129,7 +130,11 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, } void OVCore::SetCache(const std::string& cache_dir_path) { - oe.set_property(ov::cache_dir(cache_dir_path)); + g_core.set_property(ov::cache_dir(cache_dir_path)); +} + +ov::Core& OVCore::Get() { + return g_core; } #ifdef 
IO_BUFFER_ENABLED @@ -165,12 +170,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = oe.get_available_devices(); + auto available_devices = g_core.get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - oe.set_property(device_type, {ov::num_streams(num_streams)}); + g_core.set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 5d88994dbabb0..a2547ada60f34 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -37,39 +37,37 @@ typedef ov::intel_gpu::ocl::ClContext* OVRemoteContextPtr; typedef ov::RemoteContext OVRemoteContext; #endif -class OVCore { - ov::Core oe; - - public: +struct OVCore { // OV Interface For Reading Model - std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); + // OV Interface for Compiling OV Model Type - OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Fast Compile - OVExeNetwork CompileModel(const std::string& onnx_model, - std::string& hw_target, - ov::AnyMap& device_config, - const std::string& name); + static OVExeNetwork CompileModel(const std::string& onnx_model, + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::istream& 
model_stream, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name); + static OVExeNetwork ImportModel(std::istream& model_stream, + std::string hw_target, + const ov::AnyMap& device_config, + std::string name); #ifdef IO_BUFFER_ENABLED - OVExeNetwork CompileModel(std::shared_ptr& model, - OVRemoteContextPtr context, - std::string name); - OVExeNetwork ImportModel(std::shared_ptr model_stream, - OVRemoteContextPtr context, - std::string name); + static OVExeNetwork CompileModel(std::shared_ptr& model, + OVRemoteContextPtr context, + std::string name); + static OVExeNetwork ImportModel(std::shared_ptr model_stream, + OVRemoteContextPtr context, + std::string name); #endif - std::vector GetAvailableDevices(); - void SetCache(const std::string& cache_dir_path); - ov::Core& Get() { return oe; } - void SetStreams(const std::string& device_type, int num_streams); + static std::vector GetAvailableDevices(); + static void SetCache(const std::string& cache_dir_path); + static ov::Core& Get(); + static void SetStreams(const std::string& device_type, int num_streams); }; class OVExeNetwork { From f85c7b5c9dca9c9170c60473b8367e7aed909fc6 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 16 Jan 2025 21:18:35 -0800 Subject: [PATCH 22/35] Decouple provider option cache_dir from session option ep.context_file_path --- .../core/providers/openvino/backend_manager.cc | 9 +++------ onnxruntime/core/providers/openvino/contexts.h | 1 + .../openvino/openvino_execution_provider.cc | 16 +++++++--------- .../openvino/openvino_provider_factory.cc | 6 +----- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 5a4bf791b4760..858d7fb3f0298 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -232,12 +232,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie } } else { // External blob - std::filesystem::path blob_filename; - // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability - if (!session_context_.cache_dir.empty()) { - blob_filename = session_context_.cache_dir; - } else { - blob_filename = graph_body_viewer.ModelPath(); + std::filesystem::path blob_filename = session_context_.so_context_file_path; + if (blob_filename.empty()) { + blob_filename = session_context_.onnx_model_path_name; } const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; blob_filename = blob_filename.parent_path() / name; diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index dc6f87520bfd3..4f3b22236ae0f 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -94,6 +94,7 @@ struct ProviderInfo { bool so_disable_cpu_ep_fallback{false}; // ORT session option bool so_context_embed_mode{false}; // ORT session option bool so_share_ep_contexts{false}; // ORT session option + fs::path so_context_file_path{}; // ORT session option 
}; // Holds context applicable to the entire EP instance. diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 3fd6a70e2b7aa..bc06e2e8e9a70 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -243,19 +243,17 @@ common::Status OpenVINOExecutionProvider::Compile( } } - if (session_context_.so_share_ep_contexts && session_context_.so_context_enable && !session_context_.cache_dir.empty()) { - std::filesystem::path metadata_name = session_context_.cache_dir.parent_path(); - - // If cache_dir hasn't been set use the model path to dump files + if (session_context_.so_share_ep_contexts) { + auto metadata_name = session_context_.so_context_file_path.parent_path(); if (metadata_name.empty()) { - metadata_name = session_context_.onnx_model_path_name.parent_path(); + metadata_name = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + } else { + metadata_name /= metadata_name.stem().string() + "_metadata"; + metadata_name.replace_extension("bin"); } // Metadata is generated only for shared contexts - // If metadata is generated then only save it if also saving epcontext (so_context_enable) - // If saving metadata then save it to the provided path - metadata_name /= session_context_.cache_dir.stem().string() + "_metadata"; - metadata_name.replace_extension("bin"); + // If saving metadata then save it to the provided path or use the original model path dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); } diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 9bf3e8b040406..86804b8961cac 100644 --- 
b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -274,11 +274,7 @@ struct OpenVINO_Provider : Provider { pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - std::string so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").data(); - - if (pi.so_context_enable && !so_context_file_path.empty()) { - pi.cache_dir = so_context_file_path; - } + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); // Append values to config to support weight-as-inputs conversion for shared contexts if (pi.so_share_ep_contexts) { From f25f72c36a26b604b4671fd6af23da7bcb19b377 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Fri, 17 Jan 2025 00:43:41 -0800 Subject: [PATCH 23/35] Add support for serialization and deserialization of metadata to disk --- .../core/providers/openvino/backend_utils.cc | 66 +++++++++++++++++++ .../openvino/backends/basic_backend.cc | 18 ++--- .../core/providers/openvino/contexts.h | 7 +- .../openvino/openvino_execution_provider.cc | 29 +++++--- .../qdq_transformations/qdq_stripping.cc | 50 -------------- 5 files changed, 100 insertions(+), 70 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 05084fe8f838d..e5a335bd0bfdd 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -53,6 +53,72 @@ SharedContext::SharedWeights::MappedWeights::~MappedWeights() { } } +std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { + try { + stream << metadata.size(); + + // Write each 
key-value pair + // Put elements in separate lines to facilitate reading + for (const auto& [key, value] : metadata) { + stream << std::endl + << key.name; + stream << std::endl + << value.location; + stream << std::endl + << value.data_offset; + stream << std::endl + << value.size; + stream << std::endl + << value.dimensions.size(); + for (const auto& dim : value.dimensions) { + stream << std::endl + << dim; + } + stream << std::endl + << value.element_type; + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to write map data.", e.what()); + } catch (...) { + ORT_THROW("Error: Failed to write map data."); + } + + ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); + return stream; +} + +std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { + size_t map_size{0}; + try { + stream >> map_size; + + while (!stream.eof()) { + SharedContext::SharedWeights::Metadata::Key key; + SharedContext::SharedWeights::Metadata::Value value; + stream >> key.name; + stream >> value.location; + stream >> value.data_offset; + stream >> value.size; + size_t num_dimensions; + stream >> num_dimensions; + value.dimensions.resize(num_dimensions); + for (auto& dim : value.dimensions) { + stream >> dim; + } + stream >> value.element_type; + metadata.emplace(key, value); + } + } catch (const Exception& e) { + ORT_THROW("Error: Failed to read map data.", e.what()); + } catch (...) 
{ + ORT_THROW("Error: Failed to read map data."); + } + + ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); + + return stream; +} + namespace backend_utils { bool IsDebugEnabled() { diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6f8ab00956fef..a730c0b59628b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -65,8 +65,8 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr remote_context_ = new ov::intel_gpu::ocl::ClContext(OVCore::Get(), ctx); if (subgraph_context_.is_ep_ctx_graph) { exe_network_ = OVCore::ImportModel(*model_stream, - remote_context_, - subgraph_context_.subgraph_name); + remote_context_, + subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed } else { std::shared_ptr ov_model; @@ -89,9 +89,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = OVCore::ImportModel(*model_stream, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed } else if (!subgraph_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && @@ -103,9 +103,9 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr // Not enabled for models with external weights and when ep context is set. 
const std::string model = model_proto->SerializeAsString(); exe_network_ = OVCore::CompileModel(model, - hw_target, - device_config, - subgraph_context_.subgraph_name); + hw_target, + device_config, + subgraph_context_.subgraph_name); } else { // For all other types use ov::ov_core read_model() to generate OV IR // followed by ov::ov_core compile_model() std::shared_ptr ov_model; @@ -292,7 +292,7 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } else { if (target_config.count(session_context_.device_type)) { auto supported_properties = OVCore::Get().get_property(session_context_.device_type, - ov::supported_properties); + ov::supported_properties); set_target_properties(session_context_.device_type, target_config.at(session_context_.device_type), supported_properties); } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 4f3b22236ae0f..c861446e5cd08 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -38,6 +38,8 @@ struct SharedContext { std::shared_ptr tensor; }; using Map = std::unordered_map; + friend std::ostream& operator<<(std::ostream& right, const Metadata::Map& metadata); + friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); }; struct MappedWeights { @@ -103,9 +105,8 @@ struct SessionContext : ProviderInfo { std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; - int onnx_opset_version; - bool use_api_2; - const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; + uint32_t onnx_opset_version{0}; + const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index bc06e2e8e9a70..05a54c2a328bb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -181,6 +181,17 @@ common::Status OpenVINOExecutionProvider::Compile( graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); } + // Temporary code to read metadata before it moves to the .bin + auto& metadata = shared_context_.shared_weights.metadata; + if (session_context_.so_share_ep_contexts && metadata.empty()) { + // Metadata is always read from model location, this could be a source or epctx model + fs::path metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + std::ifstream file(metadata_filename, std::ios::binary); + if (file) { + file >> metadata; + } + } + struct OpenVINOEPFunctionState { AllocateFunc allocate_func = nullptr; DestroyFunc destroy_func = nullptr; @@ -194,8 +205,6 @@ common::Status OpenVINOExecutionProvider::Compile( NodeComputeInfo compute_info; - session_context_.use_api_2 = true; - // During backend creation, we check if user wants to use precompiled blob onnx model or the original model // For precompiled blob, directly load the model instead of compiling the model // For original model, check if the user wants to export a model with pre-compiled blob @@ -244,17 +253,21 @@ common::Status OpenVINOExecutionProvider::Compile( } if (session_context_.so_share_ep_contexts) { - auto metadata_name = session_context_.so_context_file_path.parent_path(); - if (metadata_name.empty()) { - metadata_name = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; + fs::path metadata_filename; + if (session_context_.so_context_file_path.empty()) { + metadata_filename = session_context_.onnx_model_path_name.parent_path() / "metadata.bin"; } else { - metadata_name /= metadata_name.stem().string() + "_metadata"; - 
metadata_name.replace_extension("bin"); + metadata_filename = session_context_.so_context_file_path.parent_path() / "metadata.bin"; } // Metadata is generated only for shared contexts // If saving metadata then save it to the provided path or use the original model path - dumpMetaDataMapToBinary(shared_context_.shared_weights.metadata, metadata_name.string()); + // Multiple calls to Compile() will update the metadata and for the last call + // the resulting file will contain the aggregated content + std::ofstream file(metadata_filename, std::ios::binary); + if (file) { + file << metadata; + } } return status; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 019e121b4f575..902dab8c04ed0 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -667,56 +667,6 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } -template -bool writeScalar(std::ofstream& outfile, const T& scalar) { - auto size = sizeof(T); - outfile.write(reinterpret_cast(&size), sizeof(size)); - if (!outfile.good()) return false; - - outfile.write(reinterpret_cast(&scalar), size); - return outfile.good(); -} - -template <> -bool writeScalar(std::ofstream& outfile, const std::string& text) { - auto size = text.size() * sizeof(std::string::value_type); - outfile.write(reinterpret_cast(&size), size); - if (!outfile.good()) return false; - - outfile.write(text.data(), size); - return outfile.good(); -} - -// Main function to dump the map to a binary file -bool dumpMetaDataMapToBinary(const sw::Metadata::Map& metadata, const std::string& filename) { - std::ofstream outfile(filename, std::ios::binary); - if (!outfile.is_open()) { - ORT_THROW("Error: Could not open file for writing metadata."); - return false; - } - - // Write the size of the map - size_t map_size 
= metadata.size(); - outfile.write(reinterpret_cast(&map_size), sizeof(map_size)); - if (!outfile.good()) { - ORT_THROW("Error: Failed to write map size."); - return false; - } - - // Write each key-value pair - for (const auto& [key, value] : metadata) { - bool result = true; - result &= writeScalar(outfile, key.name); - result &= writeScalar(outfile, value.location); - result &= writeScalar(outfile, value.data_offset); - result &= writeScalar(outfile, value.size); - - ORT_ENFORCE(result, "Error: Failed to write map data."); - } - - return true; -} - // Creates a new model without the DQ/Q operators in the src graph. Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, From 225b6787d3a90a25150526bbd49037f25620e010 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 21 Jan 2025 13:45:21 -0800 Subject: [PATCH 24/35] Load blobs from relative path stored in ep_cache_context --- onnxruntime/core/providers/openvino/backend_manager.cc | 2 +- onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 858d7fb3f0298..a6efe9bc41e07 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -245,7 +245,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie ORT_THROW("Unable to open file for epctx model dump."); } compiled_model.export_model(blob_file); - model_blob_str = blob_filename.string(); + model_blob_str = blob_filename.filename().string(); } ORT_RETURN_IF_ERROR(ep_ctx_handle_.AddOVEPCtxNodeToGraph(graph_body_viewer, diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 1c6b0a0467836..9c55614633b82 100644 --- 
a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -115,7 +115,9 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const GraphViewer if (embed_mode) { result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - result.reset((std::istream*)new std::ifstream(ep_cache_context, std::ios_base::binary | std::ios_base::in)); + const auto& blob_filepath = graph_viewer.ModelPath().parent_path() / ep_cache_context; + ORT_ENFORCE(std::filesystem::exists(blob_filepath), "Blob file not found: ", blob_filepath.string()); + result.reset((std::istream*)new std::ifstream(blob_filepath, std::ios_base::binary | std::ios_base::in)); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; return result; From 532401162add1ad7fadc2b71e398cd64ce6c5844 Mon Sep 17 00:00:00 2001 From: "Javier E. Martinez" Date: Tue, 21 Jan 2025 13:46:29 -0800 Subject: [PATCH 25/35] Use remote L0 tensors for shared weights --- onnxruntime/core/providers/openvino/backend_utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index e5a335bd0bfdd..1aa4565671c53 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -418,10 +418,10 @@ void CreateOVTensors(const std::string& device_name, if (device_name == "NPU") { // Use remote tensors auto npu_context = OVCore::Get().get_default_context("NPU").as(); - auto&& remote_tensor = npu_context.create_host_tensor(ov_elementType, value.dimensions); + auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); // Copy data to remote tensor - std::memcpy(remote_tensor.data(), (void*)tensor_data, value.size); + std::memcpy(remote_tensor.get(), (void*)tensor_data, value.size); 
value.tensor = std::make_shared(remote_tensor); } else { // Use vanilla tensors From 241cfae6bb9fb10ff58b49deac02df9f8fa61d3c Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Tue, 21 Jan 2025 11:23:25 +0000 Subject: [PATCH 26/35] fix linux ci issues --- .../providers/openvino/backend_manager.cc | 3 +- .../core/providers/openvino/backend_utils.cc | 55 +++++++++++++++++++ .../core/providers/openvino/contexts.h | 3 +- .../openvino/openvino_execution_provider.cc | 12 +++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index a6efe9bc41e07..d913daf14d6fe 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -236,7 +236,8 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (blob_filename.empty()) { blob_filename = session_context_.onnx_model_path_name; } - const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + // const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; + const auto name = graph_body_viewer.ModelPath().stem().string() + "_" + subgraph_context_.subgraph_name; blob_filename = blob_filename.parent_path() / name; blob_filename.replace_extension("blob"); std::ofstream blob_file(blob_filename, diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 1aa4565671c53..ffe238a48ba7f 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -6,6 +6,14 @@ #include #include +#include // For open +#include // For mmap, munmap +#include // For fstat +#include // For close +#include +#include +#include + #include "openvino/pass/convert_fp32_to_fp16.hpp" #include 
"openvino/pass/constant_folding.hpp" #include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" @@ -13,13 +21,16 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" +#ifdef _WIN32 #include "Windows.h" +#endif using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { +#ifdef _WIN32 SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { file_ = CreateFile(filename.string().data(), GENERIC_READ, @@ -52,6 +63,50 @@ SharedContext::SharedWeights::MappedWeights::~MappedWeights() { file_ = nullptr; } } +#else +SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) + : file_(nullptr), mapping_(nullptr) { + // Open the file + int fd = open(filename.c_str(), O_RDONLY); + if (fd == -1) { + ORT_THROW("Unable to open weight file at " + filename.string()); + } + + // Get file size + struct stat file_stat; + if (fstat(fd, &file_stat) == -1) { + close(fd); + ORT_THROW("Unable to get file size for " + filename.string()); + } + size_t file_size = file_stat.st_size; + + // Map the file into memory + void* raw_data = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (raw_data == MAP_FAILED) { + close(fd); + ORT_THROW("Unable to map weight file at " + filename.string()); + } + + // Set class members + file_ = reinterpret_cast(fd); // Store file descriptor + mapping_ = raw_data; // Store mapping address + weight_data = std::string_view(static_cast(raw_data), file_size); + + // Close the file descriptor, as mmap does not need it open + close(fd); +} + +SharedContext::SharedWeights::MappedWeights::~MappedWeights() { + // Unmap memory if it was mapped + if (mapping_ != nullptr) { + munmap(mapping_, weight_data.size()); + mapping_ = nullptr; + } + + // Clear the file descriptor, though it was already closed after mmap + file_ = nullptr; +} +#endif std::ostream& operator<<(std::ostream& stream, const 
SharedContext::SharedWeights::Metadata::Map& metadata) { try { diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index c861446e5cd08..786be49954e65 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -107,7 +107,8 @@ struct SessionContext : ProviderInfo { std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); + // const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); + const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; // Holds context specific to subgraph. diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 05a54c2a328bb..b2be8176e315d 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,7 +5,7 @@ #include #include #include -#include +// #include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -23,6 +23,7 @@ namespace onnxruntime { namespace openvino_ep { // Parking this code here for now before it's moved to the factory +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO static std::vector parseDevices(const std::string& device_string, const std::vector& available_devices) { std::string comma_separated_devices = device_string; @@ -50,6 +51,7 @@ static std::vector parseDevices(const std::string& device_string, } return 
devices; } +#endif // Parking this code here for now before it's moved to the factory void AdjustProviderInfo(ProviderInfo& info) { @@ -93,10 +95,14 @@ void AdjustProviderInfo(ProviderInfo& info) { #endif } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { info.device_type = std::move(info.device_type); - } else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { + } +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { std::ignore = parseDevices(info.device_type, available_devices); info.device_type = std::move(info.device_type); - } else { + } +#endif + else { ORT_THROW("Invalid device string: " + info.device_type); } LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" From 16ddb42926a5891a733e74d00ea0fb059f1a815f Mon Sep 17 00:00:00 2001 From: saurabhkale17 Date: Wed, 22 Jan 2025 20:27:00 +0530 Subject: [PATCH 27/35] fix ci issues --- onnxruntime/core/providers/openvino/backend_manager.cc | 1 - onnxruntime/core/providers/openvino/contexts.h | 1 - .../core/providers/openvino/openvino_execution_provider.cc | 1 - 3 files changed, 3 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index d913daf14d6fe..4479399f22790 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -236,7 +236,6 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (blob_filename.empty()) { blob_filename = session_context_.onnx_model_path_name; } - // const auto name{std::format("{}_{}", graph_body_viewer.ModelPath().stem().string(), subgraph_context_.subgraph_name)}; const auto name = 
graph_body_viewer.ModelPath().stem().string() + "_" + subgraph_context_.subgraph_name; blob_filename = blob_filename.parent_path() / name; blob_filename.replace_extension("blob"); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 786be49954e65..b76e6d7657a45 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -107,7 +107,6 @@ struct SessionContext : ProviderInfo { std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; - // const std::string openvino_sdk_version = std::format("{}.{}", OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR); const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index b2be8176e315d..ae9b347b26c16 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -5,7 +5,6 @@ #include #include #include -// #include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" From 10e851b31ce2888e6658bb0d2a591afad80dbfe5 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Wed, 22 Jan 2025 21:32:55 -0800 Subject: [PATCH 28/35] Fix Windows build failure --- onnxruntime/core/providers/openvino/backend_utils.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index ffe238a48ba7f..ecfd2d03dfa6d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -6,11 +6,6 @@ #include #include -#include // For open -#include // For mmap, munmap -#include // For fstat -#include // For close -#include #include #include @@ -23,6 +18,11 @@ #ifdef _WIN32 #include "Windows.h" +#else +#include // For open +#include // For mmap, munmap +#include // For fstat +#include // For close #endif using Exception = ov::Exception; From 6f7782ca36069fa460200a06d5c3bbb7dafb2d5f Mon Sep 17 00:00:00 2001 From: Eric Crawford Date: Wed, 22 Jan 2025 17:01:13 -0800 Subject: [PATCH 29/35] Use ifstream to load weights instead of mmaped file --- .../providers/openvino/backend_manager.cc | 4 +- .../core/providers/openvino/backend_utils.cc | 102 +++--------------- .../core/providers/openvino/backend_utils.h | 2 +- .../core/providers/openvino/contexts.h | 17 ++- 4 files changed, 25 insertions(+), 100 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 4479399f22790..dc4a1cf2b4ce9 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -102,9 +102,9 @@ BackendManager::BackendManager(SessionContext& session_context, if (weight_file) { if (!sw.mapped_weights) { - sw.mapped_weights = std::make_unique(weight_filename); + sw.mapped_weights = std::make_unique(weight_filename); } - backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, sw.mapped_weights->weight_data); + 
backend_utils::CreateOVTensors(session_context_.device_type, sw.metadata, *sw.mapped_weights); } } diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index ecfd2d03dfa6d..90e5fd92517f8 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,6 +1,5 @@ // Copyright (C) Intel Corporation // Licensed under the MIT License - #include #include #include @@ -16,97 +15,26 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/ov_interface.h" -#ifdef _WIN32 -#include "Windows.h" -#else -#include // For open -#include // For mmap, munmap -#include // For fstat -#include // For close -#endif using Exception = ov::Exception; namespace onnxruntime { namespace openvino_ep { -#ifdef _WIN32 -SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) { - file_ = CreateFile(filename.string().data(), - GENERIC_READ, - FILE_SHARE_READ, - 0, - OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, - 0); - ORT_ENFORCE(file_ != nullptr, "Unable to open weight file at ", filename.string()); - - mapping_ = CreateFileMapping(file_, 0, PAGE_READONLY, 0, 0, 0); - ORT_ENFORCE(mapping_ != nullptr, "Unable to create mapping of weight file at ", filename.string()); - - const char* raw_data = static_cast(MapViewOfFile(mapping_, FILE_MAP_READ, 0, 0, 0)); - ORT_ENFORCE(raw_data != nullptr, "Unable to map weight file at ", filename.string()); - - weight_data = std::string_view(raw_data, std::filesystem::file_size(filename)); -} - -SharedContext::SharedWeights::MappedWeights::~MappedWeights() { - if (!weight_data.empty()) { - UnmapViewOfFile(weight_data.data()); - } - if (mapping_ != nullptr) { - CloseHandle(mapping_); - mapping_ = nullptr; - } - if (file_ != nullptr) { - CloseHandle(file_); - file_ = nullptr; +SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : 
file_(filename, std::ios::in | std::ios::binary) { + try { + file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); + weights_size_ = file_.seekg(0, std::ios::end).tellg(); + } catch (std::ifstream::failure& e) { + ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); } } -#else -SharedContext::SharedWeights::MappedWeights::MappedWeights(std::filesystem::path filename) - : file_(nullptr), mapping_(nullptr) { - // Open the file - int fd = open(filename.c_str(), O_RDONLY); - if (fd == -1) { - ORT_THROW("Unable to open weight file at " + filename.string()); - } - - // Get file size - struct stat file_stat; - if (fstat(fd, &file_stat) == -1) { - close(fd); - ORT_THROW("Unable to get file size for " + filename.string()); - } - size_t file_size = file_stat.st_size; - - // Map the file into memory - void* raw_data = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0); - if (raw_data == MAP_FAILED) { - close(fd); - ORT_THROW("Unable to map weight file at " + filename.string()); - } - - // Set class members - file_ = reinterpret_cast(fd); // Store file descriptor - mapping_ = raw_data; // Store mapping address - weight_data = std::string_view(static_cast(raw_data), file_size); - - // Close the file descriptor, as mmap does not need it open - close(fd); -} - -SharedContext::SharedWeights::MappedWeights::~MappedWeights() { - // Unmap memory if it was mapped - if (mapping_ != nullptr) { - munmap(mapping_, weight_data.size()); - mapping_ = nullptr; - } - // Clear the file descriptor, though it was already closed after mmap - file_ = nullptr; +void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { + ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); + file_.seekg(file_offset); + file_.read(reinterpret_cast(data), size); } -#endif std::ostream& operator<<(std::ostream& stream, const 
SharedContext::SharedWeights::Metadata::Map& metadata) { try { @@ -457,13 +385,10 @@ ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt // Function to handle tensor creation from external data void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, - std::string_view weights) { + SharedContext::SharedWeights::WeightsFile &weights) { for (auto& [key, value] : metadata_map) { if (value.tensor) continue; - // Get tensor data - const auto* tensor_data = weights.data() + value.data_offset; - // Get element data type auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; @@ -476,11 +401,12 @@ void CreateOVTensors(const std::string& device_name, auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); // Copy data to remote tensor - std::memcpy(remote_tensor.get(), (void*)tensor_data, value.size); + weights.load_weights(value.data_offset, remote_tensor.get(), value.size); value.tensor = std::make_shared(remote_tensor); } else { // Use vanilla tensors - value.tensor = std::make_shared(ov_elementType, value.dimensions, (void*)tensor_data); + value.tensor = std::make_shared(ov_elementType, value.dimensions); + weights.load_weights(value.data_offset, value.tensor->data(), value.size); } ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); } diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index e27a6e277a1a3..d406daa4e24e4 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -69,7 +69,7 @@ CreateOVModel(const std::string model, void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, - std::string_view weights); + SharedContext::SharedWeights::WeightsFile& 
weights); void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index b76e6d7657a45..a0462e5be35f3 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -42,21 +42,20 @@ struct SharedContext { friend std::istream& operator>>(std::istream& right, Metadata::Map& metadata); }; - struct MappedWeights { - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MappedWeights); - ~MappedWeights(); - MappedWeights() = delete; - explicit MappedWeights(std::filesystem::path filename); + struct WeightsFile { + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightsFile); + WeightsFile() = delete; + explicit WeightsFile(std::filesystem::path filename); - std::string_view weight_data; + void load_weights(size_t file_offset, void* data, size_t size); private: - void* file_; - void* mapping_; + std::ifstream file_; + size_t weights_size_; }; fs::path external_weight_filename; - std::unique_ptr mapped_weights; + std::unique_ptr mapped_weights; Metadata::Map metadata; } shared_weights; }; From f3e4e078aadedffc33ce209dc9768d4863e4fe9e Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Thu, 23 Jan 2025 15:48:24 -0800 Subject: [PATCH 30/35] Fix for epctx models made up entirely of OVEP epctx nodes --- .../core/providers/openvino/ov_versions/capability.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index b9f01cc261f52..cb538c84441fa 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -95,6 +95,12 @@ std::vector> GetCapability::Execute() { } } + // If all the nodes have been accounted for then no more processing is needed + if (result.size() == nodes.size()) { + is_wholly_supported_graph_ = true; + return result; + } + // This is a list of initializers that nGraph considers as constants. Example weights, reshape shape etc. std::unordered_set ng_required_initializers; From 1d4c16ef2d39cbab9a7a2e0a907d5fe063d379cb Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 12:01:23 -0800 Subject: [PATCH 31/35] Limit ov::Core lifetime to that of provider object --- .../openvino/openvino_provider_factory.cc | 20 +++++++------ .../core/providers/openvino/ov_interface.cc | 29 +++++++++++++------ .../core/providers/openvino/ov_interface.h | 5 +++- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 86804b8961cac..a80c250a75bf7 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -19,25 +19,23 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { ~OpenVINOProviderFactory() override {} - std::unique_ptr CreateProvider() override; + std::unique_ptr CreateProvider() override { + return std::make_unique(provider_info_, shared_context_); + } private: ProviderInfo provider_info_; SharedContext& shared_context_; }; -std::unique_ptr OpenVINOProviderFactory::CreateProvider() { - return std::make_unique(provider_info_, shared_context_); -} - struct ProviderInfo_OpenVINO_Impl : ProviderInfo_OpenVINO { std::vector GetAvailableDevices() const override { return OVCore::GetAvailableDevices(); } -} g_info; +}; struct OpenVINO_Provider : Provider { - void* GetInfo() override { return &g_info; } + void* GetInfo() override { return &info_; } std::shared_ptr CreateExecutionProviderFactory(const void* void_params) override { // Extract the void_params into ProviderOptions and ConfigOptions @@ -287,14 +285,17 @@ struct OpenVINO_Provider : Provider { } void Initialize() override { + OVCore::Initialize(); } void Shutdown() override { + OVCore::Teardown(); } private: SharedContext shared_context_; -} g_provider; + ProviderInfo_OpenVINO_Impl info_; +}; // OpenVINO_Provider } // namespace openvino_ep } // namespace onnxruntime @@ -302,6 +303,7 @@ struct OpenVINO_Provider : 
Provider { extern "C" { ORT_API(onnxruntime::Provider*, GetProvider) { - return &onnxruntime::openvino_ep::g_provider; + static onnxruntime::openvino_ep::OpenVINO_Provider g_provider; + return &g_provider; } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 6ce2d506211e7..e12a560809519 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -14,7 +14,17 @@ namespace onnxruntime { namespace openvino_ep { static const std::string log_tag = "[OpenVINO-EP] "; -static ov::Core g_core; +static std::unique_ptr g_core; + +void OVCore::Initialize() +{ + g_core = std::make_unique(); +} + +void OVCore::Teardown() +{ + g_core.reset(); +} #ifndef NDEBUG void printDebugInfo(const ov::CompiledModel& obj) { @@ -78,7 +88,7 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo const std::string& name) { ov::CompiledModel obj; try { - obj = g_core.compile_model(ie_cnn_network, hw_target, device_config); + obj = Get().compile_model(ie_cnn_network, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -97,7 +107,7 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, const std::string& name) { ov::CompiledModel obj; try { - obj = g_core.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); + obj = Get().compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -116,7 +126,7 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, std::string name) { try { ov::CompiledModel obj; - obj = g_core.import_model(model_stream, hw_target, device_config); + obj = Get().import_model(model_stream, hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -130,11 +140,12 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream, } void OVCore::SetCache(const std::string& cache_dir_path) { - 
g_core.set_property(ov::cache_dir(cache_dir_path)); + Get().set_property(ov::cache_dir(cache_dir_path)); } -ov::Core& OVCore::Get() { - return g_core; +inline ov::Core& OVCore::Get() { + ORT_ENFORCE(g_core); + return *g_core; } #ifdef IO_BUFFER_ENABLED @@ -170,12 +181,12 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea #endif std::vector OVCore::GetAvailableDevices() { - auto available_devices = g_core.get_available_devices(); + auto available_devices = Get().get_available_devices(); return available_devices; } void OVCore::SetStreams(const std::string& device_type, int num_streams) { - g_core.set_property(device_type, {ov::num_streams(num_streams)}); + Get().set_property(device_type, {ov::num_streams(num_streams)}); } OVInferRequest OVExeNetwork::CreateInferRequest() { diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index a2547ada60f34..53b814094438e 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -38,6 +38,9 @@ typedef ov::RemoteContext OVRemoteContext; #endif struct OVCore { + static void Initialize(); + static void Teardown(); + // OV Interface For Reading Model static std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path); @@ -66,7 +69,7 @@ struct OVCore { #endif static std::vector GetAvailableDevices(); static void SetCache(const std::string& cache_dir_path); - static ov::Core& Get(); + inline static ov::Core& Get(); static void SetStreams(const std::string& device_type, int num_streams); }; From 6ee1e162bb103b19ca09dd88b54514812dbb90e5 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 13:24:27 -0800 Subject: [PATCH 32/35] Enforce shared tensors cleanup on shutdown --- .../core/providers/openvino/backend_manager.cc | 3 +++ .../core/providers/openvino/backend_utils.cc | 9 +++++++++ onnxruntime/core/providers/openvino/backend_utils.h | 1 + .../providers/openvino/backends/backend_factory.cc | 6 ++++++ onnxruntime/core/providers/openvino/ibackend.h | 1 + .../openvino/openvino_execution_provider.cc | 13 ++++++++++--- .../openvino/openvino_execution_provider.h | 2 +- .../providers/openvino/openvino_provider_factory.cc | 2 ++ 8 files changed, 33 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index dc4a1cf2b4ce9..3ae4677a86375 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -544,6 +544,9 @@ void BackendManager::Compute(OrtKernelContext* context) { } void BackendManager::ShutdownBackendManager() { + backend_map_.clear(); + BackendFactory::DestroyBackend(concrete_backend_.get()); + concrete_backend_.reset(); } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 90e5fd92517f8..dfc094267f905 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -412,6 +412,15 @@ void CreateOVTensors(const std::string& device_name, } } +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) { + for (auto& [key, value] : metadata_map) { + if (value.tensor) { + value.tensor.reset(); + } + } + metadata_map.clear(); +} + } // namespace backend_utils } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index d406daa4e24e4..06fdfe9cd5eca 100644 
--- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -70,6 +70,7 @@ CreateOVModel(const std::string model, void CreateOVTensors(const std::string& device_name, SharedContext::SharedWeights::Metadata::Map& metadata_map, SharedContext::SharedWeights::WeightsFile& weights); +void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map); void printPerformanceCounts(const std::vector& performanceMap, std::ostream& stream, std::string deviceName); diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 99955da539ae7..78c38ba882512 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -33,5 +33,11 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); } } + +void BackendFactory::DestroyBackend(IBackend* backend) { + BasicBackend* backend_ptr = (BasicBackend*)backend; + delete backend_ptr; + backend_ptr = nullptr; +} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index d2f91cacb6c4d..2e01dc00faa6a 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -26,6 +26,7 @@ class BackendFactory { const SubGraphContext& subgraph_context, SharedContext& shared_context, ptr_stream_t& model_stream); + static void DestroyBackend(IBackend* backend); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index ae9b347b26c16..68ee37097cc84 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ 
b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -95,11 +95,11 @@ void AdjustProviderInfo(ProviderInfo& info) { } else if (ov_supported_device_types.find(info.device_type) != ov_supported_device_types.end()) { info.device_type = std::move(info.device_type); } -#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO - else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { +#if defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + else if (info.device_type.find("HETERO") == 0 || info.device_type.find("MULTI") == 0 || info.device_type.find("AUTO") == 0) { std::ignore = parseDevices(info.device_type, available_devices); info.device_type = std::move(info.device_type); - } + } #endif else { ORT_THROW("Invalid device string: " + info.device_type); @@ -153,6 +153,13 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const ProviderInfo& info, S } } +OpenVINOExecutionProvider::~OpenVINOExecutionProvider() { + for (auto& backend_manager : backend_managers_) { + backend_manager.ShutdownBackendManager(); + } + backend_managers_.clear(); +} + std::vector> OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 294f4d6db54a4..75f4ef9f8ecc8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -46,7 +46,7 @@ static std::vector split(const std::string& s, char delim) { class OpenVINOExecutionProvider : public IExecutionProvider { public: explicit OpenVINOExecutionProvider(const ProviderInfo& info, SharedContext& shared_context); - ~OpenVINOExecutionProvider() = default; + 
~OpenVINOExecutionProvider(); std::vector> GetCapability(const GraphViewer& graph_viewer, diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index a80c250a75bf7..41f62377a6a3d 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -8,6 +8,7 @@ #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/openvino_provider_factory_creator.h" #include "core/providers/openvino/contexts.h" +#include "core/providers/openvino/backend_utils.h" #include "core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" @@ -289,6 +290,7 @@ struct OpenVINO_Provider : Provider { } void Shutdown() override { + backend_utils::DestroyOVTensors(shared_context_.shared_weights.metadata); OVCore::Teardown(); } From 6c108f2c978b1151aebf43c8abd46c117d6ba1c1 Mon Sep 17 00:00:00 2001 From: "Javier E. 
Martinez" Date: Fri, 24 Jan 2025 21:27:49 -0800 Subject: [PATCH 33/35] Add support for default device type based on project configuration --- .../openvino/openvino_provider_factory.cc | 288 +++++++++++------- 1 file changed, 171 insertions(+), 117 deletions(-) diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 41f62377a6a3d..aec02bbc8be7b 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,6 +14,150 @@ namespace onnxruntime { namespace openvino_ep { +void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { + pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; + pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; + pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; + pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; + pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); +} + +void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains("context")) { + uint64_t number = std::strtoull(provider_options.at("context").data(), nullptr, 16); + return reinterpret_cast(number); + } else { + return nullptr; + } +} + +bool ParseBooleanOption(const ProviderOptions& provider_options, std::string option_name) { + if (provider_options.contains(option_name)) { + const auto& value = provider_options.at(option_name); + if (value == "true" || value == "True") { + return true; + } else if (value == "false" || value == "False") { + return false; + } else { + ORT_THROW("[ERROR] [OpenVINO-EP] ", option_name, " should be a 
boolean.\n"); + } + } + return false; +} + +std::string ParseDeviceType(const ProviderOptions& provider_options, std::string option_name) { + const std::vector ov_available_devices = OVCore::GetAvailableDevices(); + + std::set ov_supported_device_types = {"CPU", "GPU", + "GPU.0", "GPU.1", "NPU"}; + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + // Expand set of supported device with OV devices + ov_supported_device_types.insert(ov_available_devices.begin(), ov_available_devices.end()); + + if (provider_options.contains(option_name)) { + const auto& selected_device = provider_options.at("device_type"); + + if (deprecated_device_types.contains(selected_device)) { + // Deprecated device and precision is handled together at ParsePrecision + return selected_device; + } + + if (!((ov_supported_device_types.contains(selected_device)) || + (selected_device.find("HETERO:") == 0) || + (selected_device.find("MULTI:") == 0) || + (selected_device.find("AUTO:") == 0))) { + ORT_THROW( + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" + " HETERO/MULTI/AUTO options available. 
\n"); + } + return selected_device; + } else { + std::string default_device; + + // Take default behavior from project configuration +#if defined OPENVINO_CONFIG_CPU + default_device = "CPU"; +#elif defined OPENVINO_CONFIG_GPU + default_device = "GPU"; +#elif defined OPENVINO_CONFIG_NPU + default_device = "NPU"; +#elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO + default_device = DEVICE_NAME; + + // Validate that devices passed are valid + int delimit = device_type.find(":"); + const auto& devices = device_type.substr(delimit + 1); + auto device_list = split(devices, ','); + for (const auto& device : devices) { + if (!ov_supported_device_types.contains(device)) { + ORT_THROW("[ERROR] [OpenVINO] Invalid device selected: ", device); + } + } +#endif + + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Choosing Device: " << default_device; + return default_device; + } +} + +// Depends on ProviderOptions. +std::string ParsePrecision(const ProviderOptions& provider_options, std::string& device_type, const std::string& option_name) { + using DeviceName = std::string; + using DefaultValue = std::string; + using ValidValues = std::list; + using foo = std::pair; + using ParserHelper = std::map; + ParserHelper helper = { + {"GPU", {"FP16", {"FP16", "FP32"}}}, + {"NPU", {"FP16", {"FP16"}}}, + {"CPU", {"FP32", {"FP32"}}}, + }; + + std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", + "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", + "GPU.0_FP16", "GPU.1_FP16"}; + + if (provider_options.contains(option_name)) { + // Start by checking if the device_type is a normal valid one + if (helper.contains(device_type)) { + auto const& valid_values = helper[device_type].second; + const auto& precision = provider_options.at(option_name); + if (precision == "ACCURACY") { + return valid_values.back(); // Return highest supported precision + } else { + if (std::find(valid_values.begin(), valid_values.end(), precision) != valid_values.end()) { + return 
precision; // Return precision selected if valid + } else { + auto value_iter = valid_values.begin(); + std::string valid_values_joined = *value_iter; + // Append 2nd and up, if only one then ++value_iter is same as end() + for (++value_iter; value_iter != valid_values.end(); ++value_iter) { + valid_values_joined += ", " + *value_iter; + } + + ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. ", device_type, " only supports ", valid_values_joined, ".\n"); + } + } + } else if (deprecated_device_types.contains(device_type)) { + LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + device_type + " is deprecated. \n" + << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " + << "'GPU.1', 'NPU' or from" + << " HETERO/MULTI/AUTO options and set 'precision' separately. \n"; + const std::string precision = device_type.substr(device_type.find("_") + 1); + device_type = device_type.substr(0, device_type.find("_")); + return precision; + } + } + // Return default + return helper[device_type].first; +} + +void ParseProviderOptions(ProviderInfo& result, const ProviderOptions& config_options) {} + struct OpenVINOProviderFactory : IExecutionProviderFactory { OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) : provider_info_(provider_info), shared_context_(shared_context) {} @@ -42,49 +186,17 @@ struct OpenVINO_Provider : Provider { // Extract the void_params into ProviderOptions and ConfigOptions using ConfigBuffer = std::pair; const ConfigBuffer* buffer = reinterpret_cast(void_params); - const auto& provider_options_map = *buffer->first; + const auto& provider_options = *buffer->first; const auto& config_options = buffer->second; ProviderInfo pi; std::string bool_flag = ""; - if (provider_options_map.find("device_type") != provider_options_map.end()) { - pi.device_type = provider_options_map.at("device_type").c_str(); - - std::set ov_supported_device_types = {"CPU", "GPU", - "GPU.0", "GPU.1", "NPU"}; - std::set 
deprecated_device_types = {"CPU_FP32", "GPU_FP32", - "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; - std::vector available_devices = OVCore::GetAvailableDevices(); - - for (auto& device : available_devices) { - if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { - ov_supported_device_types.emplace(device); - } - } - if (deprecated_device_types.find(pi.device_type) != deprecated_device_types.end()) { - std::string deprecated_device = pi.device_type; - int delimit = pi.device_type.find("_"); - pi.device_type = deprecated_device.substr(0, delimit); - pi.precision = deprecated_device.substr(delimit + 1); - LOGS_DEFAULT(WARNING) << "[OpenVINO] Selected 'device_type' " + deprecated_device + " is deprecated. \n" - << "Update the 'device_type' to specified types 'CPU', 'GPU', 'GPU.0', " - << "'GPU.1', 'NPU' or from" - << " HETERO/MULTI/AUTO options and set 'precision' separately. \n"; - } - if (!((ov_supported_device_types.find(pi.device_type) != ov_supported_device_types.end()) || - (pi.device_type.find("HETERO:") == 0) || - (pi.device_type.find("MULTI:") == 0) || - (pi.device_type.find("AUTO:") == 0))) { - ORT_THROW( - "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " - "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" - " HETERO/MULTI/AUTO options available. \n"); - } - } - if (provider_options_map.find("device_id") != provider_options_map.end()) { - std::string dev_id = provider_options_map.at("device_id").c_str(); + + pi.device_type = ParseDeviceType(provider_options, "device_type"); + + if (provider_options.contains("device_id")) { + std::string dev_id = provider_options.at("device_id").data(); LOGS_DEFAULT(WARNING) << "[OpenVINO] The options 'device_id' is deprecated. 
" << "Upgrade to set deice_type and precision session options.\n"; if (dev_id == "CPU" || dev_id == "GPU" || dev_id == "NPU") { @@ -93,34 +205,13 @@ struct OpenVINO_Provider : Provider { ORT_THROW("[ERROR] [OpenVINO] Unsupported device_id is selected. Select from available options."); } } - if (provider_options_map.find("precision") != provider_options_map.end()) { - pi.precision = provider_options_map.at("precision").c_str(); - } - if (pi.device_type.find("GPU") != std::string::npos) { - if (pi.precision == "") { - pi.precision = "FP16"; - } else if (pi.precision != "ACCURACY" && pi.precision != "FP16" && pi.precision != "FP32") { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. GPU only supports FP32 / FP16. \n"); - } - } else if (pi.device_type.find("NPU") != std::string::npos) { - if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP16") { - pi.precision = "FP16"; - } else { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. NPU only supported FP16. \n"); - } - } else if (pi.device_type.find("CPU") != std::string::npos) { - if (pi.precision == "" || pi.precision == "ACCURACY" || pi.precision == "FP32") { - pi.precision = "FP32"; - } else { - ORT_THROW("[ERROR] [OpenVINO] Unsupported inference precision is selected. CPU only supports FP32 . 
\n"); - } + if (provider_options.contains("cache_dir")) { + pi.cache_dir = provider_options.at("cache_dir"); } - if (provider_options_map.find("cache_dir") != provider_options_map.end()) { - pi.cache_dir = provider_options_map.at("cache_dir"); - } + pi.precision = ParsePrecision(provider_options, pi.device_type, "precision"); - if (provider_options_map.find("load_config") != provider_options_map.end()) { + if (provider_options.contains("load_config")) { auto parse_config = [&](const std::string& config_str) -> std::map { // If the config string is empty, return an empty map and skip processing if (config_str.empty()) { @@ -179,29 +270,25 @@ struct OpenVINO_Provider : Provider { return target_map; }; - pi.load_config = parse_config(provider_options_map.at("load_config")); + pi.load_config = parse_config(provider_options.at("load_config")); } - if (provider_options_map.find("context") != provider_options_map.end()) { - std::string str = provider_options_map.at("context"); - uint64_t number = std::strtoull(str.c_str(), nullptr, 16); - pi.context = reinterpret_cast(number); - } + pi.context = ParseUint64(provider_options, "context"); #if defined(IO_BUFFER_ENABLED) // a valid context must be provided to enable IO Buffer optimizations - if (context == nullptr) { + if (pi.context == nullptr) { #undef IO_BUFFER_ENABLED #define IO_BUFFER_ENABLED = 0 LOGS_DEFAULT(WARNING) << "Context is not set. Disabling IO Buffer optimization"; } #endif - if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { - if (!std::all_of(provider_options_map.at("num_of_threads").begin(), - provider_options_map.at("num_of_threads").end(), ::isdigit)) { + if (provider_options.contains("num_of_threads")) { + if (!std::all_of(provider_options.at("num_of_threads").begin(), + provider_options.at("num_of_threads").end(), ::isdigit)) { ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. 
\n"); } - pi.num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); + pi.num_of_threads = std::stoi(provider_options.at("num_of_threads")); if (pi.num_of_threads <= 0) { pi.num_of_threads = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_threads' should be in the positive range.\n " @@ -209,8 +296,8 @@ struct OpenVINO_Provider : Provider { } } - if (provider_options_map.find("model_priority") != provider_options_map.end()) { - pi.model_priority = provider_options_map.at("model_priority").c_str(); + if (provider_options.contains("model_priority")) { + pi.model_priority = provider_options.at("model_priority").data(); std::vector supported_priorities({"LOW", "MEDIUM", "HIGH", "DEFAULT"}); if (std::find(supported_priorities.begin(), supported_priorities.end(), pi.model_priority) == supported_priorities.end()) { @@ -221,59 +308,26 @@ struct OpenVINO_Provider : Provider { } } - if (provider_options_map.find("num_streams") != provider_options_map.end()) { - pi.num_streams = std::stoi(provider_options_map.at("num_streams")); + if (provider_options.contains("num_streams")) { + pi.num_streams = std::stoi(provider_options.at("num_streams")); if (pi.num_streams <= 0) { pi.num_streams = 1; LOGS_DEFAULT(WARNING) << "[OpenVINO-EP] The value for the key 'num_streams' should be in the range of 1-8.\n " << "Executing with num_streams=1"; } } - if (provider_options_map.find("enable_opencl_throttling") != provider_options_map.end()) { - bool_flag = provider_options_map.at("enable_opencl_throttling"); - if (bool_flag == "true" || bool_flag == "True") - pi.enable_opencl_throttling = true; - else if (bool_flag == "false" || bool_flag == "False") - pi.enable_opencl_throttling = false; - bool_flag = ""; - } + pi.enable_opencl_throttling = ParseBooleanOption(provider_options, "enable_opencl_throttling"); - if (provider_options_map.find("enable_qdq_optimizer") != provider_options_map.end()) { - bool_flag = provider_options_map.at("enable_qdq_optimizer"); 
- if (bool_flag == "true" || bool_flag == "True") - pi.enable_qdq_optimizer = true; - else if (bool_flag == "false" || bool_flag == "False") - pi.enable_qdq_optimizer = false; - else - ORT_THROW("[ERROR] [OpenVINO-EP] enable_qdq_optimiser should be a boolean.\n"); - bool_flag = ""; - } + pi.enable_qdq_optimizer = ParseBooleanOption(provider_options, "enable_qdq_optimizer"); + + pi.disable_dynamic_shapes = ParseBooleanOption(provider_options, "disable_dynamic_shapes"); + + ParseConfigOptions(pi, config_options); // Always true for NPU plugin or when passed . if (pi.device_type.find("NPU") != std::string::npos) { pi.disable_dynamic_shapes = true; } - if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { - bool_flag = provider_options_map.at("disable_dynamic_shapes"); - if (bool_flag == "true" || bool_flag == "True") { - pi.disable_dynamic_shapes = true; - } else if (bool_flag == "false" || bool_flag == "False") { - if (pi.device_type.find("NPU") != std::string::npos) { - pi.disable_dynamic_shapes = true; - LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to " - << "TRUE for NPU backend.\n "; - } else { - pi.disable_dynamic_shapes = false; - } - } - bool_flag = ""; - } - - pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; - pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1"; - pi.so_context_embed_mode = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "0") == "1"; - pi.so_share_ep_contexts = config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1"; - pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); // Append values to config to support weight-as-inputs conversion for shared contexts if (pi.so_share_ep_contexts) { From 6d1f1cf9bcdee2c3ddefc78755124cb6016dce44 Mon Sep 17 
00:00:00 2001 From: ankitm3k Date: Sat, 25 Jan 2025 16:00:11 +0530 Subject: [PATCH 34/35] fix: Fixed concrete_backend_ pointer double free issue on Linux --- onnxruntime/core/providers/openvino/backend_manager.cc | 1 - .../core/providers/openvino/backends/backend_factory.cc | 5 ----- onnxruntime/core/providers/openvino/backends/basic_backend.h | 1 + onnxruntime/core/providers/openvino/ibackend.h | 2 +- .../core/providers/openvino/openvino_provider_factory.cc | 4 ++-- 5 files changed, 4 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 3ae4677a86375..574b4371fee87 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -545,7 +545,6 @@ void BackendManager::Compute(OrtKernelContext* context) { void BackendManager::ShutdownBackendManager() { backend_map_.clear(); - BackendFactory::DestroyBackend(concrete_backend_.get()); concrete_backend_.reset(); } diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index 78c38ba882512..6c1ed9aa42727 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -34,10 +34,5 @@ BackendFactory::MakeBackend(std::unique_ptr& model_p } } -void BackendFactory::DestroyBackend(IBackend* backend) { - BasicBackend* backend_ptr = (BasicBackend*)backend; - delete backend_ptr; - backend_ptr = nullptr; -} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 22bcc4c1da40e..2690b84cb432f 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -37,6 +37,7 @@ class 
BasicBackend : public IBackend { ptr_stream_t& model_stream); void Infer(OrtKernelContext* context) override; + ~BasicBackend() override = default; ov::CompiledModel& GetOVCompiledModel() override { return exe_network_.Get(); } diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 2e01dc00faa6a..04d1f52cbf834 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -16,6 +16,7 @@ class IBackend { public: virtual void Infer(OrtKernelContext* context) = 0; virtual ov::CompiledModel& GetOVCompiledModel() = 0; + virtual ~IBackend() = default; }; using ptr_stream_t = std::unique_ptr; class BackendFactory { @@ -26,7 +27,6 @@ class BackendFactory { const SubGraphContext& subgraph_context, SharedContext& shared_context, ptr_stream_t& model_stream); - static void DestroyBackend(IBackend* backend); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index aec02bbc8be7b..40843be978d90 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -22,7 +22,7 @@ void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) { pi.so_context_file_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); } -void* ParseUint64(const ProviderOptions& provider_options, std::string option_name) { +void* ParseUint64(const ProviderOptions& provider_options, [[maybe_unused]] std::string option_name) { if (provider_options.contains("context")) { uint64_t number = std::strtoull(provider_options.at("context").data(), nullptr, 16); return reinterpret_cast(number); @@ -156,7 +156,7 @@ std::string ParsePrecision(const ProviderOptions& provider_options, std::string& return helper[device_type].first; } -void 
ParseProviderOptions(ProviderInfo& result, const ProviderOptions& config_options) {} +void ParseProviderOptions([[maybe_unused]] ProviderInfo& result, [[maybe_unused]] const ProviderOptions& config_options) {} struct OpenVINOProviderFactory : IExecutionProviderFactory { OpenVINOProviderFactory(ProviderInfo provider_info, SharedContext& shared_context) From 7179a0ba53d60fd98fe8e7d3e31dfbbdab3362d9 Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Mon, 27 Jan 2025 06:10:37 -0800 Subject: [PATCH 35/35] Preetha/weight sharing fix (#545) * Move variables from subgraph to session context for model specific properties * Fix for redundant subgraph creation * Remove unused variable --- .../providers/openvino/backend_manager.cc | 8 ----- .../core/providers/openvino/backend_utils.cc | 3 +- .../core/providers/openvino/backend_utils.h | 1 - .../openvino/backends/basic_backend.cc | 6 ++-- .../core/providers/openvino/contexts.h | 5 ++- .../openvino/openvino_execution_provider.cc | 4 +-- .../openvino/openvino_execution_provider.h | 2 +- .../openvino/ov_versions/capability.cc | 36 ++++++------------- 8 files changed, 20 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 574b4371fee87..3740fdc239aea 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -61,14 +61,6 @@ BackendManager::BackendManager(SessionContext& session_context, return ""; }(subgraph); - openvino_ep::GetCapability obj(ep_ctx_handle_, - subgraph, - session_context_.device_type, - session_context_.enable_qdq_optimizer); - std::ignore = obj.Execute(); - subgraph_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); - subgraph_context_.has_external_weights = obj.HasExternalWeights(); - // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). 
for (uint32_t index = 0; const auto& node : subgraph.GetInputs()) { diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index dfc094267f905..5322008905c0d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -123,7 +123,6 @@ bool IsCILogEnabled() { std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; @@ -132,7 +131,7 @@ CreateOVModel(const std::string model, auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); // Check for Constant Folding - if ((session_context.device_type != "NPU") && !subgraph_context.is_wholly_supported_graph) { + if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; pass_const_obj.run_on_model(ov_model); auto& results = const_cast(ov_model.get()->get_results()); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 06fdfe9cd5eca..a4e6fc0828f79 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -64,7 +64,6 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const std::string model, const SessionContext& session_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void CreateOVTensors(const std::string& device_name, diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index a730c0b59628b..4d294a298fdf5 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ 
b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -75,7 +75,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + ov_model = CreateOVModel(model, session_context_, const_outputs_map_); } LOGS_DEFAULT(INFO) << log_tag << "IO Buffering Enabled"; exe_network_ = OVCore::CompileModel( @@ -93,7 +93,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr device_config, subgraph_context_.subgraph_name); model_stream.reset(); // Delete stream after it is no longer needed - } else if (!subgraph_context_.has_external_weights && + } else if (!session_context_.has_external_weights && !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && auto_unified_compile) { @@ -114,7 +114,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr if (!subgraph_context.has_dynamic_input_shape) { delete model_proto.release(); } - ov_model = CreateOVModel(model, session_context_, subgraph_context_, const_outputs_map_); + ov_model = CreateOVModel(model, session_context_, const_outputs_map_); } exe_network_ = OVCore::CompileModel( ov_model, hw_target, device_config, subgraph_context_.subgraph_name); diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index a0462e5be35f3..3b9da726822d5 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -101,10 +101,11 @@ struct ProviderInfo { // Holds context applicable to the entire EP instance. 
struct SessionContext : ProviderInfo { SessionContext(const ProviderInfo& info) : ProviderInfo{info} {} - std::vector deviceAvailableList = {true, true, true, true, true, true, true, true}; std::filesystem::path onnx_model_path_name; uint32_t onnx_opset_version{0}; + mutable bool is_wholly_supported_graph = false; // Mutable so the const GetCapability() can update it + mutable bool has_external_weights = false; // Mutable so the const GetCapability() can update it const std::vector OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; const std::string openvino_sdk_version = std::to_string(OPENVINO_VERSION_MAJOR) + "." + std::to_string(OPENVINO_VERSION_MINOR); }; @@ -120,8 +121,6 @@ struct SubGraphContext { std::string subgraph_name; string_index_map_t input_names; string_index_map_t output_names; - bool is_wholly_supported_graph = false; - bool has_external_weights = false; std::string model_precision; bool is_ep_ctx_graph = false; }; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 68ee37097cc84..7bd50e71935a8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -169,13 +169,13 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, if (!(GetEnvironmentVar("ORT_OPENVINO_ENABLE_CI_LOG").empty())) { std::cout << "In the OpenVINO EP" << std::endl; } - openvino_ep::GetCapability obj(ep_ctx_handle_, graph_viewer, session_context_.device_type, session_context_.enable_qdq_optimizer); result = obj.Execute(); - + session_context_.is_wholly_supported_graph = obj.IsWhollySupportedGraph(); + session_context_.has_external_weights = obj.HasExternalWeights(); return result; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 
75f4ef9f8ecc8..1ce9f83fd78a8 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -50,7 +50,7 @@ class OpenVINOExecutionProvider : public IExecutionProvider { std::vector> GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const override; + const IKernelLookup& /*kernel_lookup*/) const override; Status Compile(const std::vector& fused_nodes, std::vector& node_compute_funcs) override; diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index cb538c84441fa..23cd7de6e84ba 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -74,26 +74,6 @@ std::vector> GetCapability::Execute() { // Check for EpContext nodes const auto& nodes = graph_viewer_.GetNodesInTopologicalOrder(); - for (const auto node_index : nodes) { - const auto& node = *graph_viewer_.GetNode(node_index); - if (ep_ctx_handler_.CheckForOVEPCtxNode(node)) { - std::vector inputs; - std::vector outputs; - - Iterable2String(inputs, node.InputDefs()); - Iterable2String(outputs, node.OutputDefs()); - - auto sub_graph = IndexedSubGraph::Create(); - sub_graph->Nodes().push_back(node_index); - auto meta_def = IndexedSubGraph_MetaDef::Create(); - meta_def->name() = node.Name(); - meta_def->domain() = kMSDomain; - meta_def->inputs() = inputs; - meta_def->outputs() = outputs; - sub_graph->SetMetaDef(std::move(meta_def)); - result.push_back(ComputeCapability::Create(std::move(sub_graph))); - } - } // If all the nodes have been accounted for then no more processing is needed if (result.size() == nodes.size()) { @@ -109,8 +89,8 @@ std::vector> GetCapability::Execute() { if (openvino_ep::backend_utils::IsDebugEnabled()) { std::cout << "No of unsupported nodes " << unsupported_nodes.size() << std::endl; for 
(size_t i = 0; i < unsupported_nodes.size(); i++) { - const Node* node = graph_viewer_.GetNode(unsupported_nodes[i]); - std::cout << "Unsupported node op " << node->OpType() << std::endl; + const Node* unode = graph_viewer_.GetNode(unsupported_nodes[i]); + std::cout << "Unsupported node op " << unode->OpType() << std::endl; } } #endif @@ -190,9 +170,16 @@ std::vector> GetCapability::Execute() { int no_of_clusters = 0; for (auto this_cluster : connected_clusters) { - // If subgraph has less then three, graph is considered trivial + + // If the subgraph has fewer than three nodes, it is considered trivial unless it is an EPContext cluster if (this_cluster.size() < 3) { - continue; + bool is_epctx_node = false; + for (auto node_idx : this_cluster) { + if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext") + is_epctx_node = true; + } + if (!is_epctx_node) + continue; } std::vector cluster_graph_inputs, cluster_inputs, cluster_outputs; @@ -245,7 +232,6 @@ std::vector> GetCapability::Execute() { } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters; } - return result; }