-
Notifications
You must be signed in to change notification settings - Fork 57
Openvino/ep weight sharing #548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f6cff34
f170c88
37cee3f
409cb47
3b2b7e9
3949bf5
28c928a
7a89c5a
ad66ae0
c17b276
5e734f1
c9fb757
a78166d
01ac259
db075cd
8209162
89ebe8d
ae408af
ac9c998
6512ec6
5594817
f85c7b5
f25f72c
225b678
5324011
241cfae
16ddb42
10e851b
6f7782c
f3e4e07
1d4c16e
6ee1e16
6c108f2
6d1f1cf
7179a0b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,21 +1,107 @@ | ||
| // Copyright (C) Intel Corporation | ||
| // Licensed under the MIT License | ||
|
|
||
| #include <algorithm> | ||
| #include <sstream> | ||
| #include <fstream> | ||
| #include <utility> | ||
|
|
||
| #include <filesystem> | ||
|
Check notice on line 8 in onnxruntime/core/providers/openvino/backend_utils.cc
|
||
| #include <stdexcept> | ||
|
|
||
| #include "openvino/pass/convert_fp32_to_fp16.hpp" | ||
| #include "openvino/pass/constant_folding.hpp" | ||
| #include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" | ||
| #include "core/providers/shared_library/provider_api.h" | ||
| #include "core/providers/openvino/backend_utils.h" | ||
| #include "core/providers/openvino/ov_interface.h" | ||
|
|
||
|
|
||
| using Exception = ov::Exception; | ||
|
|
||
| namespace onnxruntime { | ||
| namespace openvino_ep { | ||
|
|
||
| SharedContext::SharedWeights::WeightsFile::WeightsFile(std::filesystem::path filename) : file_(filename, std::ios::in | std::ios::binary) { | ||
| try { | ||
| file_.exceptions(std::ifstream::failbit | std::ifstream::badbit); | ||
| weights_size_ = file_.seekg(0, std::ios::end).tellg(); | ||
| } catch (std::ifstream::failure& e) { | ||
| ORT_THROW("Error: Failed to open weight file at ", filename.string(), " ", e.what()); | ||
| } | ||
| } | ||
|
|
||
| void SharedContext::SharedWeights::WeightsFile::load_weights(size_t file_offset, void* data, size_t size) { | ||
| ORT_ENFORCE(file_offset < weights_size_ && size <= weights_size_ && (file_offset <= weights_size_ - size), "Error: File offset is out of bounds."); | ||
| file_.seekg(file_offset); | ||
| file_.read(reinterpret_cast<char*>(data), size); | ||
| } | ||
|
|
||
| std::ostream& operator<<(std::ostream& stream, const SharedContext::SharedWeights::Metadata::Map& metadata) { | ||
| try { | ||
| stream << metadata.size(); | ||
|
|
||
| // Write each key-value pair | ||
| // Put elements in separate lines to facilitate reading | ||
| for (const auto& [key, value] : metadata) { | ||
| stream << std::endl | ||
| << key.name; | ||
| stream << std::endl | ||
| << value.location; | ||
| stream << std::endl | ||
| << value.data_offset; | ||
| stream << std::endl | ||
| << value.size; | ||
| stream << std::endl | ||
| << value.dimensions.size(); | ||
| for (const auto& dim : value.dimensions) { | ||
| stream << std::endl | ||
| << dim; | ||
| } | ||
| stream << std::endl | ||
| << value.element_type; | ||
| } | ||
| } catch (const Exception& e) { | ||
| ORT_THROW("Error: Failed to write map data.", e.what()); | ||
| } catch (...) { | ||
| ORT_THROW("Error: Failed to write map data."); | ||
| } | ||
|
|
||
| ORT_ENFORCE(stream.good(), "Error: Failed to write map data."); | ||
| return stream; | ||
| } | ||
|
|
||
| std::istream& operator>>(std::istream& stream, SharedContext::SharedWeights::Metadata::Map& metadata) { | ||
| size_t map_size{0}; | ||
| try { | ||
| stream >> map_size; | ||
|
|
||
| while (!stream.eof()) { | ||
| SharedContext::SharedWeights::Metadata::Key key; | ||
| SharedContext::SharedWeights::Metadata::Value value; | ||
| stream >> key.name; | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how do we prevent overflow here |
||
| stream >> value.location; | ||
| stream >> value.data_offset; | ||
| stream >> value.size; | ||
| size_t num_dimensions; | ||
| stream >> num_dimensions; | ||
| value.dimensions.resize(num_dimensions); | ||
| for (auto& dim : value.dimensions) { | ||
| stream >> dim; | ||
| } | ||
| stream >> value.element_type; | ||
| metadata.emplace(key, value); | ||
| } | ||
| } catch (const Exception& e) { | ||
| ORT_THROW("Error: Failed to read map data.", e.what()); | ||
| } catch (...) { | ||
| ORT_THROW("Error: Failed to read map data."); | ||
| } | ||
|
|
||
| ORT_ENFORCE(metadata.size() == map_size, "Error: Inconsistent map data."); | ||
|
|
||
| return stream; | ||
| } | ||
|
|
||
| namespace backend_utils { | ||
|
|
||
| bool IsDebugEnabled() { | ||
|
|
@@ -34,23 +120,18 @@ | |
| return false; | ||
| } | ||
|
|
||
// Function object that converts any statically-convertible value to int64_t.
// Used as a transform functor (e.g. when widening shape/dimension values).
struct static_cast_int64 {
  template <typename T>  // T models a type statically convertible to int64_t
  int64_t operator()(const T& value) const {
    return static_cast<int64_t>(value);
  }
};
|
|
||
| std::shared_ptr<const OVNetwork> | ||
| CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, | ||
| CreateOVModel(const std::string model, | ||
| const SessionContext& session_context, | ||
| std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) { | ||
| if (IsCILogEnabled()) { | ||
| std::cout << "CreateNgraphFunc" << std::endl; | ||
| } | ||
| const std::string model = model_proto.SerializeAsString(); | ||
| try { | ||
| auto ov_model = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); | ||
| auto ov_model = OVCore::ReadModel(model, session_context.onnx_model_path_name.string()); | ||
|
|
||
| // Check for Constant Folding | ||
| if ((global_context.device_type != "NPU") && !global_context.is_wholly_supported_graph) { | ||
| if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { | ||
| ov::pass::ConstantFolding pass_const_obj; | ||
| pass_const_obj.run_on_model(ov_model); | ||
| auto& results = const_cast<ov::ResultVector&>(ov_model.get()->get_results()); | ||
|
|
@@ -82,7 +163,7 @@ | |
| GetOutputTensor(Ort::KernelContext& context, size_t batch_size, | ||
| OVInferRequestPtr infer_request, | ||
| std::string output_name, | ||
| std::unordered_map<std::string, int> output_names) { | ||
| const SubGraphContext::string_index_map_t& output_names) { | ||
| auto graph_output_blob = infer_request->GetTensor(output_name); | ||
|
|
||
| auto graph_output_dims = graph_output_blob->get_shape(); | ||
|
|
@@ -107,7 +188,7 @@ | |
| Ort::UnownedValue | ||
| GetOutputTensor(Ort::KernelContext& context, | ||
| std::string output_name, | ||
| std::unordered_map<std::string, int> output_names, | ||
| const SubGraphContext::string_index_map_t& output_names, | ||
| std::shared_ptr<ov::Node> node) { | ||
| // Find position of '/' in the output_name | ||
| int pos = output_name.find("/"); | ||
|
|
@@ -129,13 +210,13 @@ | |
| return context.GetOutput(index, output_shape.get(), num_dims); | ||
| } | ||
|
|
||
| int GetFirstAvailableDevice(GlobalContext& global_context) { | ||
| int GetFirstAvailableDevice(SessionContext& session_context) { | ||
| int i = 0; | ||
| // Get the first available VAD-M device and set the device to busy | ||
| while (i < 8) { | ||
| bool device = global_context.deviceAvailableList[i]; | ||
| bool device = session_context.deviceAvailableList[i]; | ||
| if (device) { | ||
| global_context.deviceAvailableList[i] = false; | ||
| session_context.deviceAvailableList[i] = false; | ||
| break; | ||
| } | ||
| i++; | ||
|
|
@@ -144,9 +225,9 @@ | |
| // make all remaining devices free | ||
| if (i == 8) { | ||
| i = 0; | ||
| global_context.deviceAvailableList[i] = false; | ||
| session_context.deviceAvailableList[i] = false; | ||
| for (int j = 1; j < 8; j++) { | ||
| global_context.deviceAvailableList[j] = true; | ||
| session_context.deviceAvailableList[j] = true; | ||
| } | ||
| } | ||
| return i; | ||
|
|
@@ -267,6 +348,78 @@ | |
| printPerformanceCounts(performanceMap, stream, std::move(deviceName)); | ||
| } | ||
|
|
||
| ov::element::Type GetOpenVINOElementType(ONNX_NAMESPACE::TensorProto_DataType dt) { | ||
| static std::unordered_map<ONNX_NAMESPACE::TensorProto_DataType, ov::element::Type> map{ | ||
|
Check notice on line 352 in onnxruntime/core/providers/openvino/backend_utils.cc
|
||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @saurabhkale17 can you take care of this cpplint issue |
||
| {ONNX_NAMESPACE::TensorProto_DataType_FLOAT, ov::element::f32}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_UINT8, ov::element::u8}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_INT8, ov::element::i8}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_UINT16, ov::element::u16}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_INT16, ov::element::i16}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_INT32, ov::element::i32}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_INT64, ov::element::i64}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_STRING, ov::element::string}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_BOOL, ov::element::boolean}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, ov::element::f16}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, ov::element::f64}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_UINT32, ov::element::u32}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_UINT64, ov::element::u64}, | ||
| //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64, ov::element::undefined}, | ||
| //{ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128, ov::element::undefined}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, ov::element::bf16}, | ||
| //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN, ov::element::undefined}, | ||
| //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ, ov::element::undefined}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2, ov::element::f8e5m2}, | ||
| //{ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ, ov::element::undefined}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_UINT4, ov::element::u4}, | ||
| {ONNX_NAMESPACE::TensorProto_DataType_INT4, ov::element::i4}, | ||
| }; | ||
|
|
||
| if (auto result = map.find(dt); result != map.end()) { | ||
| return result->second; | ||
| } else { | ||
| throw std::runtime_error("Unsupported ONNX data type: " + std::to_string(dt)); | ||
| } | ||
| } | ||
|
|
||
| // Function to handle tensor creation from external data | ||
| void CreateOVTensors(const std::string& device_name, | ||
|
Check notice on line 385 in onnxruntime/core/providers/openvino/backend_utils.cc
|
||
| SharedContext::SharedWeights::Metadata::Map& metadata_map, | ||
| SharedContext::SharedWeights::WeightsFile &weights) { | ||
| for (auto& [key, value] : metadata_map) { | ||
| if (value.tensor) continue; | ||
|
|
||
| // Get element data type | ||
| auto onnx_element_type = (ONNX_NAMESPACE::TensorProto_DataType)value.element_type; | ||
|
|
||
| ov::element::Type ov_elementType = GetOpenVINOElementType(onnx_element_type); // Map to OpenVINO data type | ||
|
|
||
| // Create OpenVINO Tensor | ||
| if (device_name == "NPU") { | ||
| // Use remote tensors | ||
| auto npu_context = OVCore::Get().get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>(); | ||
| auto&& remote_tensor = npu_context.create_l0_host_tensor(ov_elementType, value.dimensions, ov::intel_npu::TensorType::INPUT); | ||
|
|
||
| // Copy data to remote tensor | ||
| weights.load_weights(value.data_offset, remote_tensor.get(), value.size); | ||
| value.tensor = std::make_shared<ov::Tensor>(remote_tensor); | ||
| } else { | ||
| // Use vanilla tensors | ||
| value.tensor = std::make_shared<ov::Tensor>(ov_elementType, value.dimensions); | ||
|
Check notice on line 407 in onnxruntime/core/providers/openvino/backend_utils.cc
|
||
| weights.load_weights(value.data_offset, value.tensor->data(), value.size); | ||
| } | ||
| ORT_ENFORCE(value.tensor->get_byte_size() == value.size, "Unexpected tensor size mismatch"); | ||
| } | ||
| } | ||
|
|
||
| void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map) { | ||
| for (auto& [key, value] : metadata_map) { | ||
| if (value.tensor) { | ||
| value.tensor.reset(); | ||
| } | ||
| } | ||
| metadata_map.clear(); | ||
| } | ||
|
|
||
| } // namespace backend_utils | ||
| } // namespace openvino_ep | ||
| } // namespace onnxruntime | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@saurabhkale17 please run cpplint to ensure this issue does not occur with MSFT CI Pipelines