diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index 8d07cd9a366..6338ea8891e 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -70,6 +70,24 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) endif() +if(EXECUTORCH_BUILD_OPENVINO_NANOGPT_RUNNER) + # Build executor runner binary for openvino backend + list(APPEND openvino_nanogpt_runner_libs openvino_backend executorch extension_module_static extension_tensor) + + set(_openvino_nanogpt_runner__srcs + ${EXECUTORCH_ROOT}/examples/llm_manual/main.cpp + ) + add_executable(openvino_nanogpt_runner ${_openvino_nanogpt_runner__srcs}) + + list(APPEND openvino_nanogpt_runner_libs) + + target_link_libraries( + openvino_nanogpt_runner gflags portable_ops_lib ${openvino_nanogpt_runner_libs} + ) + target_compile_options(openvino_nanogpt_runner PUBLIC ${_common_compile_options}) +endif() + + # Install OpenVINO backend library to the lib directory install(TARGETS openvino_backend DESTINATION lib) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index bc3fde573e2..b64ebb0a7b2 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -26,6 +26,12 @@ from torch.fx.passes.operator_support import OperatorSupportBase +class PatternNode: + op_types = {} + + def __init__(self): + self.op_types = {} + class OpenvinoOperatorsSupport(OperatorSupportBase): def __init__( @@ -62,6 +68,13 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: op_type = node.target.__name__ else: op_type = str(node.target) + + if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip: + print( + f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped." 
+ ) + return True + supported_ops = OperatorSupport(options)._support_dict if op_type == "getitem": return True @@ -71,11 +84,6 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: else: print("Op not supported: ", "torch.ops." + str(op_type)) - if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip: - print( - f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped." - ) - return False return False @@ -119,6 +127,69 @@ def ops_to_not_decompose( torch.ops.aten.upsample_nearest2d.vec, ] return (ops_not_decompose, None) + + def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.0 - op: ", node.op) + if node.op == "call_function": + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.1") + if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - B - target: ", node.target.__name__) + pt_input_nodes = node.all_input_nodes + pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)] + if pattern_input_ops is None: + enabled_ops.append(node) + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.1") + return True + if len(pt_input_nodes) != len(pattern_input_ops): + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.2") + return False + for i in range(len(pt_input_nodes)): + if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops): + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.3") + return False + enabled_ops.append(node) + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.4") + return True + elif node.op == "get_attr": + if "get_attr" in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2") + return True + else: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3") + return False + elif 
node.op == "placeholder": + if "placeholder" in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2") + return True + else: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3") + return False + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.4") + return False + + def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): + const_node = PatternNode + const_node.op_types["get_attr"] = None + const_node.op_types["placeholder"] = None + bitwise_right_shift_node = PatternNode + bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] + bitwise_and_node = PatternNode + bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node] + stack_node = PatternNode + stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + + print("DEBUG - capture_nncf_patterns - A") + for node in graph_module.graph.nodes: + print("\tDEBUG - capture_nncf_patterns - B - op: ", node.op, ", target: ", node.target) + if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default": + print("\tDEBUG - capture_nncf_patterns - C - stack found") + enabled_ops = [] + pattern_match = self.check_pattern(node, stack_node, enabled_ops) + if pattern_match: + print("\tDEBUG - capture_nncf_patterns - D - match") + for pattern_op in enabled_ops: + print(pattern_op.name) + self._op_names_to_skip.add(pattern_op.name) def partition(self, exported_program: ExportedProgram) -> PartitionResult: """ @@ -127,15 +198,44 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: :param exported_program: The exported program. :return: A PartitionResult containing the partitioned graph and delegation tags. 
""" + + self._op_names_to_skip = set() + print("DEBUG - OpenvinoPartitioner - graph") + #print(exported_program.graph_module.code) + for node in exported_program.graph_module.graph.nodes: + if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor": + #if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor" and str(node.name).strip() == "aten_slice_copy_tensor_6": + print("\tDEBUG - OpenvinoPartitioner - slice_copy - op: ", node.op, ", target: ", node.target.__name__, ", name: ", node.name) + if not (len(node.all_input_nodes) == 3): + continue + slice_copy_in0 = node.all_input_nodes[0] + if not (str(slice_copy_in0.op).strip() == "placeholder"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in0 - op: ", slice_copy_in0.op, ", target: ", slice_copy_in0.target, ", name: ", slice_copy_in0.name) + slice_copy_in1 = node.all_input_nodes[1] + if not (str(slice_copy_in1.op).strip() == "call_function" and str(slice_copy_in1.target.__name__).strip() == "_local_scalar_dense.default"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in1 - op: ", slice_copy_in1.op, ", target: ", slice_copy_in1.target.__name__, ", name: ", slice_copy_in1.name) + slice_copy_in2 = node.all_input_nodes[2] + if not (str(slice_copy_in2.op).strip() == "call_function" and str(slice_copy_in2.target.__name__).strip() == "add"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in2 - op: ", slice_copy_in2.op, ", target: ", slice_copy_in2.target.__name__, ", name: ", slice_copy_in2.name) + #for input_node in node.all_input_nodes: + # print("\tDEBUG - OpenvinoPartitioner - input_node - op: ", input_node.op, ", target: ", input_node.target, ", name: ", input_node.name) + self._op_names_to_skip.add(node.name) + + self.capture_nncf_patterns(exported_program.graph_module) partitioner = CapabilityBasedPartitioner( exported_program.graph_module, 
OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), allows_single_node_partition=True, ) partition_list = partitioner.propose_partitions() + print("DEBUG - num_parts: ", len(partition_list)) partition_tags = {} for partition in partition_list: + print("\tDEBUG - part - size: ", partition.size()) for node in partition.nodes: tag = f"tag{partition.id}" node.meta["delegation_tag"] = tag diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index c343f44a8b5..665921f50e7 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -52,3 +52,4 @@ def preprocess( model_bytes = compiled.export_model() return PreprocessResult(processed_bytes=model_bytes.getvalue()) + #return PreprocessResult(processed_bytes=model_bytes) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 316633e9004..ccb2aa91430 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf +git+https://github.com/openvinotoolkit/nncf@develop#egg=nncf diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index a3134f72b4b..20e308e6ef7 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -23,6 +23,36 @@ namespace executorch { namespace backends { namespace openvino { + +std::string scalarTypeToString(exa::ScalarType type) { + switch (type) { + case exa::ScalarType::Byte: return "Byte"; + case exa::ScalarType::Char: return "Char"; + case exa::ScalarType::Short: return "Short"; + case exa::ScalarType::Int: return "Int"; + case exa::ScalarType::Long: return "Long"; + case exa::ScalarType::Half: return "Half"; + case exa::ScalarType::Float: return "Float"; + case exa::ScalarType::Double: return "Double"; + case exa::ScalarType::Bool: return "Bool"; + case 
exa::ScalarType::BFloat16: return "BFloat16"; + case exa::ScalarType::ComplexHalf: return "ComplexHalf"; + case exa::ScalarType::ComplexFloat: return "ComplexFloat"; + case exa::ScalarType::ComplexDouble: return "ComplexDouble"; + case exa::ScalarType::QUInt8: return "QUInt8"; + case exa::ScalarType::QInt8: return "QInt8"; + case exa::ScalarType::QInt32: return "QInt32"; + case exa::ScalarType::QUInt4x2: return "QUInt4x2"; + case exa::ScalarType::QUInt2x4: return "QUInt2x4"; + case exa::ScalarType::Undefined: return "Undefined"; + case exa::ScalarType::NumOptions: return "NumOptions"; + default: + throw std::invalid_argument("Unknown ScalarType"); + } +} + + + OpenvinoBackend::OpenvinoBackend() {} bool OpenvinoBackend::is_available() const { @@ -71,7 +101,9 @@ exr::Result OpenvinoBackend::init( } // Import the model + //std::cout << "DEBUG - before import" << std::endl; auto compiled_model = core.import_model(compiled_stream, device); + //std::cout << "DEBUG - after import" << std::endl; // The processed data can be freed since the model is compiled processed->Free(); @@ -102,22 +134,111 @@ exr::Error OpenvinoBackend::execute( size_t num_outputs = infer_request->get_compiled_model().outputs().size(); // Set inputs + //std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl; for (size_t i = 0; i < num_inputs; i++) { - auto input_tensor = args[i]->toTensor(); - ov::Shape input_shape( - input_tensor.sizes().begin(), input_tensor.sizes().end()); - - // Convert input tensor to OpenVINO tensor - ov::element::Type ov_type = - convert_to_openvino_type(input_tensor.scalar_type()); - ov::Tensor ov_input_tensor( - ov_type, input_shape, input_tensor.mutable_data_ptr()); - - infer_request->set_input_tensor(i, ov_input_tensor); + //std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl; + + //if (args[i]->isNone()) { + // std::cout << "DEBUG - Module - forward - A - type: none" << std::endl; + //} else if (args[i]->isInt()) { + // 
std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl; + //} else if (args[i]->isDouble()) { + // std::cout << "DEBUG - Module - forward - A - type: double" << std::endl; + //} else if (args[i]->isBool()) { + // std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl; + //} else if (args[i]->isScalar()) { + // std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl; + //} else if (args[i]->isTensor()) { + // std::cout << "DEBUG - Module - forward - A - type: tensor, shape: ["; + // for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[i]->toTensor().size(j) << ", "; + // } + // std::cout << "]" << std::endl; + //} else if (args[i]->isString()) { + // std::cout << "DEBUG - Module - forward - A - type: string" << std::endl; + //} else if (args[i]->isIntList()) { + // std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl; + //} else if (args[i]->isBoolList()) { + // std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl; + //} else if (args[i]->isDoubleList()) { + // std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl; + //} else if (args[i]->isTensorList()) { + // std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl; + //} else if (args[i]->isListOptionalTensor()) { + // std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl; + //} else { + // std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl; + //} + + if (args[i]->isInt()) { + //std::cout << "DEBUG - OpenvinoBackend - input - B.1" << std::endl; + //auto input_tensor = args[i]->toInt(); + //std::cout << "DEBUG - OpenvinoBackend - input - B.2" << std::endl; + //ov::Shape input_shape( + // input_tensor.sizes().begin(), input_tensor.sizes().end()); + + //std::cout << "DEBUG - OpenvinoBackend - input - B.3" << std::endl; + // Convert input tensor to OpenVINO tensor + //std::cout << "DEBUG - 
OpenvinoBackend - input - B.4" << std::endl; + //int64_t val = args[i]->toInt(); + //int64_t val = i; + int64_t *val = &(args[i]->payload.copyable_union.as_int); + //std::cout << "DEBUG - OpenvinoBackend - input - B.5 - val: " << val << std::endl; + //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{}, &val); + //std::vector val = {args[i]->toInt()}; + //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, &val); + ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); + //std::cout << "\tDEBUG - OpenvinoBackend - input - int - val: " << ((int64_t*)(ov_input_tensor.data()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl; + + infer_request->set_input_tensor(i, ov_input_tensor); + //std::cout << "DEBUG - OpenvinoBackend - input - B.7" << std::endl; + } else { + //std::cout << "DEBUG - OpenvinoBackend - input - C.1" << std::endl; + auto input_tensor = args[i]->toTensor(); + //std::cout << "DEBUG - OpenvinoBackend - input - C.2" << std::endl; + ov::Shape input_shape( + input_tensor.sizes().begin(), input_tensor.sizes().end()); + + //std::cout << "DEBUG - OpenvinoBackend - input - C.3" << std::endl; + // Convert input tensor to OpenVINO tensor + ov::element::Type ov_type = + convert_to_openvino_type(input_tensor.scalar_type()); + //std::cout << "DEBUG - OpenvinoBackend - input - C.4" << std::endl; + ov::Tensor ov_input_tensor( + ov_type, input_shape, input_tensor.mutable_data_ptr()); + //std::cout << "DEBUG - OpenvinoBackend - input - C.5" << std::endl; + + infer_request->set_input_tensor(i, ov_input_tensor); + //std::cout << "DEBUG - OpenvinoBackend - input - C.6" << std::endl; + + //if (ov_type == ov::element::i64) { + // int64_t sum = 0; + // auto data_ptr = ov_input_tensor.data(); + // for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(int64_t); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << 
ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} else { + // float sum = 0; + // auto data_ptr = ov_input_tensor.data(); + // for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(float); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} + //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << std::endl; + } } // Set outputs + //std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl; for (size_t i = 0; i < num_outputs; i++) { + //args[num_inputs + i]->toTensor().unsafeGetTensorImpl()->set_size(1,1); + //std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: ["; + //for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; + //} + //std::cout << "]" << std::endl; auto output_tensor = args[num_inputs + i]->toTensor(); ov::Shape output_shape( output_tensor.sizes().begin(), output_tensor.sizes().end()); @@ -133,7 +254,23 @@ exr::Error OpenvinoBackend::execute( // Execute the inference infer_request->infer(); - + //for (size_t i = 0; i < num_outputs; i++) { + // auto out_t = infer_request->get_output_tensor(i); + // float sum = 0; + // auto data_ptr = out_t.data(); + // for (size_t j=0; j < out_t.get_byte_size()/sizeof(float); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << ", type: " << out_t.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} + //auto out_t = infer_request->get_output_tensor(0); + //std::cout << "DEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << std::endl; + 
//for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; + //} + //std::cout << "]" << std::endl; + + //std::cout << "DEBUG - OpenvinoBackend - DD" << std::endl; return exr::Error::Ok; } @@ -162,13 +299,18 @@ void OpenvinoBackend::destroy(exr::DelegateHandle* handle) const { ov::element::Type OpenvinoBackend::convert_to_openvino_type( exa::ScalarType scalar_type) const { + //std::cout << "DEBUG - scalar_type: " << scalarTypeToString(scalar_type) << std::endl; switch (scalar_type) { case exa::ScalarType::Float: return ov::element::f32; + case exa::ScalarType::Half: + return ov::element::f16; case exa::ScalarType::Int: return ov::element::i32; case exa::ScalarType::Char: return ov::element::i8; + case exa::ScalarType::Byte: + return ov::element::u8; case exa::ScalarType::Long: return ov::element::i64; case exa::ScalarType::Bool: diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py new file mode 100644 index 00000000000..debefdd1a35 --- /dev/null +++ b/backends/openvino/utils.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +import os + +from typing import Any, Dict, Optional, Tuple, Union + +import executorch.exir as exir + +import torch +from executorch.exir import EdgeProgramManager, ExecutorchProgramManager +from executorch.exir.program._program import to_edge_with_preserved_ops +from executorch.exir.tracer import Value +from torch.export import export, export_for_training, ExportedProgram + + +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=True, + _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue +) + + +def _to_core_aten( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + strict=True, + verbose=True, +) -> ExportedProgram: + # post autograd export. eventually this will become .to_core_aten + if not isinstance(model, torch.fx.GraphModule) and not isinstance( + model, torch.nn.Module + ): + raise ValueError( + f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" + ) + core_aten_ep = export( + model, + example_inputs, + example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) + if verbose: + logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") + return core_aten_ep + + +def _core_aten_to_edge( + core_aten_exir_ep: ExportedProgram, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=None, + verbose=True, +) -> EdgeProgramManager: + if not edge_compile_config: + edge_compile_config = exir.EdgeCompileConfig( + _check_ir_validity=False, # quant ops currently break ir verification + ) + edge_manager: EdgeProgramManager = to_edge_with_preserved_ops( + core_aten_exir_ep, + constant_methods=edge_constant_methods, + compile_config=edge_compile_config, + preserve_ops=[torch.ops.aten.stack.default,], + ) + if verbose: + logging.info(f"Exported 
graph:\n{edge_manager.exported_program()}") + return edge_manager + + +def export_to_edge( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=_EDGE_COMPILE_CONFIG, + strict=True, + verbose=True, +) -> EdgeProgramManager: + print("DEBUG - executorch - openvino_utils") + core_aten_ep = _to_core_aten( + model, + example_inputs, + example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + verbose=verbose, + ) + return _core_aten_to_edge( + core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose + ) + + +def export_to_exec_prog( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict[str, Any]] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=_EDGE_COMPILE_CONFIG, + backend_config=None, + strict=True, +) -> ExecutorchProgramManager: + m = model.eval() + # pre-autograd export. eventually this will become torch.export + m = export_for_training(m, example_inputs, strict=True).module() + + core_aten_ep = _to_core_aten( + m, + example_inputs, + example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) + + edge_m = _core_aten_to_edge( + core_aten_ep, edge_constant_methods, edge_compile_config + ) + + exec_prog = edge_m.to_executorch(backend_config) + return exec_prog + + +def save_pte_program( + prog: ExecutorchProgramManager, model_name: str, output_dir: str = "" +) -> str: + if model_name.endswith(".pte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.pte") + + try: + # Write program to file. 
+ with open(filename, "wb") as file: + prog.write_to_file(file) + logging.info(f"Saved exported program to {filename}") + # Write data to file/s. + prog.write_tensor_data_to_file(outdir=output_dir) + except Exception as e: + logging.error(f"Error while saving to {filename}: {e}") + + return filename diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 952cdf1b65d..f0be54f8806 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -173,6 +173,14 @@ if(TARGET qnn_executorch_backend) target_link_options_shared_lib(qnn_executorch_backend) endif() +# Openvino backend +if(TARGET openvino_backend) + find_package(OpenVINO REQUIRED) + target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core) + list(APPEND link_libraries openvino_backend) + target_link_options_shared_lib(openvino_backend) +endif() + # MPS backend if(TARGET mpsdelegate) list( diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 3a3102886f8..9cd906ad2f3 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -38,6 +38,7 @@ from executorch.extension.llm.export.partitioner_lib import ( get_coreml_partitioner, get_mps_partitioner, + get_openvino_partitioner, get_qnn_partitioner, get_vulkan_partitioner, get_xnnpack_partitioner, @@ -441,6 +442,7 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True", ) + parser.add_argument("--openvino", action="store_true") parser.add_argument( "--expand_rope_table", @@ -546,6 +548,13 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="If true, stops right after torch.export() and saves the exported model.", ) + + parser.add_argument( + "--nncf_compression", + default=False, + action="store_true", + help="If true, stops right after 
torch.export() and saves the exported model.", + ) return parser @@ -851,6 +860,7 @@ def _to_edge_and_lower_llama( # noqa: C901 mps: bool = False, coreml: bool = False, qnn: bool = False, + openvino: bool = False, dtype_override: str = "fp32", enable_dynamic_shape: bool = True, use_kv_cache: bool = False, @@ -887,6 +897,10 @@ def _to_edge_and_lower_llama( # noqa: C901 partitioners.append(get_mps_partitioner(use_kv_cache)) modelname = f"mps_{modelname}" + if openvino: + partitioners.append(get_openvino_partitioner(use_kv_cache)) + modelname = f"openvino_{modelname}" + if coreml: coreml_partitioner = get_coreml_partitioner( coreml_ios, @@ -1063,6 +1077,7 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 mps=args.mps, coreml=args.coreml, qnn=args.qnn, + openvino=args.openvino, dtype_override=args.dtype_override, enable_dynamic_shape=args.enable_dynamic_shape, use_kv_cache=args.use_kv_cache, @@ -1221,6 +1236,7 @@ def _load_llama_model( use_legacy_export=args.qnn, save_exported_program=args.export_only, verbose=verbose, + nncf_compression=args.nncf_compression, metadata=_load_llama_model_metadata( weight_type, use_kv_cache, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 323311caeea..6339da0b311 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -15,7 +15,7 @@ from enum import Enum from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch - +import nncf import torch from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, @@ -41,6 +41,7 @@ from torch.export import export_for_training, ExportedProgram from torch.nn.attention import SDPBackend from torchao.utils import unwrap_tensor_subclass +from functools import partial FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -99,6 +100,7 @@ def __init__( dynamic_shapes: 
Optional[Any] = None, use_legacy_export: bool = False, save_exported_program: bool = False, + nncf_compression: bool = False ): # Store necessary constructor arguments. self.model = model @@ -120,6 +122,7 @@ def __init__( self.dynamic_shapes = dynamic_shapes self.use_legacy_export = use_legacy_export self.save_exported_program = save_exported_program + self.nncf_compression = nncf_compression # Note: treat this as the source of truth for the result of # torch.export'ing a model. If the overall ExportedProgram is needed, @@ -409,6 +412,36 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self + elif (self.nncf_compression): + print("DEBUG - executorch - builder - quantize - A") + tokenizer = get_tokenizer(self.tokenizer_path) + + def transform_fn( + prompts: str, tokenizer + ): + tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + logging.error(tokenized_text) + + inputs = () + inputs = ( + torch.tensor(tokenized_text).unsqueeze(0), + {"input_pos": torch.tensor([0])}, + ) + + return inputs + + self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data + self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data + logging.error(self.calibration_data) + self.pre_autograd_graph_module = nncf.compress_weights( + self.pre_autograd_graph_module, + dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) + print("DEBUG - executorch - builder - quantize - B") + return self else: logging.info("No quantizer provided, passing...") return self @@ -417,6 +450,7 @@ def export_to_edge(self) -> "LLMEdgeManager": """ Export the model to Edge 
dialect and retrieve a LLMEdgeManager. """ + print("DEBUG - executorch - builder - export_to_edge - A") dynamic_shape = self._get_dynamic_shape() edge_config = self._get_edge_config() @@ -436,6 +470,8 @@ def export_to_edge(self) -> "LLMEdgeManager": ) with override_export_behaviour: + if (self.nncf_compression): + from executorch.backends.openvino.utils import export_to_edge self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, @@ -445,6 +481,7 @@ def export_to_edge(self) -> "LLMEdgeManager": edge_compile_config=edge_config, verbose=self.verbose, ) + print("DEBUG - executorch - builder - export_to_edge - B") return self def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager": diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 20604bbf635..ade3bec094f 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -64,6 +64,26 @@ def get_mps_partitioner(use_kv_cache: bool = False): return MPSPartitioner(compile_specs) # pyre-fixme[16] +def get_openvino_partitioner(use_kv_cache: bool = False): + from executorch.exir.backend.backend_details import CompileSpec + + assert ( + use_kv_cache is True + ), "OpenVINO backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" + try: + # pyre-ignore Undefined import [21]: Could not find a module corresponding to import `executorch.backends.openvino.partitioner`. 
+ from executorch.backends.openvino.partitioner import ( + OpenvinoPartitioner, + ) + except ImportError: + raise ImportError( + "Please install the OpenVINO backend following backends/openvino/README.md" + ) + + compile_specs = [CompileSpec("device", "CPU".encode())] + return OpenvinoPartitioner(compile_specs) # pyre-fixme[16] + + def get_coreml_partitioner( ios: int = 15, embedding_quantize: Optional[str] = None, diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index aa5776163a9..baf91f3fd1a 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -68,6 +68,7 @@ set(lib_list mpsdelegate neuron_backend qnn_executorch_backend + openvino_backend portable_ops_lib custom_ops extension_module