From dd6c095fd64fdc7a1c482a1c72c7e31ef99adaf7 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 7 May 2025 19:00:37 -0700 Subject: [PATCH 01/13] ov llama support test updates --- backends/openvino/CMakeLists.txt | 18 ++++ backends/openvino/partitioner.py | 40 ++++++- backends/openvino/preprocess.py | 3 +- backends/openvino/runtime/OpenvinoBackend.cpp | 101 ++++++++++++++++-- examples/models/llama/CMakeLists.txt | 8 ++ examples/models/llama/export_llama_lib.py | 6 ++ extension/llm/export/partitioner_lib.py | 20 ++++ tools/cmake/executorch-config.cmake | 1 + 8 files changed, 180 insertions(+), 17 deletions(-) diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index 8d07cd9a366..6338ea8891e 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -70,6 +70,24 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) endif() +if(EXECUTORCH_BUILD_OPENVINO_NANOGPT_RUNNER) + # Build executor runner binary for openvino backend + list(APPEND openvino_nanogpt_runner_libs openvino_backend executorch extension_module_static extension_tensor) + + set(_openvino_nanogpt_runner__srcs + ${EXECUTORCH_ROOT}/examples/llm_manual/main.cpp + ) + add_executable(openvino_nanogpt_runner ${_openvino_nanogpt_runner__srcs}) + + list(APPEND openvino_nanogpt_runner_libs) + + target_link_libraries( + openvino_nanogpt_runner gflags portable_ops_lib ${openvino_nanogpt_runner_libs} + ) + target_compile_options(openvino_nanogpt_runner PUBLIC ${_common_compile_options}) +endif() + + # Install OpenVINO backend library to the lib directory install(TARGETS openvino_backend DESTINATION lib) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index bc3fde573e2..8f2c5dcb846 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -62,6 +62,13 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: op_type = node.target.__name__ else: op_type = str(node.target) + + if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip: + print( + f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped." + ) + return False + supported_ops = OperatorSupport(options)._support_dict if op_type == "getitem": return True @@ -71,11 +78,6 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: else: print("Op not supported: ", "torch.ops." + str(op_type)) - if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip: - print( - f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped." - ) - return False return False @@ -127,15 +129,43 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: :param exported_program: The exported program. :return: A PartitionResult containing the partitioned graph and delegation tags. """ + + self._op_names_to_skip = set() + print("DEBUG - OpenvinoPartitioner - graph") + #print(exported_program.graph_module.code) + for node in exported_program.graph_module.graph.nodes: + if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor": + #if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor" and str(node.name).strip() == "aten_slice_copy_tensor_6": + print("\tDEBUG - OpenvinoPartitioner - slice_copy - op: ", node.op, ", target: ", node.target.__name__, ", name: ", node.name) + if not (len(node.all_input_nodes) == 3): + continue + slice_copy_in0 = node.all_input_nodes[0] + if not (str(slice_copy_in0.op).strip() == "placeholder"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in0 - op: ", slice_copy_in0.op, ", target: ", slice_copy_in0.target, ", name: ", slice_copy_in0.name) + slice_copy_in1 = node.all_input_nodes[1] + if not (str(slice_copy_in1.op).strip() == "call_function" and str(slice_copy_in1.target.__name__).strip() == "_local_scalar_dense.default"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in1 - op: ", slice_copy_in1.op, ", target: ", slice_copy_in1.target.__name__, ", name: ", slice_copy_in1.name) + slice_copy_in2 = node.all_input_nodes[2] + if not (str(slice_copy_in2.op).strip() == "call_function" and str(slice_copy_in2.target.__name__).strip() == "add"): + continue + print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in2 - op: ", slice_copy_in2.op, ", target: ", slice_copy_in2.target.__name__, ", name: ", slice_copy_in2.name) + #for input_node in node.all_input_nodes: + # print("\tDEBUG - OpenvinoPartitioner - input_node - op: ", input_node.op, ", target: ", input_node.target, ", name: ", input_node.name) + self._op_names_to_skip.add(node.name) + partitioner = CapabilityBasedPartitioner( exported_program.graph_module, OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), allows_single_node_partition=True, ) partition_list = partitioner.propose_partitions() + print("\tDEBUG - part - size: ", partition.size()) partition_tags = {} for partition in partition_list: + print("\tDEBUG - part - size: ", partition.size()) for node in partition.nodes: tag = f"tag{partition.id}" node.meta["delegation_tag"] = tag diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index c343f44a8b5..2775e3eed89 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -51,4 +51,5 @@ def preprocess( ) model_bytes = compiled.export_model() - return PreprocessResult(processed_bytes=model_bytes.getvalue()) + #return PreprocessResult(processed_bytes=model_bytes.getvalue()) + return PreprocessResult(processed_bytes=model_bytes) diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index a3134f72b4b..cd50d69f5af 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -102,22 +102,94 @@ exr::Error OpenvinoBackend::execute( size_t num_outputs = infer_request->get_compiled_model().outputs().size(); // Set inputs + std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl; for (size_t i = 0; i < num_inputs; i++) { - auto input_tensor = args[i]->toTensor(); - ov::Shape input_shape( - input_tensor.sizes().begin(), input_tensor.sizes().end()); - - // Convert input tensor to OpenVINO tensor - ov::element::Type ov_type = - convert_to_openvino_type(input_tensor.scalar_type()); - ov::Tensor ov_input_tensor( - ov_type, input_shape, input_tensor.mutable_data_ptr()); + std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl; + + if (args[i]->isNone()) { + std::cout << "DEBUG - Module - forward - A - type: none" << std::endl; + } else if (args[i]->isInt()) { + std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl; + } else if (args[i]->isDouble()) { + std::cout << "DEBUG - Module - forward - A - type: double" << std::endl; + } else if (args[i]->isBool()) { + std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl; + } else if (args[i]->isScalar()) { + std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl; + } else if (args[i]->isTensor()) { + std::cout << "DEBUG - Module - forward - A - type: tensor, shape: ["; + for (int j=0; jtoTensor().dim(); j++) { + std::cout << args[i]->toTensor().size(j) << ", "; + } + std::cout << "]" << std::endl; + } else if (args[i]->isString()) { + std::cout << "DEBUG - Module - forward - A - type: string" << std::endl; + } else if (args[i]->isIntList()) { + std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl; + } else if (args[i]->isBoolList()) { + std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl; + } else if (args[i]->isDoubleList()) { + std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl; + } else if (args[i]->isTensorList()) { + std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl; + } else if (args[i]->isListOptionalTensor()) { + std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl; + } else { + std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl; + } - infer_request->set_input_tensor(i, ov_input_tensor); + if (args[i]->isInt()) { + //std::cout << "DEBUG - OpenvinoBackend - input - B.1" << std::endl; + //auto input_tensor = args[i]->toInt(); + //std::cout << "DEBUG - OpenvinoBackend - input - B.2" << std::endl; + //ov::Shape input_shape( + // input_tensor.sizes().begin(), input_tensor.sizes().end()); + + //std::cout << "DEBUG - OpenvinoBackend - input - B.3" << std::endl; + // Convert input tensor to OpenVINO tensor + //std::cout << "DEBUG - OpenvinoBackend - input - B.4" << std::endl; + //int64_t val = args[i]->toInt(); + //int64_t val = i; + int64_t *val = &(args[i]->payload.copyable_union.as_int); + //std::cout << "DEBUG - OpenvinoBackend - input - B.5 - val: " << val << std::endl; + //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{}, &val); + //std::vector val = {args[i]->toInt()}; + //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, &val); + ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); + std::cout << "DEBUG - OpenvinoBackend - input - B.6 - val: " << ((int64_t*)(ov_input_tensor.data()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl; + + infer_request->set_input_tensor(i, ov_input_tensor); + //std::cout << "DEBUG - OpenvinoBackend - input - B.7" << std::endl; + } else { + //std::cout << "DEBUG - OpenvinoBackend - input - C.1" << std::endl; + auto input_tensor = args[i]->toTensor(); + //std::cout << "DEBUG - OpenvinoBackend - input - C.2" << std::endl; + ov::Shape input_shape( + input_tensor.sizes().begin(), input_tensor.sizes().end()); + + //std::cout << "DEBUG - OpenvinoBackend - input - C.3" << std::endl; + // Convert input tensor to OpenVINO tensor + ov::element::Type ov_type = + convert_to_openvino_type(input_tensor.scalar_type()); + //std::cout << "DEBUG - OpenvinoBackend - input - C.4" << std::endl; + ov::Tensor ov_input_tensor( + ov_type, input_shape, input_tensor.mutable_data_ptr()); + //std::cout << "DEBUG - OpenvinoBackend - input - C.5" << std::endl; + + infer_request->set_input_tensor(i, ov_input_tensor); + //std::cout << "DEBUG - OpenvinoBackend - input - C.6" << std::endl; + } } // Set outputs + std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl; for (size_t i = 0; i < num_outputs; i++) { + //args[num_inputs + i]->toTensor().unsafeGetTensorImpl()->set_size(1,1); + std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: ["; + for (int j=0; jtoTensor().dim(); j++) { + std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; + } + std::cout << "]" << std::endl; auto output_tensor = args[num_inputs + i]->toTensor(); ov::Shape output_shape( output_tensor.sizes().begin(), output_tensor.sizes().end()); @@ -133,7 +205,14 @@ exr::Error OpenvinoBackend::execute( // Execute the inference infer_request->infer(); - + //auto out_t = infer_request->get_output_tensor(0); + //std::cout << "DEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << std::endl; + //for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; + //} + //std::cout << "]" << std::endl; + + //std::cout << "DEBUG - OpenvinoBackend - DD" << std::endl; return exr::Error::Ok; } diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 4ea735e5717..b832cbbccf6 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -173,6 +173,14 @@ if(TARGET qnn_executorch_backend) target_link_options_shared_lib(qnn_executorch_backend) endif() +# Openvino backend +if(TARGET openvino_backend) + find_package(OpenVINO REQUIRED) + target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core) + list(APPEND link_libraries openvino_backend) + target_link_options_shared_lib(openvino_backend) +endif() + # MPS backend if(TARGET mpsdelegate) list( diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 0e48a8520d7..05d03ea5621 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -38,6 +38,7 @@ from executorch.extension.llm.export.partitioner_lib import ( get_coreml_partitioner, get_mps_partitioner, + get_openvino_partitioner, get_qnn_partitioner, get_vulkan_partitioner, get_xnnpack_partitioner, @@ -414,6 +415,7 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True", ) + parser.add_argument("--openvino", action="store_true") parser.add_argument( "--expand_rope_table", @@ -814,6 +816,10 @@ def _to_edge_and_lower_llama( # noqa: C901 partitioners.append(get_mps_partitioner(args.use_kv_cache)) modelname = f"mps_{modelname}" + if args.openvino: + partitioners.append(get_openvino_partitioner(args.use_kv_cache)) + modelname = f"openvino_{modelname}" + if args.coreml: coreml_partitioner = get_coreml_partitioner( args.coreml_ios, diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 20604bbf635..ade3bec094f 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -64,6 +64,26 @@ def get_mps_partitioner(use_kv_cache: bool = False): return MPSPartitioner(compile_specs) # pyre-fixme[16] +def get_openvino_partitioner(use_kv_cache: bool = False): + from executorch.exir.backend.backend_details import CompileSpec + + assert ( + use_kv_cache is True + ), "MPS backend currently only supports static shape and use_kv_cache=True is the only way to support it at the moment" + try: + # pyre-ignore Undefined import [21]: Could not find a module corresponding to import `executorch.backends.apple.mps.partition.mps_partitioner`. + from executorch.backends.openvino.partitioner import ( + OpenvinoPartitioner, + ) + except ImportError: + raise ImportError( + "Please install the MPS backend follwing https://pytorch.org/executorch/main/build-run-mps.html" + ) + + compile_specs = [CompileSpec("device", "CPU".encode())] + return OpenvinoPartitioner(compile_specs) # pyre-fixme[16] + + def get_coreml_partitioner( ios: int = 15, embedding_quantize: Optional[str] = None, diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index a8e756fbb77..d0c21365006 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -74,6 +74,7 @@ set(lib_list mpsdelegate neuron_backend qnn_executorch_backend + openvino_backend portable_ops_lib custom_ops extension_module From 36a7900772c1ff2ee1aed2cef295589b13cdb9d5 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 8 May 2025 11:31:01 -0700 Subject: [PATCH 02/13] partitioning debug print fix --- backends/openvino/partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 8f2c5dcb846..0cb7e47bd3c 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -161,7 +161,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: allows_single_node_partition=True, ) partition_list = partitioner.propose_partitions() - print("\tDEBUG - part - size: ", partition.size()) + print("DEBUG - num_parts: ", len(partition_list)) partition_tags = {} for partition in partition_list: From bb507924ffa63af680534549c1737805116b05f5 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 16:10:04 +0400 Subject: [PATCH 03/13] init --- examples/models/llama/export_llama_lib.py | 8 ++++++ extension/llm/export/builder.py | 35 ++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 05d03ea5621..ae4ab97258e 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -521,6 +521,13 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="If true, stops right after torch.export() and saves the exported model.", ) + + parser.add_argument( + "--nncf_compression", + default=False, + action="store_true", + help="If true, stops right after torch.export() and saves the exported model.", + ) return parser @@ -1138,6 +1145,7 @@ def _load_llama_model( use_legacy_export=args.qnn, save_exported_program=args.export_only, verbose=verbose, + nncf_compression=args.nncf_compression, metadata=_load_llama_model_metadata( weight_type, use_kv_cache, diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 323311caeea..be0fc6824b3 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -15,7 +15,7 @@ from enum import Enum from typing import Any, Callable, Dict, List, Optional, Tuple from unittest.mock import patch - +import nncf import torch from executorch.backends.transforms.duplicate_dynamic_quant_chain import ( DuplicateDynamicQuantChainPass, @@ -41,6 +41,7 @@ from torch.export import export_for_training, ExportedProgram from torch.nn.attention import SDPBackend from torchao.utils import unwrap_tensor_subclass +from functools import partial FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -99,6 +100,7 @@ def __init__( dynamic_shapes: Optional[Any] = None, use_legacy_export: bool = False, save_exported_program: bool = False, + nncf_compression: bool = False ): # Store necessary constructor arguments. self.model = model @@ -120,6 +122,7 @@ def __init__( self.dynamic_shapes = dynamic_shapes self.use_legacy_export = use_legacy_export self.save_exported_program = save_exported_program + self.nncf_compression = nncf_compression # Note: treat this as the source of truth for the result of # torch.export'ing a model. If the overall ExportedProgram is needed, @@ -409,6 +412,36 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self + if(self.nncf_compression): + tokenizer = get_tokenizer(self.tokenizer_path) + def transform_fn( + module: torch.fx.GraphModule, tokenizer, prompts: str + ): + # TODO: change criteria & support batch inputs if necessary + pos = torch.tensor(0, dtype=torch.int64) + token_list = tokenizer.encode(prompts, bos=True, eos=False) + + with torch.no_grad(): + while token_list[-1] != tokenizer.eos_id: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + pos += 1 + if pos >= len(token_list): + if self.generate_full_logits: + token_list.append( + torch.argmax(logits[:, -1], dim=-1).item() + ) + else: + token_list.append(torch.argmax(logits[:], dim=-1).item()) + self.pre_autograd_graph_module = nncf.compress_weights( + self.pre_autograd_graph_module, + # dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)), + mode=nncf.CompressWeightsMode.INT4_SYM, + # ratio=0.8, + # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ) else: logging.info("No quantizer provided, passing...") return self From 6925c5e8d1860953006f319e9f30be93bc24f767 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 16:28:22 +0400 Subject: [PATCH 04/13] small fix --- extension/llm/export/builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index be0fc6824b3..076db8ef6e5 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -442,6 +442,7 @@ def transform_fn( # ratio=0.8, # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) + return self else: logging.info("No quantizer provided, passing...") return self From 5e23cb9f37f156bc7067360edc72b6d81342bab4 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 16:45:54 +0400 Subject: [PATCH 05/13] minor fix --- extension/llm/export/builder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 076db8ef6e5..dc31efba4f7 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -412,7 +412,7 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self - if(self.nncf_compression): + elif(self.nncf_compression): tokenizer = get_tokenizer(self.tokenizer_path) def transform_fn( module: torch.fx.GraphModule, tokenizer, prompts: str @@ -437,10 +437,10 @@ def transform_fn( token_list.append(torch.argmax(logits[:], dim=-1).item()) self.pre_autograd_graph_module = nncf.compress_weights( self.pre_autograd_graph_module, - # dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)), + dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)), mode=nncf.CompressWeightsMode.INT4_SYM, - # ratio=0.8, - # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, + ratio=0.8, + sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) return self else: From e04a901c77c7bfcbd06653c27cd3d5d9c4f39b27 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 19:56:42 +0400 Subject: [PATCH 06/13] add data aware wc --- extension/llm/export/builder.py | 36 ++++++++++++++------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index dc31efba4f7..0e26145dff8 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -412,32 +412,26 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage DuplicateDynamicQuantChainPass()(m) self.pre_autograd_graph_module = m return self - elif(self.nncf_compression): + elif (self.nncf_compression): tokenizer = get_tokenizer(self.tokenizer_path) + def transform_fn( - module: torch.fx.GraphModule, tokenizer, prompts: str + prompts: str, tokenizer ): - # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) - token_list = tokenizer.encode(prompts, bos=True, eos=False) - - with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id: - logits = module( - torch.full((1, 1), token_list[pos]), - {"input_pos": torch.tensor((pos,))}, - ) - pos += 1 - if pos >= len(token_list): - if self.generate_full_logits: - token_list.append( - torch.argmax(logits[:, -1], dim=-1).item() - ) - else: - token_list.append(torch.argmax(logits[:], dim=-1).item()) + tokenized_text = tokenizer.encode(prompts, bos=False, eos=False) + logging.error(tokenized_text) + + inputs = () + inputs = ( + torch.tensor(tokenized_text).unsqueeze(0), + {"input_pos": torch.tensor([0])}, + ) + + return inputs + self.pre_autograd_graph_module = nncf.compress_weights( self.pre_autograd_graph_module, - dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)), + dataset=nncf.Dataset([self.calibration_data], transform_func=partial(transform_fn, tokenizer=tokenizer)), mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, From fb5750ef03ad76dbb6a703c1f1ce094fb0b2f0d6 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 19:59:15 +0400 Subject: [PATCH 07/13] minor fix --- extension/llm/export/builder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 0e26145dff8..11cc0d0e749 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -428,10 +428,11 @@ def transform_fn( ) return inputs - + + self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data self.pre_autograd_graph_module = nncf.compress_weights( self.pre_autograd_graph_module, - dataset=nncf.Dataset([self.calibration_data], transform_func=partial(transform_fn, tokenizer=tokenizer)), + dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8, sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, From ea9eeb888f4bde33a46eb8181237e8ddf7ba948b Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 22:43:15 +0400 Subject: [PATCH 08/13] add quantization support for disable_dynamic_shapes --- extension/llm/export/builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 11cc0d0e749..be2b50a2339 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -430,6 +430,7 @@ def transform_fn( return inputs self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data + self.calibration_data = ([word for prompt in self.calibration_data for word in prompt.split()] if self.dynamic_shapes else self.calibration_data) self.pre_autograd_graph_module = nncf.compress_weights( self.pre_autograd_graph_module, dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), From 770569dcfa31f733f15cd26189bf56a135d5147c Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 19 May 2025 22:51:55 +0400 Subject: [PATCH 09/13] minor fix --- extension/llm/export/builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index be2b50a2339..8da1eab844b 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -430,7 +430,8 @@ def transform_fn( return inputs self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data - self.calibration_data = ([word for prompt in self.calibration_data for word in prompt.split()] if self.dynamic_shapes else self.calibration_data) + self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data + logging.error(self.calibration_data) self.pre_autograd_graph_module = nncf.compress_weights( self.pre_autograd_graph_module, dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)), From 9916cee4d826ca82737b66fe4904f20f949848ff Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 20 May 2025 11:49:26 +0400 Subject: [PATCH 10/13] partitioner update --- backends/openvino/partitioner.py | 50 +++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 0cb7e47bd3c..b7032634780 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -26,6 +26,12 @@ from torch.fx.passes.operator_support import OperatorSupportBase +class PatternNode: + op_types = {} + + def __init__(self): + self.op_types = {} + class OpenvinoOperatorsSupport(OperatorSupportBase): def __init__( @@ -121,6 +127,47 @@ def ops_to_not_decompose( torch.ops.aten.upsample_nearest2d.vec, ] return (ops_not_decompose, None) + + def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool: + if node.op == "call_function": + if ("call_function" + ":" + str(node.target)) in pattern.op_types: + pt_input_nodes = node.all_input_nodes + pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target)] + if pattern_input_ops is None: + enabled_ops.append(node) + return True + if len(pt_input_nodes) != len(pattern_input_ops): + return False + for i in range(len(pt_input_nodes)): + if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops): + return False + enabled_ops.append(node) + return True + elif node.op == "get_attr": + if "get_attr" in pattern.op_types: + return True + else: + return False + return False + + def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): + const_node = PatternNode + const_node.op_types["get_attr"] = None + bitwise_right_shift_node = PatternNode + bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] + bitwise_and_node = PatternNode + bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node] + stack_node = PatternNode + stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + + for node in graph_module.graph.nodes: + if str(node.op) == "call_function" and str(node.target) == "aten.stack.default": + enabled_ops = [] + pattern_match = self.check_pattern(node, bitwise_and_node, enabled_ops) + if pattern_match: + for pattern_op in enabled_ops: + print(pattern_op.name) + self._op_names_to_skip.add(pattern_op.name) def partition(self, exported_program: ExportedProgram) -> PartitionResult: """ @@ -154,7 +201,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: #for input_node in node.all_input_nodes: # print("\tDEBUG - OpenvinoPartitioner - input_node - op: ", input_node.op, ", target: ", input_node.target, ", name: ", input_node.name) self._op_names_to_skip.add(node.name) - + + self.capture_nncf_patterns(exported_program.graph_module) partitioner = CapabilityBasedPartitioner( exported_program.graph_module, OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip), From 0c20955e437bf3bcf3f3ae19aac1417262b4f506 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 20 May 2025 13:33:24 +0400 Subject: [PATCH 11/13] update for latest --- examples/models/llama/export_llama_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index f9935fd324f..9cd906ad2f3 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -860,6 +860,7 @@ def _to_edge_and_lower_llama( # noqa: C901 mps: bool = False, coreml: bool = False, qnn: bool = False, + openvino: bool = False, dtype_override: str = "fp32", enable_dynamic_shape: bool = True, use_kv_cache: bool = False, @@ -1076,6 +1077,7 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901 mps=args.mps, coreml=args.coreml, qnn=args.qnn, + openvino=args.openvino, dtype_override=args.dtype_override, enable_dynamic_shape=args.enable_dynamic_shape, use_kv_cache=args.use_kv_cache, From d3730eaf17c57458ca358076fa95ead44b5e8d79 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Wed, 4 Jun 2025 19:32:48 -0700 Subject: [PATCH 12/13] quant and fp16 temp fix --- backends/openvino/partitioner.py | 32 +++- backends/openvino/preprocess.py | 4 +- backends/openvino/requirements.txt | 2 +- backends/openvino/runtime/OpenvinoBackend.cpp | 145 ++++++++++++----- backends/openvino/utils.py | 150 ++++++++++++++++++ extension/llm/export/builder.py | 6 + 6 files changed, 290 insertions(+), 49 deletions(-) create mode 100644 backends/openvino/utils.py diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index b7032634780..b64ebb0a7b2 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -73,7 +73,7 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool: print( f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped." ) - return False + return True supported_ops = OperatorSupport(options)._support_dict if op_type == "getitem": @@ -129,30 +129,48 @@ def ops_to_not_decompose( return (ops_not_decompose, None) def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.0 - op: ", node.op) if node.op == "call_function": - if ("call_function" + ":" + str(node.target)) in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.1") + if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - B - target: ", node.target.__name__) pt_input_nodes = node.all_input_nodes - pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target)] + pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)] if pattern_input_ops is None: enabled_ops.append(node) + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.1") return True if len(pt_input_nodes) != len(pattern_input_ops): + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.2") return False for i in range(len(pt_input_nodes)): if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops): + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.3") return False enabled_ops.append(node) + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.4") return True elif node.op == "get_attr": if "get_attr" in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2") + return True + else: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3") + return False + elif node.op == "placeholder": + if "placeholder" in pattern.op_types: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2") return True else: + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3") return False + print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.4") return False def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): const_node = PatternNode const_node.op_types["get_attr"] = None + const_node.op_types["placeholder"] = None bitwise_right_shift_node = PatternNode bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node] bitwise_and_node = PatternNode @@ -160,11 +178,15 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule): stack_node = PatternNode stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node] + print("DEBUG - capture_nncf_patterns - A") for node in graph_module.graph.nodes: - if str(node.op) == "call_function" and str(node.target) == "aten.stack.default": + print("\tDEBUG - capture_nncf_patterns - B - op: ", node.op, ", target: ", node.target) + if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default": + print("\tDEBUG - capture_nncf_patterns - C - stack found") enabled_ops = [] - pattern_match = self.check_pattern(node, bitwise_and_node, enabled_ops) + pattern_match = self.check_pattern(node, stack_node, enabled_ops) if pattern_match: + print("\tDEBUG - capture_nncf_patterns - D - match") for pattern_op in enabled_ops: print(pattern_op.name) self._op_names_to_skip.add(pattern_op.name) diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 2775e3eed89..665921f50e7 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -51,5 +51,5 @@ def preprocess( ) model_bytes = compiled.export_model() - #return PreprocessResult(processed_bytes=model_bytes.getvalue()) - return PreprocessResult(processed_bytes=model_bytes) + return PreprocessResult(processed_bytes=model_bytes.getvalue()) + #return PreprocessResult(processed_bytes=model_bytes) diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt index 316633e9004..ccb2aa91430 100644 --- a/backends/openvino/requirements.txt +++ b/backends/openvino/requirements.txt @@ -1,2 +1,2 @@ transformers -git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf +git+https://github.com/openvinotoolkit/nncf@develop#egg=nncf diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp index cd50d69f5af..20e308e6ef7 100644 --- a/backends/openvino/runtime/OpenvinoBackend.cpp +++ b/backends/openvino/runtime/OpenvinoBackend.cpp @@ -23,6 +23,36 @@ namespace executorch { namespace backends { namespace openvino { + +std::string scalarTypeToString(exa::ScalarType type) { + switch (type) { + case exa::ScalarType::Byte: return "Byte"; + case exa::ScalarType::Char: return "Char"; + case exa::ScalarType::Short: return "Short"; + case exa::ScalarType::Int: return "Int"; + case exa::ScalarType::Long: return "Long"; + case exa::ScalarType::Half: return "Half"; + case exa::ScalarType::Float: return "Float"; + case exa::ScalarType::Double: return "Double"; + case exa::ScalarType::Bool: return "Bool"; + case exa::ScalarType::BFloat16: return "BFloat16"; + case exa::ScalarType::ComplexHalf: return "ComplexHalf"; + case exa::ScalarType::ComplexFloat: return "ComplexFloat"; + case exa::ScalarType::ComplexDouble: return "ComplexDouble"; + case exa::ScalarType::QUInt8: return "QUInt8"; + case exa::ScalarType::QInt8: return "QInt8"; + case exa::ScalarType::QInt32: return "QInt32"; + case exa::ScalarType::QUInt4x2: return "QUInt4x2"; + case exa::ScalarType::QUInt2x4: return "QUInt2x4"; + case exa::ScalarType::Undefined: return "Undefined"; + case exa::ScalarType::NumOptions: return "NumOptions"; + default: + throw std::invalid_argument("Unknown ScalarType"); + } +} + + + OpenvinoBackend::OpenvinoBackend() {} bool OpenvinoBackend::is_available() const { @@ -71,7 +101,9 @@ exr::Result OpenvinoBackend::init( } // Import the model + //std::cout << "DEBUG - before import" << std::endl; auto compiled_model = core.import_model(compiled_stream, device); + //std::cout << "DEBUG - after import" << std::endl; // The processed data can be freed since the model is compiled processed->Free(); @@ -102,41 +134,41 @@ exr::Error OpenvinoBackend::execute( size_t num_outputs = infer_request->get_compiled_model().outputs().size(); // Set inputs - std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl; + //std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl; for (size_t i = 0; i < num_inputs; i++) { - std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl; - - if (args[i]->isNone()) { - std::cout << "DEBUG - Module - forward - A - type: none" << std::endl; - } else if (args[i]->isInt()) { - std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl; - } else if (args[i]->isDouble()) { - std::cout << "DEBUG - Module - forward - A - type: double" << std::endl; - } else if (args[i]->isBool()) { - std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl; - } else if (args[i]->isScalar()) { - std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl; - } else if (args[i]->isTensor()) { - std::cout << "DEBUG - Module - forward - A - type: tensor, shape: ["; - for (int j=0; jtoTensor().dim(); j++) { - std::cout << args[i]->toTensor().size(j) << ", "; - } - std::cout << "]" << std::endl; - } else if (args[i]->isString()) { - std::cout << "DEBUG - Module - forward - A - type: string" << std::endl; - } else if (args[i]->isIntList()) { - std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl; - } else if (args[i]->isBoolList()) { - std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl; - } else if (args[i]->isDoubleList()) { - std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl; - } else if (args[i]->isTensorList()) { - std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl; - } else if (args[i]->isListOptionalTensor()) { - std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl; - } else { - std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl; - } + //std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl; + + //if (args[i]->isNone()) { + // std::cout << "DEBUG - Module - forward - A - type: none" << std::endl; + //} else if (args[i]->isInt()) { + // std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl; + //} else if (args[i]->isDouble()) { + // std::cout << "DEBUG - Module - forward - A - type: double" << std::endl; + //} else if (args[i]->isBool()) { + // std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl; + //} else if (args[i]->isScalar()) { + // std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl; + //} else if (args[i]->isTensor()) { + // std::cout << "DEBUG - Module - forward - A - type: tensor, shape: ["; + // for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[i]->toTensor().size(j) << ", "; + // } + // std::cout << "]" << std::endl; + //} else if (args[i]->isString()) { + // std::cout << "DEBUG - Module - forward - A - type: string" << std::endl; + //} else if (args[i]->isIntList()) { + // std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl; + //} else if (args[i]->isBoolList()) { + // std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl; + //} else if (args[i]->isDoubleList()) { + // std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl; + //} else if (args[i]->isTensorList()) { + // std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl; + //} else if (args[i]->isListOptionalTensor()) { + // std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl; + //} else { + // std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl; + //} if (args[i]->isInt()) { //std::cout << "DEBUG - OpenvinoBackend - input - B.1" << std::endl; @@ -156,7 +188,7 @@ exr::Error OpenvinoBackend::execute( //std::vector val = {args[i]->toInt()}; //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, &val); ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val); - std::cout << "DEBUG - OpenvinoBackend - input - B.6 - val: " << ((int64_t*)(ov_input_tensor.data()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl; + //std::cout << "\tDEBUG - OpenvinoBackend - input - int - val: " << ((int64_t*)(ov_input_tensor.data()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl; infer_request->set_input_tensor(i, ov_input_tensor); //std::cout << "DEBUG - OpenvinoBackend - input - B.7" << std::endl; @@ -178,18 +210,35 @@ exr::Error OpenvinoBackend::execute( infer_request->set_input_tensor(i, ov_input_tensor); //std::cout << "DEBUG - OpenvinoBackend - input - C.6" << std::endl; + + //if (ov_type == ov::element::i64) { + // int64_t sum = 0; + // auto data_ptr = ov_input_tensor.data(); + // for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(int64_t); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} else { + // float sum = 0; + // auto data_ptr = ov_input_tensor.data(); + // for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(float); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} + //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << std::endl; } } // Set outputs - std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl; + //std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl; for (size_t i = 0; i < num_outputs; i++) { //args[num_inputs + i]->toTensor().unsafeGetTensorImpl()->set_size(1,1); - std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: ["; - for (int j=0; jtoTensor().dim(); j++) { - std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; - } - std::cout << "]" << std::endl; + //std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: ["; + //for (int j=0; jtoTensor().dim(); j++) { + // std::cout << args[num_inputs + i]->toTensor().size(j) << ", "; + //} + //std::cout << "]" << std::endl; auto output_tensor = args[num_inputs + i]->toTensor(); ov::Shape output_shape( output_tensor.sizes().begin(), output_tensor.sizes().end()); @@ -205,6 +254,15 @@ exr::Error OpenvinoBackend::execute( // Execute the inference infer_request->infer(); + //for (size_t i = 0; i < num_outputs; i++) { + // auto out_t = infer_request->get_output_tensor(i); + // float sum = 0; + // auto data_ptr = out_t.data(); + // for (size_t j=0; j < out_t.get_byte_size()/sizeof(float); j++) { + // sum += data_ptr[j]; + // } + // //std::cout << "\tDEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << ", type: " << out_t.get_element_type() << ", sum_of_values: " << sum << std::endl; + //} //auto out_t = infer_request->get_output_tensor(0); //std::cout << "DEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << std::endl; //for (int j=0; jtoTensor().dim(); j++) { @@ -241,13 +299,18 @@ void OpenvinoBackend::destroy(exr::DelegateHandle* handle) const { ov::element::Type OpenvinoBackend::convert_to_openvino_type( exa::ScalarType scalar_type) const { + //std::cout << "DEBUG - scalar_type: " << scalarTypeToString(scalar_type) << std::endl; switch (scalar_type) { case exa::ScalarType::Float: return ov::element::f32; + case exa::ScalarType::Half: + return ov::element::f16; case exa::ScalarType::Int: return ov::element::i32; case exa::ScalarType::Char: return ov::element::i8; + case exa::ScalarType::Byte: + return ov::element::u8; case exa::ScalarType::Long: return ov::element::i64; case exa::ScalarType::Bool: diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py new file mode 100644 index 00000000000..debefdd1a35 --- /dev/null +++ b/backends/openvino/utils.py @@ -0,0 +1,150 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os + +from typing import Any, Dict, Optional, Tuple, Union + +import executorch.exir as exir + +import torch +from executorch.exir import EdgeProgramManager, ExecutorchProgramManager +from executorch.exir.program._program import to_edge_with_preserved_ops +from executorch.exir.tracer import Value +from torch.export import export, export_for_training, ExportedProgram + + +_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig( + _check_ir_validity=True, + _skip_dim_order=True, # TODO(T189114319): Reuse dim order op after solving the ios oss issue +) + + +def _to_core_aten( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + strict=True, + verbose=True, +) -> ExportedProgram: + # post autograd export. eventually this will become .to_core_aten + if not isinstance(model, torch.fx.GraphModule) and not isinstance( + model, torch.nn.Module + ): + raise ValueError( + f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" + ) + core_aten_ep = export( + model, + example_inputs, + example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) + if verbose: + logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") + return core_aten_ep + + +def _core_aten_to_edge( + core_aten_exir_ep: ExportedProgram, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=None, + verbose=True, +) -> EdgeProgramManager: + if not edge_compile_config: + edge_compile_config = exir.EdgeCompileConfig( + _check_ir_validity=False, # quant ops currently break ir verification + ) + edge_manager: EdgeProgramManager = to_edge_with_preserved_ops( + core_aten_exir_ep, + constant_methods=edge_constant_methods, + compile_config=edge_compile_config, + preserve_ops=[torch.ops.aten.stack.default,], + ) + if verbose: + logging.info(f"Exported graph:\n{edge_manager.exported_program()}") + return edge_manager + + +def export_to_edge( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=_EDGE_COMPILE_CONFIG, + strict=True, + verbose=True, +) -> EdgeProgramManager: + print("DEBUG - executorch - openvino_utils") + core_aten_ep = _to_core_aten( + model, + example_inputs, + example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + verbose=verbose, + ) + return _core_aten_to_edge( + core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose + ) + + +def export_to_exec_prog( + model: Union[torch.fx.GraphModule, torch.nn.Module], + example_inputs: Tuple[Value, ...], + *, + example_kwarg_inputs: Optional[Dict[str, Any]] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, + edge_constant_methods: Optional[Dict[str, Any]] = None, + edge_compile_config=_EDGE_COMPILE_CONFIG, + backend_config=None, + strict=True, +) -> ExecutorchProgramManager: + m = model.eval() + # pre-autograd export. eventually this will become torch.export + m = export_for_training(m, example_inputs, strict=True).module() + + core_aten_ep = _to_core_aten( + m, + example_inputs, + example_kwarg_inputs=example_kwarg_inputs, + dynamic_shapes=dynamic_shapes, + strict=strict, + ) + + edge_m = _core_aten_to_edge( + core_aten_ep, edge_constant_methods, edge_compile_config + ) + + exec_prog = edge_m.to_executorch(backend_config) + return exec_prog + + +def save_pte_program( + prog: ExecutorchProgramManager, model_name: str, output_dir: str = "" +) -> str: + if model_name.endswith(".pte"): + filename = model_name + else: + filename = os.path.join(output_dir, f"{model_name}.pte") + + try: + # Write program to file. + with open(filename, "wb") as file: + prog.write_to_file(file) + logging.info(f"Saved exported program to {filename}") + # Write data to file/s. + prog.write_tensor_data_to_file(outdir=output_dir) + except Exception as e: + logging.error(f"Error while saving to {filename}: {e}") + + return filename diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 8da1eab844b..0780ff2cd8b 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -413,6 +413,7 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage self.pre_autograd_graph_module = m return self elif (self.nncf_compression): + print("DEBUG - executorch - builder - quantize - A") tokenizer = get_tokenizer(self.tokenizer_path) def transform_fn( @@ -439,6 +440,7 @@ def transform_fn( ratio=0.8, sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION, ) + print("DEBUG - executorch - builder - quantize - B") return self else: logging.info("No quantizer provided, passing...") @@ -448,6 +450,7 @@ def export_to_edge(self) -> "LLMEdgeManager": """ Export the model to Edge dialect and retrieve a LLMEdgeManager. """ + print("DEBUG - executorch - builder - export_to_edge - A") dynamic_shape = self._get_dynamic_shape() edge_config = self._get_edge_config() @@ -467,6 +470,8 @@ def export_to_edge(self) -> "LLMEdgeManager": ) with override_export_behaviour: + #if (self.nncf_compression): + # from executorch.backends.openvino.utils import export_to_edge self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs, @@ -476,6 +481,7 @@ def export_to_edge(self) -> "LLMEdgeManager": edge_compile_config=edge_config, verbose=self.verbose, ) + print("DEBUG - executorch - builder - export_to_edge - B") return self def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager": From 3fef8fd57a0d980c94e35e417e16a119939f4268 Mon Sep 17 00:00:00 2001 From: Cavus Mustafa Date: Thu, 5 Jun 2025 17:03:31 -0700 Subject: [PATCH 13/13] enable import override for export_to_edge with openvino --- extension/llm/export/builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index 0780ff2cd8b..6339da0b311 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -470,8 +470,8 @@ def export_to_edge(self) -> "LLMEdgeManager": ) with override_export_behaviour: - #if (self.nncf_compression): - # from executorch.backends.openvino.utils import export_to_edge + if (self.nncf_compression): + from executorch.backends.openvino.utils import export_to_edge self.edge_manager = export_to_edge( self.pre_autograd_graph_module, # pyre-fixme[6] self.example_inputs,