From dd6c095fd64fdc7a1c482a1c72c7e31ef99adaf7 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa <mustafa.cavus@intel.com>
Date: Wed, 7 May 2025 19:00:37 -0700
Subject: [PATCH 01/13] ov llama support test updates

---
 backends/openvino/CMakeLists.txt              |  18 ++++
 backends/openvino/partitioner.py              |  40 ++++++-
 backends/openvino/preprocess.py               |   3 +-
 backends/openvino/runtime/OpenvinoBackend.cpp | 101 ++++++++++++++++--
 examples/models/llama/CMakeLists.txt          |   8 ++
 examples/models/llama/export_llama_lib.py     |   6 ++
 extension/llm/export/partitioner_lib.py       |  20 ++++
 tools/cmake/executorch-config.cmake           |   1 +
 8 files changed, 180 insertions(+), 17 deletions(-)

diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt
index 8d07cd9a366..6338ea8891e 100644
--- a/backends/openvino/CMakeLists.txt
+++ b/backends/openvino/CMakeLists.txt
@@ -70,6 +70,24 @@ if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER)
 endif()
 
 
+if(EXECUTORCH_BUILD_OPENVINO_NANOGPT_RUNNER)
+    # Build the nanoGPT runner binary (examples/llm_manual/main.cpp) linked against the openvino backend
+    list(APPEND openvino_nanogpt_runner_libs openvino_backend executorch extension_module_static extension_tensor)
+
+    set(_openvino_nanogpt_runner__srcs
+        ${EXECUTORCH_ROOT}/examples/llm_manual/main.cpp
+        )
+    add_executable(openvino_nanogpt_runner ${_openvino_nanogpt_runner__srcs})
+
+    list(APPEND openvino_nanogpt_runner_libs)  # NOTE(review): no items are appended, so this call is a no-op
+
+    target_link_libraries(
+      openvino_nanogpt_runner gflags portable_ops_lib ${openvino_nanogpt_runner_libs}
+    )
+    target_compile_options(openvino_nanogpt_runner PUBLIC ${_common_compile_options})
+endif()
+
+
 
 # Install OpenVINO backend library to the lib directory
 install(TARGETS openvino_backend DESTINATION lib)
diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index bc3fde573e2..8f2c5dcb846 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -62,6 +62,13 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
             op_type = node.target.__name__
         else:
             op_type = str(node.target)
+
+        if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip:
+            print(
+                f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped."
+            )
+            return False
+
         supported_ops = OperatorSupport(options)._support_dict
         if op_type == "getitem":
             return True
@@ -71,11 +78,6 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
         else:
             print("Op not supported: ", "torch.ops." + str(op_type))
 
-        if op_type in self._op_types_to_skip or node.name in self._op_names_to_skip:
-            print(
-                f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped."
-            )
-            return False
 
         return False
 
@@ -127,15 +129,43 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         :param exported_program: The exported program.
         :return: A PartitionResult containing the partitioned graph and delegation tags.
         """
+
+        self._op_names_to_skip = set()
+        print("DEBUG - OpenvinoPartitioner - graph")
+        #print(exported_program.graph_module.code)
+        for node in exported_program.graph_module.graph.nodes:
+            if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor":
+            #if str(node.op).strip() == "call_function" and str(node.target.__name__).strip() == "aten.slice_copy.Tensor" and str(node.name).strip() == "aten_slice_copy_tensor_6":
+                print("\tDEBUG - OpenvinoPartitioner - slice_copy - op: ", node.op, ", target: ", node.target.__name__, ", name: ", node.name)
+                if not (len(node.all_input_nodes) == 3):
+                    continue
+                slice_copy_in0 = node.all_input_nodes[0]
+                if not (str(slice_copy_in0.op).strip() == "placeholder"):
+                    continue
+                print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in0 - op: ", slice_copy_in0.op, ", target: ", slice_copy_in0.target, ", name: ", slice_copy_in0.name)
+                slice_copy_in1 = node.all_input_nodes[1]
+                if not (str(slice_copy_in1.op).strip() == "call_function" and str(slice_copy_in1.target.__name__).strip() == "_local_scalar_dense.default"):
+                    continue
+                print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in1 - op: ", slice_copy_in1.op, ", target: ", slice_copy_in1.target.__name__, ", name: ", slice_copy_in1.name)
+                slice_copy_in2 = node.all_input_nodes[2]
+                if not (str(slice_copy_in2.op).strip() == "call_function" and str(slice_copy_in2.target.__name__).strip() == "add"):
+                    continue
+                print("\t\tDEBUG - OpenvinoPartitioner - slice_copy_in2 - op: ", slice_copy_in2.op, ", target: ", slice_copy_in2.target.__name__, ", name: ", slice_copy_in2.name)
+                #for input_node in node.all_input_nodes:
+                #    print("\tDEBUG - OpenvinoPartitioner - input_node - op: ", input_node.op, ", target: ", input_node.target, ", name: ", input_node.name)
+                self._op_names_to_skip.add(node.name)
+
         partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
             OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),
             allows_single_node_partition=True,
         )
         partition_list = partitioner.propose_partitions()
+        print("\tDEBUG - part - size: ", partition.size())
 
         partition_tags = {}
         for partition in partition_list:
+            print("\tDEBUG - part - size: ", partition.size())
             for node in partition.nodes:
                 tag = f"tag{partition.id}"
                 node.meta["delegation_tag"] = tag
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index c343f44a8b5..2775e3eed89 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -51,4 +51,5 @@ def preprocess(
         )
         model_bytes = compiled.export_model()
 
-        return PreprocessResult(processed_bytes=model_bytes.getvalue())
+        #return PreprocessResult(processed_bytes=model_bytes.getvalue())
+        return PreprocessResult(processed_bytes=model_bytes)
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index a3134f72b4b..cd50d69f5af 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -102,22 +102,94 @@ exr::Error OpenvinoBackend::execute(
   size_t num_outputs = infer_request->get_compiled_model().outputs().size();
 
   // Set inputs
+  std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl;
   for (size_t i = 0; i < num_inputs; i++) {
-    auto input_tensor = args[i]->toTensor();
-    ov::Shape input_shape(
-        input_tensor.sizes().begin(), input_tensor.sizes().end());
-
-    // Convert input tensor to OpenVINO tensor
-    ov::element::Type ov_type =
-        convert_to_openvino_type(input_tensor.scalar_type());
-    ov::Tensor ov_input_tensor(
-        ov_type, input_shape, input_tensor.mutable_data_ptr());
+    std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl;
+
+    if (args[i]->isNone()) {
+        std::cout << "DEBUG - Module - forward - A - type: none" << std::endl;
+    } else if (args[i]->isInt()) {
+        std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl;
+    } else if (args[i]->isDouble()) {
+        std::cout << "DEBUG - Module - forward - A - type: double" << std::endl;
+    } else if (args[i]->isBool()) {
+        std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl;
+    } else if (args[i]->isScalar()) {
+        std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl;
+    } else if (args[i]->isTensor()) {
+        std::cout << "DEBUG - Module - forward - A - type: tensor, shape: [";
+        for (int j=0; j<args[i]->toTensor().dim(); j++) {
+            std::cout << args[i]->toTensor().size(j) << ", ";
+        }
+        std::cout << "]" << std::endl;
+    } else if (args[i]->isString()) {
+        std::cout << "DEBUG - Module - forward - A - type: string" << std::endl;
+    } else if (args[i]->isIntList()) {
+        std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl;
+    } else if (args[i]->isBoolList()) {
+        std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl;
+    } else if (args[i]->isDoubleList()) {
+        std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl;
+    } else if (args[i]->isTensorList()) {
+        std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl;
+    } else if (args[i]->isListOptionalTensor()) {
+        std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl;
+    } else {
+        std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl;
+    }
 
-    infer_request->set_input_tensor(i, ov_input_tensor);
+    if (args[i]->isInt()) {
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.1" << std::endl;
+        //auto input_tensor = args[i]->toInt();
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.2" << std::endl;
+        //ov::Shape input_shape(
+        //    input_tensor.sizes().begin(), input_tensor.sizes().end());
+
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.3" << std::endl;
+        // Convert input tensor to OpenVINO tensor
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.4" << std::endl;
+        //int64_t val = args[i]->toInt();
+        //int64_t val = i;
+        int64_t *val = &(args[i]->payload.copyable_union.as_int);
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.5 - val: " << val << std::endl;
+        //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{}, &val);
+        //std::vector<int64_t> val = {args[i]->toInt()};
+        //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, &val);
+        ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
+        std::cout << "DEBUG - OpenvinoBackend - input - B.6 - val: " << ((int64_t*)(ov_input_tensor.data<int64_t>()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl;
+
+        infer_request->set_input_tensor(i, ov_input_tensor);
+        //std::cout << "DEBUG - OpenvinoBackend - input - B.7" << std::endl;
+    } else {
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.1" << std::endl;
+        auto input_tensor = args[i]->toTensor();
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.2" << std::endl;
+        ov::Shape input_shape(
+            input_tensor.sizes().begin(), input_tensor.sizes().end());
+
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.3" << std::endl;
+        // Convert input tensor to OpenVINO tensor
+        ov::element::Type ov_type =
+            convert_to_openvino_type(input_tensor.scalar_type());
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.4" << std::endl;
+        ov::Tensor ov_input_tensor(
+            ov_type, input_shape, input_tensor.mutable_data_ptr());
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.5" << std::endl;
+
+        infer_request->set_input_tensor(i, ov_input_tensor);
+        //std::cout << "DEBUG - OpenvinoBackend - input - C.6" << std::endl;
+    }
   }
 
   // Set outputs
+  std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl;
   for (size_t i = 0; i < num_outputs; i++) {
+    //args[num_inputs + i]->toTensor().unsafeGetTensorImpl()->set_size(1,1);
+    std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: [";
+    for (int j=0; j<args[num_inputs + i]->toTensor().dim(); j++) {
+        std::cout << args[num_inputs + i]->toTensor().size(j) << ", ";
+    }
+    std::cout << "]" << std::endl; 
     auto output_tensor = args[num_inputs + i]->toTensor();
     ov::Shape output_shape(
         output_tensor.sizes().begin(), output_tensor.sizes().end());
@@ -133,7 +205,14 @@ exr::Error OpenvinoBackend::execute(
 
   // Execute the inference
   infer_request->infer();
-
+  //auto out_t = infer_request->get_output_tensor(0);
+  //std::cout << "DEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << std::endl;
+  //for (int j=0; j<args[num_inputs + i]->toTensor().dim(); j++) {
+  //    std::cout << args[num_inputs + i]->toTensor().size(j) << ", ";
+  //}
+  //std::cout << "]" << std::endl;
+
+  //std::cout << "DEBUG - OpenvinoBackend - DD" << std::endl;
   return exr::Error::Ok;
 }
 
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
index 4ea735e5717..b832cbbccf6 100644
--- a/examples/models/llama/CMakeLists.txt
+++ b/examples/models/llama/CMakeLists.txt
@@ -173,6 +173,14 @@ if(TARGET qnn_executorch_backend)
   target_link_options_shared_lib(qnn_executorch_backend)
 endif()
 
+# Openvino backend
+if(TARGET openvino_backend)
+  find_package(OpenVINO REQUIRED)
+  target_link_libraries(openvino_backend INTERFACE openvino::runtime executorch_core)
+  list(APPEND link_libraries openvino_backend)
+  target_link_options_shared_lib(openvino_backend)
+endif()
+
 # MPS backend
 if(TARGET mpsdelegate)
   list(
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 0e48a8520d7..05d03ea5621 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -38,6 +38,7 @@
 from executorch.extension.llm.export.partitioner_lib import (
     get_coreml_partitioner,
     get_mps_partitioner,
+    get_openvino_partitioner,
     get_qnn_partitioner,
     get_vulkan_partitioner,
     get_xnnpack_partitioner,
@@ -414,6 +415,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Delegate llama2 to qnn backend (Qualcomm), please use it --kv_cahce=True",
     )
+    parser.add_argument("--openvino", action="store_true")
 
     parser.add_argument(
         "--expand_rope_table",
@@ -814,6 +816,10 @@ def _to_edge_and_lower_llama(  # noqa: C901
         partitioners.append(get_mps_partitioner(args.use_kv_cache))
         modelname = f"mps_{modelname}"
 
+    if args.openvino:
+        partitioners.append(get_openvino_partitioner(args.use_kv_cache))
+        modelname = f"openvino_{modelname}"
+
     if args.coreml:
         coreml_partitioner = get_coreml_partitioner(
             args.coreml_ios,
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
index 20604bbf635..ade3bec094f 100644
--- a/extension/llm/export/partitioner_lib.py
+++ b/extension/llm/export/partitioner_lib.py
@@ -64,6 +64,26 @@ def get_mps_partitioner(use_kv_cache: bool = False):
     return MPSPartitioner(compile_specs)  # pyre-fixme[16]
 
 
+def get_openvino_partitioner(use_kv_cache: bool = False):
+    """Return an OpenvinoPartitioner whose compile spec selects the "CPU" device."""
+    from executorch.exir.backend.backend_details import CompileSpec
+
+    # Only the KV-cache export path is accepted for now (enforced by the assert).
+    assert (
+        use_kv_cache is True
+    ), "OpenVINO partitioner currently requires use_kv_cache=True"
+    try:
+        # pyre-ignore Undefined import [21]: optional OpenVINO backend module.
+        from executorch.backends.openvino.partitioner import OpenvinoPartitioner
+    except ImportError as err:
+        raise ImportError(
+            "Please install the OpenVINO backend following backends/openvino/README.md"
+        ) from err
+
+    compile_specs = [CompileSpec("device", "CPU".encode())]
+    return OpenvinoPartitioner(compile_specs)  # pyre-fixme[16]
+
+
 def get_coreml_partitioner(
     ios: int = 15,
     embedding_quantize: Optional[str] = None,
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index a8e756fbb77..d0c21365006 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -74,6 +74,7 @@ set(lib_list
     mpsdelegate
     neuron_backend
     qnn_executorch_backend
+    openvino_backend
     portable_ops_lib
     custom_ops
     extension_module

From 36a7900772c1ff2ee1aed2cef295589b13cdb9d5 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa <mustafa.cavus@intel.com>
Date: Thu, 8 May 2025 11:31:01 -0700
Subject: [PATCH 02/13] partitioning debug print fix

---
 backends/openvino/partitioner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 8f2c5dcb846..0cb7e47bd3c 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -161,7 +161,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
             allows_single_node_partition=True,
         )
         partition_list = partitioner.propose_partitions()
-        print("\tDEBUG - part - size: ", partition.size())
+        print("DEBUG - num_parts: ", len(partition_list))
 
         partition_tags = {}
         for partition in partition_list:

From bb507924ffa63af680534549c1737805116b05f5 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 16:10:04 +0400
Subject: [PATCH 03/13] init

---
 examples/models/llama/export_llama_lib.py |  8 ++++++
 extension/llm/export/builder.py           | 35 ++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 05d03ea5621..ae4ab97258e 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -521,6 +521,13 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="If true, stops right after torch.export() and saves the exported model.",
     )
+
+    parser.add_argument(
+        "--nncf_compression",
+        default=False,
+        action="store_true",
+        help="If true, applies NNCF weight compression (nncf.compress_weights) to the exported model.",
+    )
     return parser
 
 
@@ -1138,6 +1145,7 @@ def _load_llama_model(
         use_legacy_export=args.qnn,
         save_exported_program=args.export_only,
         verbose=verbose,
+        nncf_compression=args.nncf_compression,
         metadata=_load_llama_model_metadata(
             weight_type,
             use_kv_cache,
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 323311caeea..be0fc6824b3 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -15,7 +15,7 @@
 from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 from unittest.mock import patch
-
+import nncf
 import torch
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
     DuplicateDynamicQuantChainPass,
@@ -41,6 +41,7 @@
 from torch.export import export_for_training, ExportedProgram
 from torch.nn.attention import SDPBackend
 from torchao.utils import unwrap_tensor_subclass
+from functools import partial
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -99,6 +100,7 @@ def __init__(
         dynamic_shapes: Optional[Any] = None,
         use_legacy_export: bool = False,
         save_exported_program: bool = False,
+        nncf_compression: bool = False
     ):
         # Store necessary constructor arguments.
         self.model = model
@@ -120,6 +122,7 @@ def __init__(
         self.dynamic_shapes = dynamic_shapes
         self.use_legacy_export = use_legacy_export
         self.save_exported_program = save_exported_program
+        self.nncf_compression = nncf_compression
 
         # Note: treat this as the source of truth for the result of
         # torch.export'ing a model. If the overall ExportedProgram is needed,
@@ -409,6 +412,36 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
                 DuplicateDynamicQuantChainPass()(m)
                 self.pre_autograd_graph_module = m
             return self
+        if(self.nncf_compression):
+            tokenizer = get_tokenizer(self.tokenizer_path)
+            def transform_fn(
+                module: torch.fx.GraphModule, tokenizer, prompts: str
+            ):
+                # TODO: change criteria & support batch inputs if necessary
+                pos = torch.tensor(0, dtype=torch.int64)
+                token_list = tokenizer.encode(prompts, bos=True, eos=False)
+
+                with torch.no_grad():
+                    while token_list[-1] != tokenizer.eos_id:
+                        logits = module(
+                            torch.full((1, 1), token_list[pos]),
+                            {"input_pos": torch.tensor((pos,))},
+                        )
+                        pos += 1
+                        if pos >= len(token_list):
+                            if self.generate_full_logits:
+                                token_list.append(
+                                    torch.argmax(logits[:, -1], dim=-1).item()
+                                )
+                            else:
+                                token_list.append(torch.argmax(logits[:], dim=-1).item())
+            self.pre_autograd_graph_module = nncf.compress_weights(
+                                                                self.pre_autograd_graph_module,
+                                                                # dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)),
+                                                                mode=nncf.CompressWeightsMode.INT4_SYM,
+                                                                # ratio=0.8,
+                                                                # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+                                                            )
         else:
             logging.info("No quantizer provided, passing...")
             return self

From 6925c5e8d1860953006f319e9f30be93bc24f767 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 16:28:22 +0400
Subject: [PATCH 04/13] small fix

---
 extension/llm/export/builder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index be0fc6824b3..076db8ef6e5 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -442,6 +442,7 @@ def transform_fn(
                                                                 # ratio=0.8,
                                                                 # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
                                                             )
+            return self
         else:
             logging.info("No quantizer provided, passing...")
             return self

From 5e23cb9f37f156bc7067360edc72b6d81342bab4 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 16:45:54 +0400
Subject: [PATCH 05/13] minor fix

---
 extension/llm/export/builder.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 076db8ef6e5..dc31efba4f7 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -412,7 +412,7 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
                 DuplicateDynamicQuantChainPass()(m)
                 self.pre_autograd_graph_module = m
             return self
-        if(self.nncf_compression):
+        elif(self.nncf_compression):
             tokenizer = get_tokenizer(self.tokenizer_path)
             def transform_fn(
                 module: torch.fx.GraphModule, tokenizer, prompts: str
@@ -437,10 +437,10 @@ def transform_fn(
                                 token_list.append(torch.argmax(logits[:], dim=-1).item())
             self.pre_autograd_graph_module = nncf.compress_weights(
                                                                 self.pre_autograd_graph_module,
-                                                                # dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)),
+                                                                dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)),
                                                                 mode=nncf.CompressWeightsMode.INT4_SYM,
-                                                                # ratio=0.8,
-                                                                # sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
+                                                                ratio=0.8,
+                                                                sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
                                                             )
             return self
         else:

From e04a901c77c7bfcbd06653c27cd3d5d9c4f39b27 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 19:56:42 +0400
Subject: [PATCH 06/13] add data aware wc

---
 extension/llm/export/builder.py | 36 ++++++++++++++-------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index dc31efba4f7..0e26145dff8 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -412,32 +412,26 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
                 DuplicateDynamicQuantChainPass()(m)
                 self.pre_autograd_graph_module = m
             return self
-        elif(self.nncf_compression):
+        elif (self.nncf_compression):
             tokenizer = get_tokenizer(self.tokenizer_path)
+
             def transform_fn(
-                module: torch.fx.GraphModule, tokenizer, prompts: str
+                prompts: str, tokenizer
             ):
-                # TODO: change criteria & support batch inputs if necessary
-                pos = torch.tensor(0, dtype=torch.int64)
-                token_list = tokenizer.encode(prompts, bos=True, eos=False)
-
-                with torch.no_grad():
-                    while token_list[-1] != tokenizer.eos_id:
-                        logits = module(
-                            torch.full((1, 1), token_list[pos]),
-                            {"input_pos": torch.tensor((pos,))},
-                        )
-                        pos += 1
-                        if pos >= len(token_list):
-                            if self.generate_full_logits:
-                                token_list.append(
-                                    torch.argmax(logits[:, -1], dim=-1).item()
-                                )
-                            else:
-                                token_list.append(torch.argmax(logits[:], dim=-1).item())
+                tokenized_text = tokenizer.encode(prompts, bos=False, eos=False)
+                logging.error(tokenized_text)
+
+                inputs = ()
+                inputs = (
+                    torch.tensor(tokenized_text).unsqueeze(0),
+                    {"input_pos": torch.tensor([0])},
+                )
+
+                return inputs
+                                
             self.pre_autograd_graph_module = nncf.compress_weights(
                                                                 self.pre_autograd_graph_module,
-                                                                dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, self.pre_autograd_graph_module, tokenizer)),
+                                                                dataset=nncf.Dataset([self.calibration_data], transform_func=partial(transform_fn, tokenizer=tokenizer)),
                                                                 mode=nncf.CompressWeightsMode.INT4_SYM,
                                                                 ratio=0.8,
                                                                 sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,

From fb5750ef03ad76dbb6a703c1f1ce094fb0b2f0d6 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 19:59:15 +0400
Subject: [PATCH 07/13] minor fix

---
 extension/llm/export/builder.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 0e26145dff8..11cc0d0e749 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -428,10 +428,11 @@ def transform_fn(
                 )
 
                 return inputs
-                                
+
+            self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
             self.pre_autograd_graph_module = nncf.compress_weights(
                                                                 self.pre_autograd_graph_module,
-                                                                dataset=nncf.Dataset([self.calibration_data], transform_func=partial(transform_fn, tokenizer=tokenizer)),
+                                                                dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),
                                                                 mode=nncf.CompressWeightsMode.INT4_SYM,
                                                                 ratio=0.8,
                                                                 sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,

From ea9eeb888f4bde33a46eb8181237e8ddf7ba948b Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 22:43:15 +0400
Subject: [PATCH 08/13] add quantization support for disable_dynamic_shapes

---
 extension/llm/export/builder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 11cc0d0e749..be2b50a2339 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -430,6 +430,7 @@ def transform_fn(
                 return inputs
 
             self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
+            self.calibration_data = ([word for prompt in self.calibration_data for word in prompt.split()] if self.dynamic_shapes else self.calibration_data)
             self.pre_autograd_graph_module = nncf.compress_weights(
                                                                 self.pre_autograd_graph_module,
                                                                 dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),

From 770569dcfa31f733f15cd26189bf56a135d5147c Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Mon, 19 May 2025 22:51:55 +0400
Subject: [PATCH 09/13] minor fix

---
 extension/llm/export/builder.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index be2b50a2339..8da1eab844b 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -430,7 +430,8 @@ def transform_fn(
                 return inputs
 
             self.calibration_data = [self.calibration_data] if isinstance(self.calibration_data, str) else self.calibration_data
-            self.calibration_data = ([word for prompt in self.calibration_data for word in prompt.split()] if self.dynamic_shapes else self.calibration_data)
+            self.calibration_data = [word for prompt in self.calibration_data for word in prompt.split()] if not self.dynamic_shapes else self.calibration_data
+            logging.error(self.calibration_data)
             self.pre_autograd_graph_module = nncf.compress_weights(
                                                                 self.pre_autograd_graph_module,
                                                                 dataset=nncf.Dataset(self.calibration_data, transform_func=partial(transform_fn, tokenizer=tokenizer)),

From 9916cee4d826ca82737b66fe4904f20f949848ff Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Tue, 20 May 2025 11:49:26 +0400
Subject: [PATCH 10/13] partitioner update

---
 backends/openvino/partitioner.py | 50 +++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index 0cb7e47bd3c..b7032634780 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -26,6 +26,12 @@
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 
+class PatternNode:
+    op_types = {}
+
+    def __init__(self):
+        self.op_types = {}
+
 class OpenvinoOperatorsSupport(OperatorSupportBase):
 
     def __init__(
@@ -121,6 +127,47 @@ def ops_to_not_decompose(
             torch.ops.aten.upsample_nearest2d.vec,
         ]
         return (ops_not_decompose, None)
+    
+    def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+        if node.op == "call_function":
+            if ("call_function" + ":" + str(node.target)) in pattern.op_types:
+                pt_input_nodes = node.all_input_nodes
+                pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target)]
+                if pattern_input_ops is None:
+                    enabled_ops.append(node)
+                    return True
+                if len(pt_input_nodes) != len(pattern_input_ops):
+                    return False
+                for i in range(len(pt_input_nodes)):
+                    if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+                        return False
+                enabled_ops.append(node)
+                return True
+        elif node.op == "get_attr":
+            if "get_attr" in pattern.op_types:
+                return True
+            else:
+                return False
+        return False
+
+    def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
+        const_node = PatternNode
+        const_node.op_types["get_attr"] = None
+        bitwise_right_shift_node = PatternNode
+        bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
+        bitwise_and_node = PatternNode
+        bitwise_and_node.op_types["call_function:aten.bitwise_and.Scalar"] = [const_node]
+        stack_node = PatternNode
+        stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
+
+        for node in graph_module.graph.nodes:
+            if str(node.op) == "call_function" and str(node.target) == "aten.stack.default":
+                enabled_ops = []
+                pattern_match = self.check_pattern(node, bitwise_and_node, enabled_ops)
+                if pattern_match:
+                    for pattern_op in enabled_ops:
+                        print(pattern_op.name)
+                        self._op_names_to_skip.add(pattern_op.name)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         """
@@ -154,7 +201,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
                 #for input_node in node.all_input_nodes:
                 #    print("\tDEBUG - OpenvinoPartitioner - input_node - op: ", input_node.op, ", target: ", input_node.target, ", name: ", input_node.name)
                 self._op_names_to_skip.add(node.name)
-
+                
+        self.capture_nncf_patterns(exported_program.graph_module)
         partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
             OpenvinoOperatorsSupport(self._op_types_to_skip, self._op_names_to_skip),

From 0c20955e437bf3bcf3f3ae19aac1417262b4f506 Mon Sep 17 00:00:00 2001
From: anzr299 <aamir.nazir@intel.com>
Date: Tue, 20 May 2025 13:33:24 +0400
Subject: [PATCH 11/13] update for latest

---
 examples/models/llama/export_llama_lib.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index f9935fd324f..9cd906ad2f3 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -860,6 +860,7 @@ def _to_edge_and_lower_llama(  # noqa: C901
     mps: bool = False,
     coreml: bool = False,
     qnn: bool = False,
+    openvino: bool = False,
     dtype_override: str = "fp32",
     enable_dynamic_shape: bool = True,
     use_kv_cache: bool = False,
@@ -1076,6 +1077,7 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
             mps=args.mps,
             coreml=args.coreml,
             qnn=args.qnn,
+            openvino=args.openvino,
             dtype_override=args.dtype_override,
             enable_dynamic_shape=args.enable_dynamic_shape,
             use_kv_cache=args.use_kv_cache,

From d3730eaf17c57458ca358076fa95ead44b5e8d79 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa <mustafa.cavus@intel.com>
Date: Wed, 4 Jun 2025 19:32:48 -0700
Subject: [PATCH 12/13] quant and fp16 temp fix

---
 backends/openvino/partitioner.py              |  32 +++-
 backends/openvino/preprocess.py               |   4 +-
 backends/openvino/requirements.txt            |   2 +-
 backends/openvino/runtime/OpenvinoBackend.cpp | 145 ++++++++++++-----
 backends/openvino/utils.py                    | 150 ++++++++++++++++++
 extension/llm/export/builder.py               |   6 +
 6 files changed, 290 insertions(+), 49 deletions(-)
 create mode 100644 backends/openvino/utils.py

diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py
index b7032634780..b64ebb0a7b2 100644
--- a/backends/openvino/partitioner.py
+++ b/backends/openvino/partitioner.py
@@ -73,7 +73,7 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
             print(
                 f"[OpenVINO Backend] The {op_type} operator with name '{node.name}' is skipped."
             )
-            return False
+            return True
 
         supported_ops = OperatorSupport(options)._support_dict
         if op_type == "getitem":
@@ -129,30 +129,48 @@ def ops_to_not_decompose(
         return (ops_not_decompose, None)
     
     def check_pattern(self, node: torch.fx.Node, pattern: PatternNode, enabled_ops: list) -> bool:
+        print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.0 - op: ", node.op)
         if node.op == "call_function":
-            if ("call_function" + ":" + str(node.target)) in pattern.op_types:
+            print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.1")
+            if ("call_function" + ":" + str(node.target.__name__)) in pattern.op_types:
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - B - target: ", node.target.__name__)
                 pt_input_nodes = node.all_input_nodes
-                pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target)]
+                pattern_input_ops = pattern.op_types["call_function" + ":" + str(node.target.__name__)]
                 if pattern_input_ops is None:
                     enabled_ops.append(node)
+                    print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.1")
                     return True
                 if len(pt_input_nodes) != len(pattern_input_ops):
+                    print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.2")
                     return False
                 for i in range(len(pt_input_nodes)):
                     if not self.check_pattern(pt_input_nodes[i], pattern_input_ops[i], enabled_ops):
+                        print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.3")
                         return False
                 enabled_ops.append(node)
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - C.4")
                 return True
         elif node.op == "get_attr":
             if "get_attr" in pattern.op_types:
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2")
+                return True
+            else:
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3")
+                return False
+        elif node.op == "placeholder":
+            if "placeholder" in pattern.op_types:
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.2")
                 return True
             else:
+                print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.3")
                 return False
+        print("\t\tDEBUG - capture_nncf_patterns - check_pattern - A.4")
         return False
 
     def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
         const_node = PatternNode
         const_node.op_types["get_attr"] = None
+        const_node.op_types["placeholder"] = None
         bitwise_right_shift_node = PatternNode
         bitwise_right_shift_node.op_types["call_function:aten.bitwise_right_shift.Tensor_Scalar"] = [const_node]
         bitwise_and_node = PatternNode
@@ -160,11 +178,15 @@ def capture_nncf_patterns(self, graph_module: torch.fx.GraphModule):
         stack_node = PatternNode
         stack_node.op_types["call_function:aten.stack.default"] = [bitwise_and_node, bitwise_right_shift_node]
 
+        print("DEBUG - capture_nncf_patterns - A")
         for node in graph_module.graph.nodes:
-            if str(node.op) == "call_function" and str(node.target) == "aten.stack.default":
+            print("\tDEBUG - capture_nncf_patterns - B - op: ", node.op, ", target: ", node.target)
+            if str(node.op) == "call_function" and str(node.target.__name__) == "aten.stack.default":
+                print("\tDEBUG - capture_nncf_patterns - C - stack found")
                 enabled_ops = []
-                pattern_match = self.check_pattern(node, bitwise_and_node, enabled_ops)
+                pattern_match = self.check_pattern(node, stack_node, enabled_ops)
                 if pattern_match:
+                    print("\tDEBUG - capture_nncf_patterns - D - match")
                     for pattern_op in enabled_ops:
                         print(pattern_op.name)
                         self._op_names_to_skip.add(pattern_op.name)
diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py
index 2775e3eed89..665921f50e7 100644
--- a/backends/openvino/preprocess.py
+++ b/backends/openvino/preprocess.py
@@ -51,5 +51,5 @@ def preprocess(
         )
         model_bytes = compiled.export_model()
 
-        #return PreprocessResult(processed_bytes=model_bytes.getvalue())
-        return PreprocessResult(processed_bytes=model_bytes)
+        return PreprocessResult(processed_bytes=model_bytes.getvalue())
+        #return PreprocessResult(processed_bytes=model_bytes)
diff --git a/backends/openvino/requirements.txt b/backends/openvino/requirements.txt
index 316633e9004..ccb2aa91430 100644
--- a/backends/openvino/requirements.txt
+++ b/backends/openvino/requirements.txt
@@ -1,2 +1,2 @@
 transformers
-git+https://github.com/openvinotoolkit/nncf@6b0fc1c#egg=nncf
+git+https://github.com/openvinotoolkit/nncf@develop#egg=nncf
diff --git a/backends/openvino/runtime/OpenvinoBackend.cpp b/backends/openvino/runtime/OpenvinoBackend.cpp
index cd50d69f5af..20e308e6ef7 100644
--- a/backends/openvino/runtime/OpenvinoBackend.cpp
+++ b/backends/openvino/runtime/OpenvinoBackend.cpp
@@ -23,6 +23,36 @@ namespace executorch {
 namespace backends {
 namespace openvino {
 
+// Maps an exa::ScalarType to its enumerator name; throws std::invalid_argument for unlisted values.
+std::string scalarTypeToString(exa::ScalarType type) {
+    switch (type) {
+        case exa::ScalarType::Byte:   return "Byte";
+        case exa::ScalarType::Char:   return "Char";
+        case exa::ScalarType::Short:  return "Short";
+        case exa::ScalarType::Int:    return "Int";
+        case exa::ScalarType::Long:   return "Long";
+        case exa::ScalarType::Half:   return "Half";
+        case exa::ScalarType::Float:  return "Float";
+        case exa::ScalarType::Double: return "Double";
+        case exa::ScalarType::Bool:   return "Bool";
+        case exa::ScalarType::BFloat16: return "BFloat16";
+        case exa::ScalarType::ComplexHalf: return "ComplexHalf";
+        case exa::ScalarType::ComplexFloat: return "ComplexFloat";
+        case exa::ScalarType::ComplexDouble: return "ComplexDouble";
+        case exa::ScalarType::QUInt8: return "QUInt8";
+        case exa::ScalarType::QInt8:  return "QInt8";
+        case exa::ScalarType::QInt32: return "QInt32";
+        case exa::ScalarType::QUInt4x2: return "QUInt4x2";
+        case exa::ScalarType::QUInt2x4: return "QUInt2x4";
+        case exa::ScalarType::Undefined: return "Undefined";
+        case exa::ScalarType::NumOptions: return "NumOptions";
+        default:  // reached for any value that has no case above
+            throw std::invalid_argument("Unknown ScalarType");
+    }
+}
+
+
+
 OpenvinoBackend::OpenvinoBackend() {}
 
 bool OpenvinoBackend::is_available() const {
@@ -71,7 +101,9 @@ exr::Result<exr::DelegateHandle*> OpenvinoBackend::init(
   }
 
   // Import the model
+  //std::cout << "DEBUG - before import" << std::endl;
   auto compiled_model = core.import_model(compiled_stream, device);
+  //std::cout << "DEBUG - after import" << std::endl;
 
   // The processed data can be freed since the model is compiled
   processed->Free();
@@ -102,41 +134,41 @@ exr::Error OpenvinoBackend::execute(
   size_t num_outputs = infer_request->get_compiled_model().outputs().size();
 
   // Set inputs
-  std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl;
+  //std::cout << "DEBUG - OpenvinoBackend - num_inputs: " << num_inputs << std::endl;
   for (size_t i = 0; i < num_inputs; i++) {
-    std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl;
-
-    if (args[i]->isNone()) {
-        std::cout << "DEBUG - Module - forward - A - type: none" << std::endl;
-    } else if (args[i]->isInt()) {
-        std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl;
-    } else if (args[i]->isDouble()) {
-        std::cout << "DEBUG - Module - forward - A - type: double" << std::endl;
-    } else if (args[i]->isBool()) {
-        std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl;
-    } else if (args[i]->isScalar()) {
-        std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl;
-    } else if (args[i]->isTensor()) {
-        std::cout << "DEBUG - Module - forward - A - type: tensor, shape: [";
-        for (int j=0; j<args[i]->toTensor().dim(); j++) {
-            std::cout << args[i]->toTensor().size(j) << ", ";
-        }
-        std::cout << "]" << std::endl;
-    } else if (args[i]->isString()) {
-        std::cout << "DEBUG - Module - forward - A - type: string" << std::endl;
-    } else if (args[i]->isIntList()) {
-        std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl;
-    } else if (args[i]->isBoolList()) {
-        std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl;
-    } else if (args[i]->isDoubleList()) {
-        std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl;
-    } else if (args[i]->isTensorList()) {
-        std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl;
-    } else if (args[i]->isListOptionalTensor()) {
-        std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl;
-    } else {
-        std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl;
-    }
+    //std::cout << "DEBUG - OpenvinoBackend - input - A - i: " << i << std::endl;
+
+    //if (args[i]->isNone()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: none" << std::endl;
+    //} else if (args[i]->isInt()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: int, val: " << args[i]->toInt() << std::endl;
+    //} else if (args[i]->isDouble()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: double" << std::endl;
+    //} else if (args[i]->isBool()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: bool" << std::endl;
+    //} else if (args[i]->isScalar()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: scalar" << std::endl;
+    //} else if (args[i]->isTensor()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: tensor, shape: [";
+    //    for (int j=0; j<args[i]->toTensor().dim(); j++) {
+    //        std::cout << args[i]->toTensor().size(j) << ", ";
+    //    }
+    //    std::cout << "]" << std::endl;
+    //} else if (args[i]->isString()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: string" << std::endl;
+    //} else if (args[i]->isIntList()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: int_list" << std::endl;
+    //} else if (args[i]->isBoolList()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: bool_list" << std::endl;
+    //} else if (args[i]->isDoubleList()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: double_list" << std::endl;
+    //} else if (args[i]->isTensorList()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: tensor_list" << std::endl;
+    //} else if (args[i]->isListOptionalTensor()) {
+    //    std::cout << "DEBUG - Module - forward - A - type: list_optional_tensor" << std::endl;
+    //} else {
+    //    std::cout << "DEBUG - Module - forward - A - type: no type available" << std::endl;
+    //}
 
     if (args[i]->isInt()) {
         //std::cout << "DEBUG - OpenvinoBackend - input - B.1" << std::endl;
@@ -156,7 +188,7 @@ exr::Error OpenvinoBackend::execute(
         //std::vector<int64_t> val = {args[i]->toInt()};
         //ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, &val);
         ov::Tensor ov_input_tensor(ov::element::i64, ov::Shape{1}, val);
-        std::cout << "DEBUG - OpenvinoBackend - input - B.6 - val: " << ((int64_t*)(ov_input_tensor.data<int64_t>()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl;
+        //std::cout << "\tDEBUG - OpenvinoBackend - input - int - val: " << ((int64_t*)(ov_input_tensor.data<int64_t>()))[0] << ", byte_size: " << ov_input_tensor.get_byte_size() << std::endl;
 
         infer_request->set_input_tensor(i, ov_input_tensor);
         //std::cout << "DEBUG - OpenvinoBackend - input - B.7" << std::endl;
@@ -178,18 +210,35 @@ exr::Error OpenvinoBackend::execute(
 
         infer_request->set_input_tensor(i, ov_input_tensor);
         //std::cout << "DEBUG - OpenvinoBackend - input - C.6" << std::endl;
+
+        //if (ov_type == ov::element::i64) {
+        //    int64_t sum = 0;
+        //    auto data_ptr = ov_input_tensor.data<int64_t>();
+        //    for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(int64_t); j++) {
+        //        sum += data_ptr[j];
+        //    }
+        //    //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl;
+        //} else {
+        //    float sum = 0;
+        //    auto data_ptr = ov_input_tensor.data<float>();
+        //    for (size_t j=0; j < ov_input_tensor.get_byte_size()/sizeof(float); j++) {
+        //        sum += data_ptr[j];
+        //    }
+        //    //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << ", sum_of_values: " << sum << std::endl;
+        //}
+        //std::cout << "\tDEBUG - OpenvinoBackend - input - tensor - shape: " << ov_input_tensor.get_shape() << ", type: " << ov_input_tensor.get_element_type() << std::endl;
     }
   }
 
   // Set outputs
-  std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl;
+  //std::cout << "DEBUG - OpenvinoBackend - num_outputs: " << num_outputs << std::endl;
   for (size_t i = 0; i < num_outputs; i++) {
     //args[num_inputs + i]->toTensor().unsafeGetTensorImpl()->set_size(1,1);
-    std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: [";
-    for (int j=0; j<args[num_inputs + i]->toTensor().dim(); j++) {
-        std::cout << args[num_inputs + i]->toTensor().size(j) << ", ";
-    }
-    std::cout << "]" << std::endl; 
+    //std::cout << "DEBUG - OpenvinoBackend output - i: " << i << " - type: tensor, shape: [";
+    //for (int j=0; j<args[num_inputs + i]->toTensor().dim(); j++) {
+    //    std::cout << args[num_inputs + i]->toTensor().size(j) << ", ";
+    //}
+    //std::cout << "]" << std::endl; 
     auto output_tensor = args[num_inputs + i]->toTensor();
     ov::Shape output_shape(
         output_tensor.sizes().begin(), output_tensor.sizes().end());
@@ -205,6 +254,15 @@ exr::Error OpenvinoBackend::execute(
 
   // Execute the inference
   infer_request->infer();
+  //for (size_t i = 0; i < num_outputs; i++) {
+  //    auto out_t = infer_request->get_output_tensor(i);
+  //    float sum = 0;
+  //    auto data_ptr = out_t.data<float>();
+  //    for (size_t j=0; j < out_t.get_byte_size()/sizeof(float); j++) {
+  //        sum += data_ptr[j];
+  //    }
+  //    //std::cout << "\tDEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << ", type: " << out_t.get_element_type() << ", sum_of_values: " << sum << std::endl;
+  //}
   //auto out_t = infer_request->get_output_tensor(0);
   //std::cout << "DEBUG - OpenvinoBackend output - after infer tensor - shape: " << out_t.get_shape() << std::endl;
   //for (int j=0; j<args[num_inputs + i]->toTensor().dim(); j++) {
@@ -241,13 +299,18 @@ void OpenvinoBackend::destroy(exr::DelegateHandle* handle) const {
 
 ov::element::Type OpenvinoBackend::convert_to_openvino_type(
     exa::ScalarType scalar_type) const {
+  //std::cout << "DEBUG - scalar_type: " << scalarTypeToString(scalar_type) << std::endl;
   switch (scalar_type) {
     case exa::ScalarType::Float:
       return ov::element::f32;
+    case exa::ScalarType::Half:
+      return ov::element::f16;
     case exa::ScalarType::Int:
       return ov::element::i32;
     case exa::ScalarType::Char:
       return ov::element::i8;
+    case exa::ScalarType::Byte:
+      return ov::element::u8;
     case exa::ScalarType::Long:
       return ov::element::i64;
     case exa::ScalarType::Bool:
diff --git a/backends/openvino/utils.py b/backends/openvino/utils.py
new file mode 100644
index 00000000000..debefdd1a35
--- /dev/null
+++ b/backends/openvino/utils.py
@@ -0,0 +1,150 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import executorch.exir as exir
+
+import torch
+from executorch.exir import EdgeProgramManager, ExecutorchProgramManager
+from executorch.exir.program._program import to_edge_with_preserved_ops
+from executorch.exir.tracer import Value
+from torch.export import export, export_for_training, ExportedProgram
+
+
+_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(  # default edge_compile_config of export_to_edge / export_to_exec_prog
+    _check_ir_validity=True,
+    _skip_dim_order=True,  # TODO(T189114319): Reuse dim order op after solving the ios oss issue
+)
+
+
+def _to_core_aten(
+    model: Union[torch.fx.GraphModule, torch.nn.Module],
+    example_inputs: Tuple[Value, ...],
+    *,
+    example_kwarg_inputs: Optional[Dict] = None,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+    strict=True,  # forwarded unchanged to torch.export.export
+    verbose=True,  # when true, the exported graph is logged at INFO level
+) -> ExportedProgram:
+    """Post-autograd export of ``model`` via torch.export (eventually .to_core_aten)."""
+    if not isinstance(model, (torch.fx.GraphModule, torch.nn.Module)):
+        # Both types are accepted by the check, so the message names both.
+        raise ValueError(
+            "Expected passed in model to be an instance of fx.GraphModule or "
+            f"nn.Module, got {type(model)}"
+        )
+    core_aten_ep = export(
+        model,
+        example_inputs,
+        example_kwarg_inputs,
+        dynamic_shapes=dynamic_shapes,
+        strict=strict,
+    )
+    if verbose:
+        logging.info(f"Core ATen graph:\n{core_aten_ep.graph}")
+    return core_aten_ep
+
+
+def _core_aten_to_edge(  # lowers a core-ATen ExportedProgram to an EdgeProgramManager
+    core_aten_exir_ep: ExportedProgram,
+    edge_constant_methods: Optional[Dict[str, Any]] = None,
+    edge_compile_config=None,  # falsy -> a config with IR validity checking disabled is built below
+    verbose=True,  # when true, the resulting exported program is logged at INFO level
+) -> EdgeProgramManager:
+    if not edge_compile_config:
+        edge_compile_config = exir.EdgeCompileConfig(
+            _check_ir_validity=False,  # quant ops currently break ir verification
+        )
+    edge_manager: EdgeProgramManager = to_edge_with_preserved_ops(
+        core_aten_exir_ep,
+        constant_methods=edge_constant_methods,
+        compile_config=edge_compile_config,
+        preserve_ops=[torch.ops.aten.stack.default,],  # NOTE(review): presumably keeps aten.stack intact for the OpenVINO partitioner's stack pattern - confirm
+    )
+    if verbose:
+        logging.info(f"Exported graph:\n{edge_manager.exported_program()}")
+    return edge_manager
+
+
+def export_to_edge(
+    model: Union[torch.fx.GraphModule, torch.nn.Module],
+    example_inputs: Tuple[Value, ...],
+    *,
+    example_kwarg_inputs: Optional[Dict] = None,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+    edge_constant_methods: Optional[Dict[str, Any]] = None,
+    edge_compile_config=_EDGE_COMPILE_CONFIG,
+    strict=True,  # forwarded to _to_core_aten
+    verbose=True,  # forwarded to both stages; enables their INFO-level graph logging
+) -> EdgeProgramManager:
+    """Export ``model`` to core ATen, then lower the result to an EdgeProgramManager."""
+    core_aten_ep = _to_core_aten(
+        model,
+        example_inputs,
+        example_kwarg_inputs=example_kwarg_inputs,
+        dynamic_shapes=dynamic_shapes,
+        strict=strict,
+        verbose=verbose,
+    )
+    return _core_aten_to_edge(
+        core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
+    )
+
+
+def export_to_exec_prog(  # pipeline: export_for_training -> _to_core_aten -> _core_aten_to_edge -> to_executorch
+    model: Union[torch.fx.GraphModule, torch.nn.Module],
+    example_inputs: Tuple[Value, ...],
+    *,
+    example_kwarg_inputs: Optional[Dict[str, Any]] = None,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None,
+    edge_constant_methods: Optional[Dict[str, Any]] = None,
+    edge_compile_config=_EDGE_COMPILE_CONFIG,
+    backend_config=None,  # passed straight to EdgeProgramManager.to_executorch
+    strict=True,  # only reaches _to_core_aten; see the review note on export_for_training
+) -> ExecutorchProgramManager:
+    m = model.eval()
+    # pre-autograd export. eventually this will become torch.export
+    m = export_for_training(m, example_inputs, strict=True).module()  # NOTE(review): strict is hard-coded and kwargs/dynamic_shapes are not passed here - confirm intended
+
+    core_aten_ep = _to_core_aten(  # verbose is not passed, so the helper's default applies
+        m,
+        example_inputs,
+        example_kwarg_inputs=example_kwarg_inputs,
+        dynamic_shapes=dynamic_shapes,
+        strict=strict,
+    )
+
+    edge_m = _core_aten_to_edge(
+        core_aten_ep, edge_constant_methods, edge_compile_config
+    )
+
+    exec_prog = edge_m.to_executorch(backend_config)
+    return exec_prog
+
+
+def save_pte_program(  # writes `prog` to a .pte file and returns the path that was used
+    prog: ExecutorchProgramManager, model_name: str, output_dir: str = ""
+) -> str:
+    if model_name.endswith(".pte"):
+        filename = model_name  # used as-is; output_dir is not prepended in this branch
+    else:
+        filename = os.path.join(output_dir, f"{model_name}.pte")
+
+    try:
+        # Write program to file.
+        with open(filename, "wb") as file:
+            prog.write_to_file(file)
+            logging.info(f"Saved exported program to {filename}")
+        # Write data to file/s.
+        prog.write_tensor_data_to_file(outdir=output_dir)  # always targets output_dir, even when filename ignored it
+    except Exception as e:
+        logging.error(f"Error while saving to {filename}: {e}")  # failure is logged, not raised; filename is still returned
+
+    return filename
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 8da1eab844b..0780ff2cd8b 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -413,6 +413,7 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
                 self.pre_autograd_graph_module = m
             return self
         elif (self.nncf_compression):
+            print("DEBUG - executorch - builder - quantize - A")
             tokenizer = get_tokenizer(self.tokenizer_path)
 
             def transform_fn(
@@ -439,6 +440,7 @@ def transform_fn(
                                                                 ratio=0.8,
                                                                 sensitivity_metric=nncf.SensitivityMetric.HESSIAN_INPUT_ACTIVATION,
                                                             )
+            print("DEBUG - executorch - builder - quantize - B")
             return self
         else:
             logging.info("No quantizer provided, passing...")
@@ -448,6 +450,7 @@ def export_to_edge(self) -> "LLMEdgeManager":
         """
         Export the model to Edge dialect and retrieve a LLMEdgeManager.
         """
+        print("DEBUG - executorch - builder - export_to_edge - A")
         dynamic_shape = self._get_dynamic_shape()
         edge_config = self._get_edge_config()
 
@@ -467,6 +470,8 @@ def export_to_edge(self) -> "LLMEdgeManager":
                 )
 
             with override_export_behaviour:
+                #if (self.nncf_compression):
+                #    from executorch.backends.openvino.utils import export_to_edge
                 self.edge_manager = export_to_edge(
                     self.pre_autograd_graph_module,  # pyre-fixme[6]
                     self.example_inputs,
@@ -476,6 +481,7 @@ def export_to_edge(self) -> "LLMEdgeManager":
                     edge_compile_config=edge_config,
                     verbose=self.verbose,
                 )
+        print("DEBUG - executorch - builder - export_to_edge - B")
         return self
 
     def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":

From 3fef8fd57a0d980c94e35e417e16a119939f4268 Mon Sep 17 00:00:00 2001
From: Cavus Mustafa <mustafa.cavus@intel.com>
Date: Thu, 5 Jun 2025 17:03:31 -0700
Subject: [PATCH 13/13] enable import override for export_to_edge with openvino

---
 extension/llm/export/builder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 0780ff2cd8b..6339da0b311 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -470,8 +470,8 @@ def export_to_edge(self) -> "LLMEdgeManager":
                 )
 
             with override_export_behaviour:
-                #if (self.nncf_compression):
-                #    from executorch.backends.openvino.utils import export_to_edge
+                if (self.nncf_compression):
+                    from executorch.backends.openvino.utils import export_to_edge
                 self.edge_manager = export_to_edge(
                     self.pre_autograd_graph_module,  # pyre-fixme[6]
                     self.example_inputs,