diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6f22e867e..8d7163956 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -390,6 +390,12 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. """ + # Backward-compatible alias accepted by some APIs. + if "aic_perf_warning" in compiler_options: + if "aic_perf_warnings" not in compiler_options: + compiler_options["aic_perf_warnings"] = compiler_options["aic_perf_warning"] + compiler_options.pop("aic_perf_warning", None) + onnx_path = Path( onnx_path if onnx_path diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3a47aa5ff..5f9e71a24 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -5,11 +5,14 @@ # # ---------------------------------------------------------------------------- +import json import os +import re +import subprocess import warnings from pathlib import Path from time import perf_counter -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import torch @@ -3487,6 +3490,422 @@ def generate( else: raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") + @staticmethod + def _parse_perf_metrics_from_text(text: str) -> Dict[str, float]: + """ + Parse best-effort numeric performance metrics from QAIC logs. + + Parameters + ---------- + text : str + Raw log text from compiler/runner. + + Returns + ------- + Dict[str, float] + Parsed key-value metrics. Missing fields are omitted. + """ + patterns = { + "ddr_reads": r"DDR reads\s*[:=]\s*([0-9]*\.?[0-9]+)", + "ddr_writes": r"DDR writes\s*[:=]\s*([0-9]*\.?[0-9]+)", + "total_ddr_traffic": r"Total DDR Traffic\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inference_per_second": r"Inference per second\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inferences_per_second": r"inferences per second\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inf_per_sec": r"Inf/Sec\s*([0-9]*\.?[0-9]+)", + "latency_ms": r"latency[^0-9]*([0-9]*\.?[0-9]+)\s*ms", + "total_duration_us": r"TotalDuration\s*([0-9]*\.?[0-9]+)\s*us", + "aic_hmx_mac_estimate": r"AIC HMX MAC estimate:\s*([0-9,]+)", + "static_constants_size_gb": r"StaticConstantsSize:\s*([0-9]*\.?[0-9]+)\s*GB", + "dynamic_constants_size_mb": r"DynamicConstantsSize:\s*([0-9]*\.?[0-9]+)\s*MB", + "total_ddr_read_kb": r"total:\s*DDR traffic\s*read:\s*([0-9,]*\.?[0-9]+)\s*KB", + "total_ddr_write_kb": r"total:\s*DDR traffic write:\s*([0-9,]*\.?[0-9]+)\s*KB", + "total_mc_write_kb": r"total:\s*MC\s*traffic write:\s*([0-9,]*\.?[0-9]+)\s*KB", + } + parsed: Dict[str, float] = {} + for key, pattern in patterns.items(): + matches = re.findall(pattern, text, flags=re.IGNORECASE) + if matches: + try: + value = matches[-1].replace(",", "") + parsed[key] = float(value) + except ValueError: + continue + return parsed + + @staticmethod + def _run_subprocess_capture(command: List[str], stdout_log: Path, stderr_log: Path) -> subprocess.CompletedProcess: + """ + Run subprocess command, persist stdout/stderr logs, and return completed process. 
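+
+ Raises
+ ------
+ RuntimeError
+ If the command exits with a non-zero return code. The stdout/stderr logs are still written, and their paths are included in the error message.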
+ """ + result = subprocess.run(command, capture_output=True, text=True) + stdout_log.parent.mkdir(parents=True, exist_ok=True) + stderr_log.parent.mkdir(parents=True, exist_ok=True) + stdout_log.write_text(result.stdout or "") + stderr_log.write_text(result.stderr or "") + if result.returncode != 0: + raise RuntimeError( + "\n".join( + [ + "Command failed!", + f"Command: {' '.join(command)}", + f"Exit code: {result.returncode}", + f"Stdout log: {stdout_log}", + f"Stderr log: {stderr_log}", + ] + ) + ) + return result + + @staticmethod + def _prepare_perf_output_dirs(output_dir: Optional[str], qpc_path: Path) -> Dict[str, Path]: + """ + Prepare output directories for evaluate_performance artifacts. + """ + if output_dir: + root_dir = Path(output_dir) + compile_dir = root_dir / "compile" + else: + # qpc_path: <...>/qpc-/qpc + # Keep io/performance siblings of qpc-, as requested. + compile_dir = qpc_path.parent + root_dir = compile_dir.parent + + dirs = { + "root": root_dir, + "compile": compile_dir, + "compile_logs": compile_dir / "compile_logs", + "io": root_dir / "io", + "performance_analysis": root_dir / "performance_analysis", + "logs": root_dir / "performance_analysis" / "logs", + "runner_outputs": root_dir / "performance_analysis" / "runner_outputs", + "profiling": root_dir / "performance_analysis" / "profiling", + "opstats": root_dir / "performance_analysis" / "opstats", + } + root_dir.mkdir(parents=True, exist_ok=True) + dirs["compile"].mkdir(parents=True, exist_ok=True) + dirs["compile_logs"].mkdir(parents=True, exist_ok=True) + dirs["logs"].mkdir(parents=True, exist_ok=True) + dirs["io"].mkdir(parents=True, exist_ok=True) + dirs["performance_analysis"].mkdir(parents=True, exist_ok=True) + dirs["runner_outputs"].mkdir(parents=True, exist_ok=True) + dirs["profiling"].mkdir(parents=True, exist_ok=True) + dirs["opstats"].mkdir(parents=True, exist_ok=True) + return dirs + + def _create_runner_batch_json( + self, + *, + mode: str, + batch_size: int, + seq_len: int, + io_dir: Path, + ) -> Path: + """ + Create a qaic-runner batch-input JSON with deterministic synthetic inputs. + """ + if mode not in {"prefill", "decode"}: + raise ValueError(f"Unknown mode {mode}. Expected one of: prefill, decode.") + + if mode == "prefill": + input_ids = np.ones((batch_size, seq_len), dtype=np.int64) + position_ids = np.tile(np.arange(seq_len, dtype=np.int64), (batch_size, 1)) + else: + input_ids = np.ones((batch_size, 1), dtype=np.int64) + position_ids = np.zeros((batch_size, 1), dtype=np.int64) + + inputs = {"input_ids": input_ids, "position_ids": position_ids} + + if self.continuous_batching: + inputs["batch_index"] = np.arange(batch_size, dtype=np.int64).reshape(batch_size, 1) + + if self.is_tlm: + inputs["num_logits_to_keep"] = np.zeros((1, 1), dtype=np.int64) + + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + raise NotImplementedError("evaluate_performance currently does not support include_sampler=True") + + write_io_files(inputs, {}, str(io_dir), "runner_inputs", "aic_batch_io", include_dims=True, reset=True) + batch_json_path = io_dir / "aic_batch_io.json" + + # Convert relative paths to absolute paths for robust qaic-runner execution. 
+ with open(batch_json_path, "r") as f: + batch_data = json.load(f) + + for io_group in batch_data.get("IO-files", []): + for entry in io_group: + if "path" in entry: + entry["path"] = str((io_dir / entry["path"]).resolve()) + + with open(batch_json_path, "w") as f: + json.dump(batch_data, f, indent=2) + + return batch_json_path + + def evaluate_performance( + self, + *, + batch_size: int = 1, + prefill_only: bool = False, + compile_kwargs: Optional[dict] = None, + runner_num_iters: int = 10, + profiling_type: str = "raw_device_stats", + profiling_start_iter: int = 2, + profiling_num_samples: int = 5, + write_output_start_iter: Optional[int] = None, + output_dir: Optional[str] = None, + keep_intermediate_files: bool = True, + runner_extra_args: Optional[List[str]] = None, + ) -> dict: + """ + Compile with perf flags, execute with qaic-runner, and dump available performance artifacts. + + Parameters + ---------- + batch_size : int, optional + Batch size for compilation and synthetic runner inputs. Defaults to 1. + prefill_only : bool, optional + If True, run performance analysis only for prefill inputs when `prefill_seq_len > 1`. + If `prefill_seq_len == 1`, analysis runs decode-only regardless of this flag. + compile_kwargs : dict, optional + Args passed to compile API. Perf flags are always forced in this API. + runner_num_iters : int, optional + Number of qaic-runner iterations. Defaults to 10. + profiling_type : str, optional + qaic-runner profiling type. Defaults to "raw_device_stats". + profiling_start_iter : int, optional + Iteration to start collecting profiles from. Defaults to 2. + profiling_num_samples : int, optional + Number of profile samples. Defaults to 5. + write_output_start_iter : int, optional + Iteration to start writing outputs from qaic-runner. + Must be greater than 0 and less than `profiling_start_iter`. + If not provided, defaults to `profiling_start_iter - 1`. + output_dir : str, optional + Directory where all artifacts/logs will be written. + keep_intermediate_files : bool, optional + If False, removes generated raw input files after runner execution. + runner_extra_args : List[str], optional + Additional flags forwarded to qaic-runner. + + Returns + ------- + dict + Structured metadata including commands, paths, and parsed metrics. + """ + if batch_size <= 0: + raise ValueError("`batch_size` must be a positive integer.") + if profiling_start_iter <= 1: + raise ValueError("`profiling_start_iter` must be greater than 1 to derive `write_output_start_iter`.") + if write_output_start_iter is None: + write_output_start_iter = profiling_start_iter - 1 + if write_output_start_iter <= 0 or write_output_start_iter >= profiling_start_iter: + raise ValueError( + "`write_output_start_iter` must be > 0 and < `profiling_start_iter`. " + f"Got write_output_start_iter={write_output_start_iter}, profiling_start_iter={profiling_start_iter}." 
+ ) + + compile_kwargs = dict(compile_kwargs or {}) + compile_kwargs.pop("aic_perf_warnings", None) + compile_kwargs.setdefault("batch_size", batch_size) + if prefill_only: + compile_kwargs["prefill_only"] = True + requested_prefill_seq_len = int(compile_kwargs.get("prefill_seq_len", 32)) + compile_prefill_only = compile_kwargs.get("prefill_only", None) is True + run_prefill_only = prefill_only or compile_prefill_only + compile_kwargs["aic_perf_metrics"] = True + compile_kwargs["aic_perf_warning"] = True + if profiling_type == "raw_device_stats": + compile_kwargs["stats_level"] = 70 + compile_kwargs["ddr_stats"] = True + compile_kwargs["aic_pmu_recipe"] = "KernelUtil" + + if requested_prefill_seq_len == 1: + stages_to_run = ["decode"] + elif run_prefill_only: + stages_to_run = ["prefill"] + else: + stages_to_run = ["prefill", "decode"] + + if output_dir and "compile_dir" not in compile_kwargs: + compile_kwargs["compile_dir"] = str(Path(output_dir) / "compile") + + qpc_path = Path(self.compile(**compile_kwargs)) + if not qpc_path.is_dir() or not (qpc_path / "programqpc.bin").is_file(): + raise RuntimeError(f"Compiled QPC directory is invalid: {qpc_path}") + dirs = self._prepare_perf_output_dirs(output_dir, qpc_path) + + specializations_path = qpc_path.parent / "specializations.json" + if not specializations_path.is_file(): + raise FileNotFoundError(f"Expected specializations.json at: {specializations_path}") + + with open(specializations_path, "r") as f: + specializations = json.load(f)["specializations"] + prefill_specs = [spec for spec in specializations if int(spec.get("seq_len", 0)) > 1] + decode_specs = [spec for spec in specializations if int(spec.get("seq_len", 0)) == 1] + + if "prefill" in stages_to_run and not prefill_specs: + raise RuntimeError("Could not find prefill specialization (seq_len > 1) in compiled specializations.") + if "decode" in stages_to_run and not decode_specs: + raise RuntimeError("Could not find decode specialization (seq_len == 1) in compiled specializations.") + + stage_specs = {} + if "prefill" in stages_to_run: + stage_specs["prefill"] = { + "batch_size": int(prefill_specs[0]["batch_size"]), + "seq_len": int(prefill_specs[0]["seq_len"]), + } + if "decode" in stages_to_run: + stage_specs["decode"] = { + "batch_size": int(decode_specs[0]["batch_size"]), + "seq_len": 1, + } + + stage_results = {} + for stage_name, stage_spec in stage_specs.items(): + stage_io_dir = dirs["io"] / stage_name + stage_profiling_dir = dirs["profiling"] / stage_name + stage_runner_outputs_dir = dirs["runner_outputs"] / stage_name + stage_opstats_dir = dirs["opstats"] / stage_name + stage_io_dir.mkdir(parents=True, exist_ok=True) + stage_profiling_dir.mkdir(parents=True, exist_ok=True) + stage_runner_outputs_dir.mkdir(parents=True, exist_ok=True) + stage_opstats_dir.mkdir(parents=True, exist_ok=True) + + batch_json_path = self._create_runner_batch_json( + mode=stage_name, + batch_size=stage_spec["batch_size"], + seq_len=stage_spec["seq_len"], + io_dir=stage_io_dir, + ) + + runner_cmd = [ + "/opt/qti-aic/exec/qaic-runner", + "-t", + str(qpc_path), + "-n", + str(runner_num_iters), + "--aic-profiling-type", + str(profiling_type), + "--aic-profiling-start-iter", + str(profiling_start_iter), + "--aic-profiling-num-samples", + str(profiling_num_samples), + "--aic-profiling-out-dir", + str(stage_profiling_dir), + "--write-output-dir", + str(stage_runner_outputs_dir), + "--write-output-start-iter", + str(write_output_start_iter), + "--aic-batch-json-input", + str(batch_json_path), + ] + if 
runner_extra_args: + runner_cmd.extend(runner_extra_args) + runner_command_file = dirs["logs"] / f"runner_command_{stage_name}.txt" + runner_command_file.write_text(" ".join(runner_cmd)) + + runner_stdout_log = dirs["logs"] / f"qaic_runner_{stage_name}_stdout.log" + runner_stderr_log = dirs["logs"] / f"qaic_runner_{stage_name}_stderr.log" + self._run_subprocess_capture(runner_cmd, runner_stdout_log, runner_stderr_log) + + opstats_cmd = [ + "/opt/qti-aic/exec/qaic-opstats", + "--qpc", + str(qpc_path / "programqpc.bin"), + "--input-dir", + str(stage_profiling_dir), + "--output-dir", + str(stage_opstats_dir), + "--summary", + "--trace", + ] + opstats_command_file = dirs["logs"] / f"opstats_command_{stage_name}.txt" + opstats_command_file.write_text(" ".join(opstats_cmd)) + opstats_stdout_log = dirs["logs"] / f"qaic_opstats_{stage_name}_stdout.log" + opstats_stderr_log = dirs["logs"] / f"qaic_opstats_{stage_name}_stderr.log" + self._run_subprocess_capture(opstats_cmd, opstats_stdout_log, opstats_stderr_log) + + if not keep_intermediate_files: + runner_input_dir = stage_io_dir / "runner_inputs" + if runner_input_dir.exists(): + for raw_file in runner_input_dir.glob("*.raw"): + raw_file.unlink() + + runner_stdout_text = runner_stdout_log.read_text() if runner_stdout_log.is_file() else "" + runner_stderr_text = runner_stderr_log.read_text() if runner_stderr_log.is_file() else "" + runner_metrics = self._parse_perf_metrics_from_text(runner_stdout_text + "\n" + runner_stderr_text) + + stage_results[stage_name] = { + "batch_input_json_path": str(batch_json_path), + "runner_command": " ".join(runner_cmd), + "opstats_command": " ".join(opstats_cmd), + "runner_metrics": runner_metrics, + "log_paths": { + "runner_command": str(runner_command_file), + "runner_stdout": str(runner_stdout_log), + "runner_stderr": str(runner_stderr_log), + "opstats_command": str(opstats_command_file), + "opstats_stdout": str(opstats_stdout_log), + "opstats_stderr": str(opstats_stderr_log), + }, + "profiling_output_dir": str(stage_profiling_dir), + "runner_outputs_dir": str(stage_runner_outputs_dir), + "opstats_output_dir": str(stage_opstats_dir), + } + + compile_stdout_text = "" + compile_stderr_text = "" + compile_stdout_log = dirs["compile_logs"] / "compiler_stdout.log" + compile_stderr_log = dirs["compile_logs"] / "compiler_stderr.log" + if compile_stdout_log.is_file(): + compile_stdout_text = compile_stdout_log.read_text() + if compile_stderr_log.is_file(): + compile_stderr_text = compile_stderr_log.read_text() + + compile_command = None + compile_cmd_file = dirs["compile_logs"] / "compiler_command.txt" + hashed_compile_params_path = qpc_path.parent / "hashed_compile_params.json" + if hashed_compile_params_path.is_file(): + with open(hashed_compile_params_path, "r") as f: + compile_command_list = json.load(f).get("command", []) + compile_command = " ".join(compile_command_list) + if compile_command: + compile_cmd_file.write_text(compile_command) + elif compile_cmd_file.is_file(): + compile_command = compile_cmd_file.read_text().strip() + + compile_metrics = self._parse_perf_metrics_from_text(compile_stdout_text + "\n" + compile_stderr_text) + primary_stage = stages_to_run[0] + primary_result = stage_results[primary_stage] + + result = { + "qpc_path": str(qpc_path), + "output_dir": str(dirs["root"]), + "stages_ran": stages_to_run, + "batch_input_json_path": primary_result["batch_input_json_path"], + "batch_input_json_paths": {stage: stage_results[stage]["batch_input_json_path"] for stage in stages_to_run}, + 
"compile_command": compile_command, + "runner_command": primary_result["runner_command"], + "runner_commands": {stage: stage_results[stage]["runner_command"] for stage in stages_to_run}, + "opstats_command": primary_result["opstats_command"], + "opstats_commands": {stage: stage_results[stage]["opstats_command"] for stage in stages_to_run}, + "compile_metrics": compile_metrics, + "runner_metrics": {stage: stage_results[stage]["runner_metrics"] for stage in stages_to_run}, + "log_paths": { + "compile_stdout": str(compile_stdout_log), + "compile_stderr": str(compile_stderr_log), + "compile_command": str(compile_cmd_file), + **{stage: stage_results[stage]["log_paths"] for stage in stages_to_run}, + }, + "profiling_output_dir": primary_result["profiling_output_dir"], + "runner_outputs_dir": primary_result["runner_outputs_dir"], + "opstats_output_dir": primary_result["opstats_output_dir"], + "profiling_output_dirs": {stage: stage_results[stage]["profiling_output_dir"] for stage in stages_to_run}, + "runner_outputs_dirs": {stage: stage_results[stage]["runner_outputs_dir"] for stage in stages_to_run}, + "opstats_output_dirs": {stage: stage_results[stage]["opstats_output_dir"] for stage in stages_to_run}, + } + return result + def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): """ Validates and retrieves the number of speculative tokens for TLM models. diff --git a/examples/run_performance_analysis.py b/examples/run_performance_analysis.py new file mode 100644 index 000000000..3e930d9cf --- /dev/null +++ b/examples/run_performance_analysis.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +import argparse +import json +import logging +import os +import warnings +from pathlib import Path +from typing import Optional + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers +from QEfficient.utils.constants import Constants + + +def _suppress_warnings(): + os.environ["PYTHONWARNINGS"] = "ignore" + warnings.filterwarnings("ignore") + warnings.simplefilter("ignore") + logging.captureWarnings(True) + for logger_name in ("torch", "torch.onnx", "onnx", "onnxruntime"): + logging.getLogger(logger_name).setLevel(logging.ERROR) + + +def evaluate_model_performance( + model_name: str, + prompt_len: int = Constants.PROMPT_LEN, + ctx_len: int = Constants.CTX_LEN, + batch_size: int = 1, + prefill_only: bool = False, + num_cores: int = 14, + num_hidden_layers: int = 2, + runner_num_iters: int = 10, + profiling_type: str = "raw_device_stats", + profiling_start_iter: int = 2, + write_output_start_iter: Optional[int] = None, + output_dir: str = None, +): + _suppress_warnings() + replace_transformers_quantizers() + + model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=num_hidden_layers) + + result = model.evaluate_performance( + batch_size=batch_size, + prefill_only=prefill_only, + compile_kwargs={ + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "batch_size": batch_size, + "num_cores": num_cores, + "mxfp6_matmul": True, + "aic_enable_depth_first": False, + "mxint8_kv_cache": True, + }, + runner_num_iters=runner_num_iters, + profiling_type=profiling_type, + profiling_start_iter=profiling_start_iter, + write_output_start_iter=write_output_start_iter, + output_dir=output_dir, + ) + + report_path = Path(result["output_dir"]) / "performance_report.json" + report_path.write_text(json.dumps(result, indent=2)) + print(f"Performance report written to: {report_path}") + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Compile and run qaic performance analysis for CausalLM") + parser.add_argument("--model-name", type=str, required=True, help="HuggingFace model ID") + parser.add_argument("--prompt-len", type=int, default=Constants.PROMPT_LEN, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=Constants.CTX_LEN, help="Context length") + parser.add_argument("--batch-size", type=int, default=1, help="Batch size for compile and synthetic runner IO") + parser.add_argument("--prefill-only", action="store_true", help="Run performance analysis only for prefill stage") + parser.add_argument("--num-cores", type=int, default=14, help="Number of AI100 cores for compile") + parser.add_argument( + "--num-hidden-layers", + type=int, + default=2, + help="Number of model layers to load for performance analysis", + ) + parser.add_argument("--runner-num-iters", type=int, default=10, help="qaic-runner iterations") + parser.add_argument( + "--profiling-type", + type=str, + default="raw_device_stats", + choices=["stats", "trace", "latency", "raw_device_stats"], + help="qaic-runner profiling type", + ) + parser.add_argument( + "--profiling-start-iter", + type=int, + default=2, + help="qaic-runner profiling start iteration", + ) + parser.add_argument( + "--write-output-start-iter", + type=int, + default=None, + help="qaic-runner output write start iteration (must 
be >0 and < profiling-start-iter)", + ) + parser.add_argument("--output-dir", type=str, default=None, help="Directory for performance artifacts") + args = parser.parse_args() + + evaluate_model_performance( + model_name=args.model_name, + prompt_len=args.prompt_len, + ctx_len=args.ctx_len, + batch_size=args.batch_size, + prefill_only=args.prefill_only, + num_cores=args.num_cores, + num_hidden_layers=args.num_hidden_layers, + runner_num_iters=args.runner_num_iters, + profiling_type=args.profiling_type, + profiling_start_iter=args.profiling_start_iter, + write_output_start_iter=args.write_output_start_iter, + output_dir=args.output_dir, + ) diff --git a/tests/unit_test/utils/test_auto_model_api.py b/tests/unit_test/utils/test_auto_model_api.py index ae2a1d722..68c8cfdb4 100644 --- a/tests/unit_test/utils/test_auto_model_api.py +++ b/tests/unit_test/utils/test_auto_model_api.py @@ -20,6 +20,9 @@ All tests run on CPU only, using tiny in-memory models. """ +import subprocess +from pathlib import Path + import pytest import torch from transformers import GPT2Config, GPT2LMHeadModel @@ -658,3 +661,249 @@ def test_prefill_only_chunked_transform_has_module_mapping(self): assert hasattr(PrefillOnlyChunkedTransform, "_module_mapping") assert isinstance(PrefillOnlyChunkedTransform._module_mapping, dict) + + +@pytest.mark.cpu_only +class TestQEFFAutoModelForCausalLMEvaluatePerformance: + """evaluate_performance() API behavior and artifact generation.""" + + def _setup_qeff(self, tmp_path, monkeypatch): + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + + qeff = QEFFAutoModelForCausalLM(make_tiny_gpt2()) + fake_compile_dir = tmp_path / "compile" / "qpc-fakehash" + fake_qpc_path = fake_compile_dir / "qpc" + fake_qpc_path.mkdir(parents=True, exist_ok=True) + (fake_qpc_path / "programqpc.bin").write_bytes(b"qpc") + (fake_compile_dir / "specializations.json").write_text( + '{"specializations": [{"batch_size": "1", "seq_len": "8", "ctx_len": "16"}, {"batch_size": "1", "seq_len": "1", "ctx_len": "16"}]}' + ) + + compile_calls = [] + + def fake_compile(**kwargs): + compile_calls.append(kwargs) + return fake_qpc_path + + monkeypatch.setattr(qeff, "compile", fake_compile) + return qeff, fake_qpc_path, compile_calls + + def _mock_subprocess(self, monkeypatch): + import QEfficient.transformers.models.modeling_auto as modeling_auto + + def fake_run(cmd, capture_output=True, text=True): + exe = Path(cmd[0]).name + if exe == "qaic-runner": + profiling_dir = Path(cmd[cmd.index("--aic-profiling-out-dir") + 1]) + profiling_dir.mkdir(parents=True, exist_ok=True) + (profiling_dir / "runner_profile.raw").write_bytes(b"profile") + + out_dir = Path(cmd[cmd.index("--write-output-dir") + 1]) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "runner_output.raw").write_bytes(b"output") + return subprocess.CompletedProcess( + args=cmd, returncode=0, stdout="Inference per second: 111.5\n", stderr="" + ) + + if exe == "qaic-opstats": + opstats_out = Path(cmd[cmd.index("--output-dir") + 1]) + opstats_out.mkdir(parents=True, exist_ok=True) + (opstats_out / "summary.txt").write_text("summary") + (opstats_out / "trace.json").write_text("{}") + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="opstats ok\n", stderr="") + + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(modeling_auto.subprocess, "run", fake_run) + + def test_has_evaluate_performance_method(self): + from QEfficient.transformers.models.modeling_auto import 
QEFFAutoModelForCausalLM + + assert hasattr(QEFFAutoModelForCausalLM, "evaluate_performance") + assert callable(QEFFAutoModelForCausalLM.evaluate_performance) + + def test_evaluate_performance_always_calls_compile_and_forces_perf_flags(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf"), + compile_kwargs={"aic_perf_warning": False, "aic_perf_warnings": False}, + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + compile_kwargs_used = compile_calls[0] + assert compile_kwargs_used["aic_perf_metrics"] is True + assert compile_kwargs_used["aic_perf_warning"] is True + assert "aic_perf_warnings" not in compile_kwargs_used + assert compile_kwargs_used["stats_level"] == 70 + assert compile_kwargs_used["ddr_stats"] is True + assert compile_kwargs_used["aic_pmu_recipe"] == "KernelUtil" + + def test_evaluate_performance_non_raw_profiling_does_not_force_raw_compile_flags(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_trace"), + profiling_type="trace", + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + compile_kwargs_used = compile_calls[0] + assert "stats_level" not in compile_kwargs_used + assert "ddr_stats" not in compile_kwargs_used + assert "aic_pmu_recipe" not in compile_kwargs_used + + def test_evaluate_performance_creates_artifacts_with_new_directory_structure(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_no_opstats"), + runner_num_iters=1, + ) + + assert Path(result["batch_input_json_paths"]["prefill"]).is_file() + assert Path(result["batch_input_json_paths"]["decode"]).is_file() + assert Path(result["profiling_output_dirs"]["prefill"], "runner_profile.raw").is_file() + assert Path(result["profiling_output_dirs"]["decode"], "runner_profile.raw").is_file() + assert Path(result["runner_outputs_dirs"]["prefill"], "runner_output.raw").is_file() + assert Path(result["runner_outputs_dirs"]["decode"], "runner_output.raw").is_file() + assert Path(result["log_paths"]["prefill"]["runner_command"]).is_file() + assert Path(result["log_paths"]["decode"]["runner_command"]).is_file() + assert "qaic-runner" in Path(result["log_paths"]["prefill"]["runner_command"]).read_text() + assert "qaic-runner" in Path(result["log_paths"]["decode"]["runner_command"]).read_text() + assert Path(result["opstats_output_dirs"]["prefill"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["prefill"], "trace.json").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "trace.json").is_file() + assert Path(result["log_paths"]["prefill"]["opstats_command"]).is_file() + assert Path(result["log_paths"]["decode"]["opstats_command"]).is_file() + assert "qaic-opstats" in Path(result["log_paths"]["prefill"]["opstats_command"]).read_text() + assert "qaic-opstats" in Path(result["log_paths"]["decode"]["opstats_command"]).read_text() + assert "--aic-profiling-out-dir" in result["runner_command"] + assert "--write-output-start-iter 1" in result["runner_command"] + + output_dir = Path(result["output_dir"]) + assert (output_dir / "compile").is_dir() + assert (output_dir 
/ "io").is_dir() + assert (output_dir / "io" / "prefill").is_dir() + assert (output_dir / "io" / "decode").is_dir() + assert (output_dir / "performance_analysis").is_dir() + assert (output_dir / "compile" / "compile_logs").is_dir() + assert (output_dir / "performance_analysis" / "profiling").is_dir() + assert (output_dir / "performance_analysis" / "profiling" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "profiling" / "decode").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs" / "decode").is_dir() + assert (output_dir / "performance_analysis" / "opstats").is_dir() + assert (output_dir / "performance_analysis" / "opstats" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "opstats" / "decode").is_dir() + + def test_evaluate_performance_without_output_dir_uses_qpc_parent_layout(self, tmp_path, monkeypatch): + qeff, fake_qpc_path, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + assert "compile_dir" not in compile_calls[0] + + expected_root = fake_qpc_path.parent.parent + assert Path(result["output_dir"]) == expected_root + assert (expected_root / "io").is_dir() + assert (expected_root / "io" / "prefill").is_dir() + assert (expected_root / "io" / "decode").is_dir() + assert (expected_root / "performance_analysis").is_dir() + assert (fake_qpc_path.parent / "compile_logs").is_dir() + + def test_evaluate_performance_writes_opstats_command_log(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_with_opstats"), + runner_num_iters=1, + ) + + assert Path(result["opstats_output_dirs"]["prefill"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["prefill"], "trace.json").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "trace.json").is_file() + assert Path(result["log_paths"]["prefill"]["runner_command"]).is_file() + assert Path(result["log_paths"]["decode"]["runner_command"]).is_file() + assert Path(result["log_paths"]["prefill"]["opstats_command"]).is_file() + assert Path(result["log_paths"]["decode"]["opstats_command"]).is_file() + assert "qaic-opstats" in Path(result["log_paths"]["prefill"]["opstats_command"]).read_text() + assert "qaic-opstats" in Path(result["log_paths"]["decode"]["opstats_command"]).read_text() + + def test_evaluate_performance_allows_write_output_start_iter_override(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_custom_write_output"), + profiling_start_iter=4, + write_output_start_iter=2, + runner_num_iters=1, + ) + + assert "--write-output-start-iter 2" in result["runner_commands"]["prefill"] + assert "--write-output-start-iter 2" in result["runner_commands"]["decode"] + + def test_evaluate_performance_rejects_invalid_write_output_start_iter(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + with pytest.raises(ValueError, 
match="write_output_start_iter"): + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_bad_write_output"), + profiling_start_iter=3, + write_output_start_iter=3, + runner_num_iters=1, + ) + + def test_evaluate_performance_prefill_only_runs_only_prefill_stage(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_prefill_only"), + prefill_only=True, + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + assert compile_calls[0]["prefill_only"] is True + assert result["stages_ran"] == ["prefill"] + assert "prefill" in result["runner_commands"] + assert "decode" not in result["runner_commands"] + assert "prefill" in result["batch_input_json_paths"] + assert "decode" not in result["batch_input_json_paths"] + assert Path(result["output_dir"], "io", "prefill").is_dir() + assert not Path(result["output_dir"], "io", "decode").exists() + + def test_evaluate_performance_prompt_len_one_runs_decode_only(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_decode_only"), + prefill_only=True, + compile_kwargs={"prefill_seq_len": 1}, + runner_num_iters=1, + ) + + assert result["stages_ran"] == ["decode"] + assert "decode" in result["runner_commands"] + assert "prefill" not in result["runner_commands"] + assert "decode" in result["batch_input_json_paths"] + assert "prefill" not in result["batch_input_json_paths"] + assert Path(result["output_dir"], "io", "decode").is_dir() + assert not Path(result["output_dir"], "io", "prefill").exists()