diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6f22e867e..8d7163956 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -390,6 +390,12 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. """ + # Backward-compatible alias accepted by some APIs. + if "aic_perf_warning" in compiler_options: + if "aic_perf_warnings" not in compiler_options: + compiler_options["aic_perf_warnings"] = compiler_options["aic_perf_warning"] + compiler_options.pop("aic_perf_warning", None) + onnx_path = Path( onnx_path if onnx_path diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3a47aa5ff..5f9e71a24 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -5,11 +5,14 @@ # # ---------------------------------------------------------------------------- +import json import os +import re +import subprocess import warnings from pathlib import Path from time import perf_counter -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import torch @@ -3487,6 +3490,422 @@ def generate( else: raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") + @staticmethod + def _parse_perf_metrics_from_text(text: str) -> Dict[str, float]: + """ + Parse best-effort numeric performance metrics from QAIC logs. + + Parameters + ---------- + text : str + Raw log text from compiler/runner. + + Returns + ------- + Dict[str, float] + Parsed key-value metrics. Missing fields are omitted. + """ + patterns = { + "ddr_reads": r"DDR reads\s*[:=]\s*([0-9]*\.?[0-9]+)", + "ddr_writes": r"DDR writes\s*[:=]\s*([0-9]*\.?[0-9]+)", + "total_ddr_traffic": r"Total DDR Traffic\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inference_per_second": r"Inference per second\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inferences_per_second": r"inferences per second\s*[:=]\s*([0-9]*\.?[0-9]+)", + "inf_per_sec": r"Inf/Sec\s*([0-9]*\.?[0-9]+)", + "latency_ms": r"latency[^0-9]*([0-9]*\.?[0-9]+)\s*ms", + "total_duration_us": r"TotalDuration\s*([0-9]*\.?[0-9]+)\s*us", + "aic_hmx_mac_estimate": r"AIC HMX MAC estimate:\s*([0-9,]+)", + "static_constants_size_gb": r"StaticConstantsSize:\s*([0-9]*\.?[0-9]+)\s*GB", + "dynamic_constants_size_mb": r"DynamicConstantsSize:\s*([0-9]*\.?[0-9]+)\s*MB", + "total_ddr_read_kb": r"total:\s*DDR traffic\s*read:\s*([0-9,]*\.?[0-9]+)\s*KB", + "total_ddr_write_kb": r"total:\s*DDR traffic write:\s*([0-9,]*\.?[0-9]+)\s*KB", + "total_mc_write_kb": r"total:\s*MC\s*traffic write:\s*([0-9,]*\.?[0-9]+)\s*KB", + } + parsed: Dict[str, float] = {} + for key, pattern in patterns.items(): + matches = re.findall(pattern, text, flags=re.IGNORECASE) + if matches: + try: + value = matches[-1].replace(",", "") + parsed[key] = float(value) + except ValueError: + continue + return parsed + + @staticmethod + def _run_subprocess_capture(command: List[str], stdout_log: Path, stderr_log: Path) -> subprocess.CompletedProcess: + """ + Run subprocess command, persist stdout/stderr logs, and return completed process. 
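+
+ Raises
+ ------
+ RuntimeError
+ If the command exits with a non-zero return code. The stdout/stderr logs are still written, and their paths are included in the error message.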
+ """ + result = subprocess.run(command, capture_output=True, text=True) + stdout_log.parent.mkdir(parents=True, exist_ok=True) + stderr_log.parent.mkdir(parents=True, exist_ok=True) + stdout_log.write_text(result.stdout or "") + stderr_log.write_text(result.stderr or "") + if result.returncode != 0: + raise RuntimeError( + "\n".join( + [ + "Command failed!", + f"Command: {' '.join(command)}", + f"Exit code: {result.returncode}", + f"Stdout log: {stdout_log}", + f"Stderr log: {stderr_log}", + ] + ) + ) + return result + + @staticmethod + def _prepare_perf_output_dirs(output_dir: Optional[str], qpc_path: Path) -> Dict[str, Path]: + """ + Prepare output directories for evaluate_performance artifacts. + """ + if output_dir: + root_dir = Path(output_dir) + compile_dir = root_dir / "compile" + else: + # qpc_path: <...>/qpc-/qpc + # Keep io/performance siblings of qpc-, as requested. + compile_dir = qpc_path.parent + root_dir = compile_dir.parent + + dirs = { + "root": root_dir, + "compile": compile_dir, + "compile_logs": compile_dir / "compile_logs", + "io": root_dir / "io", + "performance_analysis": root_dir / "performance_analysis", + "logs": root_dir / "performance_analysis" / "logs", + "runner_outputs": root_dir / "performance_analysis" / "runner_outputs", + "profiling": root_dir / "performance_analysis" / "profiling", + "opstats": root_dir / "performance_analysis" / "opstats", + } + root_dir.mkdir(parents=True, exist_ok=True) + dirs["compile"].mkdir(parents=True, exist_ok=True) + dirs["compile_logs"].mkdir(parents=True, exist_ok=True) + dirs["logs"].mkdir(parents=True, exist_ok=True) + dirs["io"].mkdir(parents=True, exist_ok=True) + dirs["performance_analysis"].mkdir(parents=True, exist_ok=True) + dirs["runner_outputs"].mkdir(parents=True, exist_ok=True) + dirs["profiling"].mkdir(parents=True, exist_ok=True) + dirs["opstats"].mkdir(parents=True, exist_ok=True) + return dirs + + def _create_runner_batch_json( + self, + *, + mode: str, + batch_size: int, + seq_len: int, + io_dir: Path, + ) -> Path: + """ + Create a qaic-runner batch-input JSON with deterministic synthetic inputs. + """ + if mode not in {"prefill", "decode"}: + raise ValueError(f"Unknown mode {mode}. Expected one of: prefill, decode.") + + if mode == "prefill": + input_ids = np.ones((batch_size, seq_len), dtype=np.int64) + position_ids = np.tile(np.arange(seq_len, dtype=np.int64), (batch_size, 1)) + else: + input_ids = np.ones((batch_size, 1), dtype=np.int64) + position_ids = np.zeros((batch_size, 1), dtype=np.int64) + + inputs = {"input_ids": input_ids, "position_ids": position_ids} + + if self.continuous_batching: + inputs["batch_index"] = np.arange(batch_size, dtype=np.int64).reshape(batch_size, 1) + + if self.is_tlm: + inputs["num_logits_to_keep"] = np.zeros((1, 1), dtype=np.int64) + + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + raise NotImplementedError("evaluate_performance currently does not support include_sampler=True") + + write_io_files(inputs, {}, str(io_dir), "runner_inputs", "aic_batch_io", include_dims=True, reset=True) + batch_json_path = io_dir / "aic_batch_io.json" + + # Convert relative paths to absolute paths for robust qaic-runner execution. 
+ with open(batch_json_path, "r") as f: + batch_data = json.load(f) + + for io_group in batch_data.get("IO-files", []): + for entry in io_group: + if "path" in entry: + entry["path"] = str((io_dir / entry["path"]).resolve()) + + with open(batch_json_path, "w") as f: + json.dump(batch_data, f, indent=2) + + return batch_json_path + + def evaluate_performance( + self, + *, + batch_size: int = 1, + prefill_only: bool = False, + compile_kwargs: Optional[dict] = None, + runner_num_iters: int = 10, + profiling_type: str = "raw_device_stats", + profiling_start_iter: int = 2, + profiling_num_samples: int = 5, + write_output_start_iter: Optional[int] = None, + output_dir: Optional[str] = None, + keep_intermediate_files: bool = True, + runner_extra_args: Optional[List[str]] = None, + ) -> dict: + """ + Compile with perf flags, execute with qaic-runner, and dump available performance artifacts. + + Parameters + ---------- + batch_size : int, optional + Batch size for compilation and synthetic runner inputs. Defaults to 1. + prefill_only : bool, optional + If True, run performance analysis only for prefill inputs when `prefill_seq_len > 1`. + If `prefill_seq_len == 1`, analysis runs decode-only regardless of this flag. + compile_kwargs : dict, optional + Args passed to compile API. Perf flags are always forced in this API. + runner_num_iters : int, optional + Number of qaic-runner iterations. Defaults to 10. + profiling_type : str, optional + qaic-runner profiling type. Defaults to "raw_device_stats". + profiling_start_iter : int, optional + Iteration to start collecting profiles from. Defaults to 2. + profiling_num_samples : int, optional + Number of profile samples. Defaults to 5. + write_output_start_iter : int, optional + Iteration to start writing outputs from qaic-runner. + Must be greater than 0 and less than `profiling_start_iter`. + If not provided, defaults to `profiling_start_iter - 1`. + output_dir : str, optional + Directory where all artifacts/logs will be written. + keep_intermediate_files : bool, optional + If False, removes generated raw input files after runner execution. + runner_extra_args : List[str], optional + Additional flags forwarded to qaic-runner. + + Returns + ------- + dict + Structured metadata including commands, paths, and parsed metrics. + """ + if batch_size <= 0: + raise ValueError("`batch_size` must be a positive integer.") + if profiling_start_iter <= 1: + raise ValueError("`profiling_start_iter` must be greater than 1 to derive `write_output_start_iter`.") + if write_output_start_iter is None: + write_output_start_iter = profiling_start_iter - 1 + if write_output_start_iter <= 0 or write_output_start_iter >= profiling_start_iter: + raise ValueError( + "`write_output_start_iter` must be > 0 and < `profiling_start_iter`. " + f"Got write_output_start_iter={write_output_start_iter}, profiling_start_iter={profiling_start_iter}." 
+ ) + + compile_kwargs = dict(compile_kwargs or {}) + compile_kwargs.pop("aic_perf_warnings", None) + compile_kwargs.setdefault("batch_size", batch_size) + if prefill_only: + compile_kwargs["prefill_only"] = True + requested_prefill_seq_len = int(compile_kwargs.get("prefill_seq_len", 32)) + compile_prefill_only = compile_kwargs.get("prefill_only", None) is True + run_prefill_only = prefill_only or compile_prefill_only + compile_kwargs["aic_perf_metrics"] = True + compile_kwargs["aic_perf_warning"] = True + if profiling_type == "raw_device_stats": + compile_kwargs["stats_level"] = 70 + compile_kwargs["ddr_stats"] = True + compile_kwargs["aic_pmu_recipe"] = "KernelUtil" + + if requested_prefill_seq_len == 1: + stages_to_run = ["decode"] + elif run_prefill_only: + stages_to_run = ["prefill"] + else: + stages_to_run = ["prefill", "decode"] + + if output_dir and "compile_dir" not in compile_kwargs: + compile_kwargs["compile_dir"] = str(Path(output_dir) / "compile") + + qpc_path = Path(self.compile(**compile_kwargs)) + if not qpc_path.is_dir() or not (qpc_path / "programqpc.bin").is_file(): + raise RuntimeError(f"Compiled QPC directory is invalid: {qpc_path}") + dirs = self._prepare_perf_output_dirs(output_dir, qpc_path) + + specializations_path = qpc_path.parent / "specializations.json" + if not specializations_path.is_file(): + raise FileNotFoundError(f"Expected specializations.json at: {specializations_path}") + + with open(specializations_path, "r") as f: + specializations = json.load(f)["specializations"] + prefill_specs = [spec for spec in specializations if int(spec.get("seq_len", 0)) > 1] + decode_specs = [spec for spec in specializations if int(spec.get("seq_len", 0)) == 1] + + if "prefill" in stages_to_run and not prefill_specs: + raise RuntimeError("Could not find prefill specialization (seq_len > 1) in compiled specializations.") + if "decode" in stages_to_run and not decode_specs: + raise RuntimeError("Could not find decode specialization (seq_len == 1) in compiled specializations.") + + stage_specs = {} + if "prefill" in stages_to_run: + stage_specs["prefill"] = { + "batch_size": int(prefill_specs[0]["batch_size"]), + "seq_len": int(prefill_specs[0]["seq_len"]), + } + if "decode" in stages_to_run: + stage_specs["decode"] = { + "batch_size": int(decode_specs[0]["batch_size"]), + "seq_len": 1, + } + + stage_results = {} + for stage_name, stage_spec in stage_specs.items(): + stage_io_dir = dirs["io"] / stage_name + stage_profiling_dir = dirs["profiling"] / stage_name + stage_runner_outputs_dir = dirs["runner_outputs"] / stage_name + stage_opstats_dir = dirs["opstats"] / stage_name + stage_io_dir.mkdir(parents=True, exist_ok=True) + stage_profiling_dir.mkdir(parents=True, exist_ok=True) + stage_runner_outputs_dir.mkdir(parents=True, exist_ok=True) + stage_opstats_dir.mkdir(parents=True, exist_ok=True) + + batch_json_path = self._create_runner_batch_json( + mode=stage_name, + batch_size=stage_spec["batch_size"], + seq_len=stage_spec["seq_len"], + io_dir=stage_io_dir, + ) + + runner_cmd = [ + "/opt/qti-aic/exec/qaic-runner", + "-t", + str(qpc_path), + "-n", + str(runner_num_iters), + "--aic-profiling-type", + str(profiling_type), + "--aic-profiling-start-iter", + str(profiling_start_iter), + "--aic-profiling-num-samples", + str(profiling_num_samples), + "--aic-profiling-out-dir", + str(stage_profiling_dir), + "--write-output-dir", + str(stage_runner_outputs_dir), + "--write-output-start-iter", + str(write_output_start_iter), + "--aic-batch-json-input", + str(batch_json_path), + ] + if 
runner_extra_args: + runner_cmd.extend(runner_extra_args) + runner_command_file = dirs["logs"] / f"runner_command_{stage_name}.txt" + runner_command_file.write_text(" ".join(runner_cmd)) + + runner_stdout_log = dirs["logs"] / f"qaic_runner_{stage_name}_stdout.log" + runner_stderr_log = dirs["logs"] / f"qaic_runner_{stage_name}_stderr.log" + self._run_subprocess_capture(runner_cmd, runner_stdout_log, runner_stderr_log) + + opstats_cmd = [ + "/opt/qti-aic/exec/qaic-opstats", + "--qpc", + str(qpc_path / "programqpc.bin"), + "--input-dir", + str(stage_profiling_dir), + "--output-dir", + str(stage_opstats_dir), + "--summary", + "--trace", + ] + opstats_command_file = dirs["logs"] / f"opstats_command_{stage_name}.txt" + opstats_command_file.write_text(" ".join(opstats_cmd)) + opstats_stdout_log = dirs["logs"] / f"qaic_opstats_{stage_name}_stdout.log" + opstats_stderr_log = dirs["logs"] / f"qaic_opstats_{stage_name}_stderr.log" + self._run_subprocess_capture(opstats_cmd, opstats_stdout_log, opstats_stderr_log) + + if not keep_intermediate_files: + runner_input_dir = stage_io_dir / "runner_inputs" + if runner_input_dir.exists(): + for raw_file in runner_input_dir.glob("*.raw"): + raw_file.unlink() + + runner_stdout_text = runner_stdout_log.read_text() if runner_stdout_log.is_file() else "" + runner_stderr_text = runner_stderr_log.read_text() if runner_stderr_log.is_file() else "" + runner_metrics = self._parse_perf_metrics_from_text(runner_stdout_text + "\n" + runner_stderr_text) + + stage_results[stage_name] = { + "batch_input_json_path": str(batch_json_path), + "runner_command": " ".join(runner_cmd), + "opstats_command": " ".join(opstats_cmd), + "runner_metrics": runner_metrics, + "log_paths": { + "runner_command": str(runner_command_file), + "runner_stdout": str(runner_stdout_log), + "runner_stderr": str(runner_stderr_log), + "opstats_command": str(opstats_command_file), + "opstats_stdout": str(opstats_stdout_log), + "opstats_stderr": str(opstats_stderr_log), + }, + "profiling_output_dir": str(stage_profiling_dir), + "runner_outputs_dir": str(stage_runner_outputs_dir), + "opstats_output_dir": str(stage_opstats_dir), + } + + compile_stdout_text = "" + compile_stderr_text = "" + compile_stdout_log = dirs["compile_logs"] / "compiler_stdout.log" + compile_stderr_log = dirs["compile_logs"] / "compiler_stderr.log" + if compile_stdout_log.is_file(): + compile_stdout_text = compile_stdout_log.read_text() + if compile_stderr_log.is_file(): + compile_stderr_text = compile_stderr_log.read_text() + + compile_command = None + compile_cmd_file = dirs["compile_logs"] / "compiler_command.txt" + hashed_compile_params_path = qpc_path.parent / "hashed_compile_params.json" + if hashed_compile_params_path.is_file(): + with open(hashed_compile_params_path, "r") as f: + compile_command_list = json.load(f).get("command", []) + compile_command = " ".join(compile_command_list) + if compile_command: + compile_cmd_file.write_text(compile_command) + elif compile_cmd_file.is_file(): + compile_command = compile_cmd_file.read_text().strip() + + compile_metrics = self._parse_perf_metrics_from_text(compile_stdout_text + "\n" + compile_stderr_text) + primary_stage = stages_to_run[0] + primary_result = stage_results[primary_stage] + + result = { + "qpc_path": str(qpc_path), + "output_dir": str(dirs["root"]), + "stages_ran": stages_to_run, + "batch_input_json_path": primary_result["batch_input_json_path"], + "batch_input_json_paths": {stage: stage_results[stage]["batch_input_json_path"] for stage in stages_to_run}, + 
"compile_command": compile_command, + "runner_command": primary_result["runner_command"], + "runner_commands": {stage: stage_results[stage]["runner_command"] for stage in stages_to_run}, + "opstats_command": primary_result["opstats_command"], + "opstats_commands": {stage: stage_results[stage]["opstats_command"] for stage in stages_to_run}, + "compile_metrics": compile_metrics, + "runner_metrics": {stage: stage_results[stage]["runner_metrics"] for stage in stages_to_run}, + "log_paths": { + "compile_stdout": str(compile_stdout_log), + "compile_stderr": str(compile_stderr_log), + "compile_command": str(compile_cmd_file), + **{stage: stage_results[stage]["log_paths"] for stage in stages_to_run}, + }, + "profiling_output_dir": primary_result["profiling_output_dir"], + "runner_outputs_dir": primary_result["runner_outputs_dir"], + "opstats_output_dir": primary_result["opstats_output_dir"], + "profiling_output_dirs": {stage: stage_results[stage]["profiling_output_dir"] for stage in stages_to_run}, + "runner_outputs_dirs": {stage: stage_results[stage]["runner_outputs_dir"] for stage in stages_to_run}, + "opstats_output_dirs": {stage: stage_results[stage]["opstats_output_dir"] for stage in stages_to_run}, + } + return result + def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): """ Validates and retrieves the number of speculative tokens for TLM models. diff --git a/examples/run_performance_analysis.py b/examples/run_performance_analysis.py new file mode 100644 index 000000000..3e930d9cf --- /dev/null +++ b/examples/run_performance_analysis.py @@ -0,0 +1,123 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +import argparse +import json +import logging +import os +import warnings +from pathlib import Path +from typing import Optional + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers +from QEfficient.utils.constants import Constants + + +def _suppress_warnings(): + os.environ["PYTHONWARNINGS"] = "ignore" + warnings.filterwarnings("ignore") + warnings.simplefilter("ignore") + logging.captureWarnings(True) + for logger_name in ("torch", "torch.onnx", "onnx", "onnxruntime"): + logging.getLogger(logger_name).setLevel(logging.ERROR) + + +def evaluate_model_performance( + model_name: str, + prompt_len: int = Constants.PROMPT_LEN, + ctx_len: int = Constants.CTX_LEN, + batch_size: int = 1, + prefill_only: bool = False, + num_cores: int = 14, + num_hidden_layers: int = 2, + runner_num_iters: int = 10, + profiling_type: str = "raw_device_stats", + profiling_start_iter: int = 2, + write_output_start_iter: Optional[int] = None, + output_dir: str = None, +): + _suppress_warnings() + replace_transformers_quantizers() + + model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=num_hidden_layers) + + result = model.evaluate_performance( + batch_size=batch_size, + prefill_only=prefill_only, + compile_kwargs={ + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "batch_size": batch_size, + "num_cores": num_cores, + "mxfp6_matmul": True, + "aic_enable_depth_first": False, + "mxint8_kv_cache": True, + }, + runner_num_iters=runner_num_iters, + profiling_type=profiling_type, + profiling_start_iter=profiling_start_iter, + write_output_start_iter=write_output_start_iter, + output_dir=output_dir, + ) + + report_path = Path(result["output_dir"]) / "performance_report.json" + report_path.write_text(json.dumps(result, indent=2)) + print(f"Performance report written to: {report_path}") + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Compile and run qaic performance analysis for CausalLM") + parser.add_argument("--model-name", type=str, required=True, help="HuggingFace model ID") + parser.add_argument("--prompt-len", type=int, default=Constants.PROMPT_LEN, help="Prefill sequence length") + parser.add_argument("--ctx-len", type=int, default=Constants.CTX_LEN, help="Context length") + parser.add_argument("--batch-size", type=int, default=1, help="Batch size for compile and synthetic runner IO") + parser.add_argument("--prefill-only", action="store_true", help="Run performance analysis only for prefill stage") + parser.add_argument("--num-cores", type=int, default=14, help="Number of AI100 cores for compile") + parser.add_argument( + "--num-hidden-layers", + type=int, + default=2, + help="Number of model layers to load for performance analysis", + ) + parser.add_argument("--runner-num-iters", type=int, default=10, help="qaic-runner iterations") + parser.add_argument( + "--profiling-type", + type=str, + default="raw_device_stats", + choices=["stats", "trace", "latency", "raw_device_stats"], + help="qaic-runner profiling type", + ) + parser.add_argument( + "--profiling-start-iter", + type=int, + default=2, + help="qaic-runner profiling start iteration", + ) + parser.add_argument( + "--write-output-start-iter", + type=int, + default=None, + help="qaic-runner output write start iteration (must 
be >0 and < profiling-start-iter)", + ) + parser.add_argument("--output-dir", type=str, default=None, help="Directory for performance artifacts") + args = parser.parse_args() + + evaluate_model_performance( + model_name=args.model_name, + prompt_len=args.prompt_len, + ctx_len=args.ctx_len, + batch_size=args.batch_size, + prefill_only=args.prefill_only, + num_cores=args.num_cores, + num_hidden_layers=args.num_hidden_layers, + runner_num_iters=args.runner_num_iters, + profiling_type=args.profiling_type, + profiling_start_iter=args.profiling_start_iter, + write_output_start_iter=args.write_output_start_iter, + output_dir=args.output_dir, + ) diff --git a/tests/unit_test/utils/test_auto_model_api.py b/tests/unit_test/utils/test_auto_model_api.py index ae2a1d722..68c8cfdb4 100644 --- a/tests/unit_test/utils/test_auto_model_api.py +++ b/tests/unit_test/utils/test_auto_model_api.py @@ -20,6 +20,9 @@ All tests run on CPU only, using tiny in-memory models. """ +import subprocess +from pathlib import Path + import pytest import torch from transformers import GPT2Config, GPT2LMHeadModel @@ -658,3 +661,249 @@ def test_prefill_only_chunked_transform_has_module_mapping(self): assert hasattr(PrefillOnlyChunkedTransform, "_module_mapping") assert isinstance(PrefillOnlyChunkedTransform._module_mapping, dict) + + +@pytest.mark.cpu_only +class TestQEFFAutoModelForCausalLMEvaluatePerformance: + """evaluate_performance() API behavior and artifact generation.""" + + def _setup_qeff(self, tmp_path, monkeypatch): + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + + qeff = QEFFAutoModelForCausalLM(make_tiny_gpt2()) + fake_compile_dir = tmp_path / "compile" / "qpc-fakehash" + fake_qpc_path = fake_compile_dir / "qpc" + fake_qpc_path.mkdir(parents=True, exist_ok=True) + (fake_qpc_path / "programqpc.bin").write_bytes(b"qpc") + (fake_compile_dir / "specializations.json").write_text( + '{"specializations": [{"batch_size": "1", "seq_len": "8", "ctx_len": "16"}, {"batch_size": "1", "seq_len": "1", "ctx_len": "16"}]}' + ) + + compile_calls = [] + + def fake_compile(**kwargs): + compile_calls.append(kwargs) + return fake_qpc_path + + monkeypatch.setattr(qeff, "compile", fake_compile) + return qeff, fake_qpc_path, compile_calls + + def _mock_subprocess(self, monkeypatch): + import QEfficient.transformers.models.modeling_auto as modeling_auto + + def fake_run(cmd, capture_output=True, text=True): + exe = Path(cmd[0]).name + if exe == "qaic-runner": + profiling_dir = Path(cmd[cmd.index("--aic-profiling-out-dir") + 1]) + profiling_dir.mkdir(parents=True, exist_ok=True) + (profiling_dir / "runner_profile.raw").write_bytes(b"profile") + + out_dir = Path(cmd[cmd.index("--write-output-dir") + 1]) + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "runner_output.raw").write_bytes(b"output") + return subprocess.CompletedProcess( + args=cmd, returncode=0, stdout="Inference per second: 111.5\n", stderr="" + ) + + if exe == "qaic-opstats": + opstats_out = Path(cmd[cmd.index("--output-dir") + 1]) + opstats_out.mkdir(parents=True, exist_ok=True) + (opstats_out / "summary.txt").write_text("summary") + (opstats_out / "trace.json").write_text("{}") + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="opstats ok\n", stderr="") + + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(modeling_auto.subprocess, "run", fake_run) + + def test_has_evaluate_performance_method(self): + from QEfficient.transformers.models.modeling_auto import 
QEFFAutoModelForCausalLM + + assert hasattr(QEFFAutoModelForCausalLM, "evaluate_performance") + assert callable(QEFFAutoModelForCausalLM.evaluate_performance) + + def test_evaluate_performance_always_calls_compile_and_forces_perf_flags(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf"), + compile_kwargs={"aic_perf_warning": False, "aic_perf_warnings": False}, + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + compile_kwargs_used = compile_calls[0] + assert compile_kwargs_used["aic_perf_metrics"] is True + assert compile_kwargs_used["aic_perf_warning"] is True + assert "aic_perf_warnings" not in compile_kwargs_used + assert compile_kwargs_used["stats_level"] == 70 + assert compile_kwargs_used["ddr_stats"] is True + assert compile_kwargs_used["aic_pmu_recipe"] == "KernelUtil" + + def test_evaluate_performance_non_raw_profiling_does_not_force_raw_compile_flags(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_trace"), + profiling_type="trace", + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + compile_kwargs_used = compile_calls[0] + assert "stats_level" not in compile_kwargs_used + assert "ddr_stats" not in compile_kwargs_used + assert "aic_pmu_recipe" not in compile_kwargs_used + + def test_evaluate_performance_creates_artifacts_with_new_directory_structure(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_no_opstats"), + runner_num_iters=1, + ) + + assert Path(result["batch_input_json_paths"]["prefill"]).is_file() + assert Path(result["batch_input_json_paths"]["decode"]).is_file() + assert Path(result["profiling_output_dirs"]["prefill"], "runner_profile.raw").is_file() + assert Path(result["profiling_output_dirs"]["decode"], "runner_profile.raw").is_file() + assert Path(result["runner_outputs_dirs"]["prefill"], "runner_output.raw").is_file() + assert Path(result["runner_outputs_dirs"]["decode"], "runner_output.raw").is_file() + assert Path(result["log_paths"]["prefill"]["runner_command"]).is_file() + assert Path(result["log_paths"]["decode"]["runner_command"]).is_file() + assert "qaic-runner" in Path(result["log_paths"]["prefill"]["runner_command"]).read_text() + assert "qaic-runner" in Path(result["log_paths"]["decode"]["runner_command"]).read_text() + assert Path(result["opstats_output_dirs"]["prefill"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["prefill"], "trace.json").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "trace.json").is_file() + assert Path(result["log_paths"]["prefill"]["opstats_command"]).is_file() + assert Path(result["log_paths"]["decode"]["opstats_command"]).is_file() + assert "qaic-opstats" in Path(result["log_paths"]["prefill"]["opstats_command"]).read_text() + assert "qaic-opstats" in Path(result["log_paths"]["decode"]["opstats_command"]).read_text() + assert "--aic-profiling-out-dir" in result["runner_command"] + assert "--write-output-start-iter 1" in result["runner_command"] + + output_dir = Path(result["output_dir"]) + assert (output_dir / "compile").is_dir() + assert (output_dir 
/ "io").is_dir() + assert (output_dir / "io" / "prefill").is_dir() + assert (output_dir / "io" / "decode").is_dir() + assert (output_dir / "performance_analysis").is_dir() + assert (output_dir / "compile" / "compile_logs").is_dir() + assert (output_dir / "performance_analysis" / "profiling").is_dir() + assert (output_dir / "performance_analysis" / "profiling" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "profiling" / "decode").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "runner_outputs" / "decode").is_dir() + assert (output_dir / "performance_analysis" / "opstats").is_dir() + assert (output_dir / "performance_analysis" / "opstats" / "prefill").is_dir() + assert (output_dir / "performance_analysis" / "opstats" / "decode").is_dir() + + def test_evaluate_performance_without_output_dir_uses_qpc_parent_layout(self, tmp_path, monkeypatch): + qeff, fake_qpc_path, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + assert "compile_dir" not in compile_calls[0] + + expected_root = fake_qpc_path.parent.parent + assert Path(result["output_dir"]) == expected_root + assert (expected_root / "io").is_dir() + assert (expected_root / "io" / "prefill").is_dir() + assert (expected_root / "io" / "decode").is_dir() + assert (expected_root / "performance_analysis").is_dir() + assert (fake_qpc_path.parent / "compile_logs").is_dir() + + def test_evaluate_performance_writes_opstats_command_log(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_with_opstats"), + runner_num_iters=1, + ) + + assert Path(result["opstats_output_dirs"]["prefill"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["prefill"], "trace.json").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "summary.txt").is_file() + assert Path(result["opstats_output_dirs"]["decode"], "trace.json").is_file() + assert Path(result["log_paths"]["prefill"]["runner_command"]).is_file() + assert Path(result["log_paths"]["decode"]["runner_command"]).is_file() + assert Path(result["log_paths"]["prefill"]["opstats_command"]).is_file() + assert Path(result["log_paths"]["decode"]["opstats_command"]).is_file() + assert "qaic-opstats" in Path(result["log_paths"]["prefill"]["opstats_command"]).read_text() + assert "qaic-opstats" in Path(result["log_paths"]["decode"]["opstats_command"]).read_text() + + def test_evaluate_performance_allows_write_output_start_iter_override(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_custom_write_output"), + profiling_start_iter=4, + write_output_start_iter=2, + runner_num_iters=1, + ) + + assert "--write-output-start-iter 2" in result["runner_commands"]["prefill"] + assert "--write-output-start-iter 2" in result["runner_commands"]["decode"] + + def test_evaluate_performance_rejects_invalid_write_output_start_iter(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + with pytest.raises(ValueError, 
match="write_output_start_iter"): + qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_bad_write_output"), + profiling_start_iter=3, + write_output_start_iter=3, + runner_num_iters=1, + ) + + def test_evaluate_performance_prefill_only_runs_only_prefill_stage(self, tmp_path, monkeypatch): + qeff, _, compile_calls = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_prefill_only"), + prefill_only=True, + runner_num_iters=1, + ) + + assert len(compile_calls) == 1 + assert compile_calls[0]["prefill_only"] is True + assert result["stages_ran"] == ["prefill"] + assert "prefill" in result["runner_commands"] + assert "decode" not in result["runner_commands"] + assert "prefill" in result["batch_input_json_paths"] + assert "decode" not in result["batch_input_json_paths"] + assert Path(result["output_dir"], "io", "prefill").is_dir() + assert not Path(result["output_dir"], "io", "decode").exists() + + def test_evaluate_performance_prompt_len_one_runs_decode_only(self, tmp_path, monkeypatch): + qeff, _, _ = self._setup_qeff(tmp_path, monkeypatch) + self._mock_subprocess(monkeypatch) + + result = qeff.evaluate_performance( + output_dir=str(tmp_path / "perf_decode_only"), + prefill_only=True, + compile_kwargs={"prefill_seq_len": 1}, + runner_num_iters=1, + ) + + assert result["stages_ran"] == ["decode"] + assert "decode" in result["runner_commands"] + assert "prefill" not in result["runner_commands"] + assert "decode" in result["batch_input_json_paths"] + assert "prefill" not in result["batch_input_json_paths"] + assert Path(result["output_dir"], "io", "decode").is_dir() + assert not Path(result["output_dir"], "io", "prefill").exists()