diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
old mode 100644
new mode 100755
index 1e3b331f427..2be02460944
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -137,6 +137,53 @@ else
   QNN_SDK_ROOT=""
 fi
 
+# Set a dynamic max export time threshold based on platform and configuration
+PLATFORM="x86"
+if [[ "$(uname)" == "Darwin" ]]; then
+  PLATFORM="macos"
+elif [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
+  PLATFORM="arm64"
+fi
+
+BUFFER_TIME=25
+
+# Look up the expected export time based on platform:dtype:mode:pt2e_quantize
+case "${PLATFORM}:${DTYPE}:${MODE}:${PT2E_QUANTIZE}" in
+
+  # Linux x86 configurations
+  "x86:fp32:portable:") ACT_EXPORT_TIME=72 ;;
+  "x86:fp32:xnnpack+custom:") ACT_EXPORT_TIME=276 ;;
+  "x86:bf16:portable:") ACT_EXPORT_TIME=75 ;;
+  "x86:bf16:custom:") ACT_EXPORT_TIME=65 ;;
+  "x86:fp32:xnnpack+custom+qe:") ACT_EXPORT_TIME=285 ;;
+  "x86:fp32:xnnpack+custom+quantize_kv:") ACT_EXPORT_TIME=295 ;;
+  "x86:fp32:xnnpack+quantize_kv:") ACT_EXPORT_TIME=356 ;;
+  "x86:fp32:qnn:16a16w") ACT_EXPORT_TIME=334 ;;
+  "x86:fp32:qnn:8a8w") ACT_EXPORT_TIME=81 ;;
+
+  # Linux ARM64 configurations
+  "arm64:fp32:portable:") ACT_EXPORT_TIME=124 ;;
+  "arm64:fp32:xnnpack+custom:") ACT_EXPORT_TIME=483 ;;
+  "arm64:bf16:portable:") ACT_EXPORT_TIME=118 ;;
+  "arm64:bf16:custom:") ACT_EXPORT_TIME=102 ;;
+  "arm64:fp32:xnnpack+custom+qe:") ACT_EXPORT_TIME=486 ;;
+  "arm64:fp32:xnnpack+custom+quantize_kv:") ACT_EXPORT_TIME=521 ;;
+  "arm64:fp32:xnnpack+quantize_kv:") ACT_EXPORT_TIME=514 ;;
+
+  # macOS configurations
+  "macos:fp32:mps:") ACT_EXPORT_TIME=30 ;;
+  "macos:fp32:coreml:") ACT_EXPORT_TIME=61 ;;
+  "macos:fp32:xnnpack+custom+quantize_kv:") ACT_EXPORT_TIME=133 ;;
+
+  # Default fallback for unknown configurations
+  *)
+    ACT_EXPORT_TIME=450
+    echo "Warning: No threshold defined for ${PLATFORM}:${DTYPE}:${MODE}:${PT2E_QUANTIZE}, using default: $((ACT_EXPORT_TIME + BUFFER_TIME))s"
+    ;;
+esac
+
+MAX_EXPORT_TIME=$((ACT_EXPORT_TIME + BUFFER_TIME))
+
 echo "QNN option ${QNN}"
 echo "QNN_SDK_ROOT: ${QNN_SDK_ROOT}"
 
@@ -254,9 +301,24 @@ fi
 if [[ "${QUANTIZE_KV_CACHE}" == "ON" ]]; then
   EXPORT_ARGS="${EXPORT_ARGS} model.quantize_kv_cache=true"
 fi
+
+EXPORT_START_TIME=$(date +%s)
+
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm ${EXPORT_ARGS}
+EXPORT_END_TIME=$(date +%s)
+EXPORT_DURATION=$((EXPORT_END_TIME - EXPORT_START_TIME))
+echo "Model export completed at $(date +"%Y-%m-%d %H:%M:%S") - Duration: ${EXPORT_DURATION} seconds"
+
+# Check export time against the threshold (fallback default: 450s + 25s buffer = 475s).
+if [[ ${EXPORT_DURATION} -gt ${MAX_EXPORT_TIME} ]]; then
+  echo "Failure: Export took ${EXPORT_DURATION}s (threshold: ${MAX_EXPORT_TIME}s). This PR may have regressed export time; review your changes or bump the threshold if appropriate."
+  exit 1
+fi
+
+echo "Success: Export time check passed: ${EXPORT_DURATION}s <= ${MAX_EXPORT_TIME}s"
+
 # Create tokenizer.bin.
 echo "Creating tokenizer.bin"
 $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
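Reading the hunk above: ACT_EXPORT_TIME appears to be the recently observed export time for each configuration, and the enforced limit is that value plus the shared 25-second BUFFER_TIME. A minimal standalone sketch of the arithmetic, using the x86/fp32/xnnpack+custom row (values copied from the table above):

    #!/bin/bash
    # Threshold arithmetic for "x86:fp32:xnnpack+custom:" from the case table
    ACT_EXPORT_TIME=276   # observed export time for this configuration, in seconds
    BUFFER_TIME=25        # shared slack added to every threshold
    MAX_EXPORT_TIME=$((ACT_EXPORT_TIME + BUFFER_TIME))
    echo "Export must finish within ${MAX_EXPORT_TIME}s"   # prints 301
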
echo "Creating tokenizer.bin" $PYTHON_EXECUTABLE -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin diff --git a/scripts/check_model_export_times.py b/scripts/check_model_export_times.py new file mode 100644 index 00000000000..f85a7c5a793 --- /dev/null +++ b/scripts/check_model_export_times.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import argparse +import re +from collections import defaultdict +from datetime import datetime + +import requests + + +class GithubActionsClient: + + def __init__(self, token: str): + + self.base_url = "https://api.github.com/repos/pytorch/executorch" + self.__headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github+json", + } + + def get_runs(self, params=None): + + runs_url = f"{self.base_url}/actions/runs" + response = requests.get(runs_url, headers=self.__headers, params=params) + response.raise_for_status() + + return response.json()["workflow_runs"] + + def get_jobs(self, run_id: int, jobs_per_page: int = 100): + + jobs_url = f"{self.base_url}/actions/runs/{run_id}/jobs" + all_jobs = [] + page = 1 + + while True: + response = requests.get( + jobs_url, + headers=self.__headers, + params={"per_page": jobs_per_page, "page": page}, + ) + response.raise_for_status() + + json_response = response.json() + jobs = json_response["jobs"] + + if not jobs: # No more jobs + break + + all_jobs.extend(jobs) + + # Stop if we got fewer jobs than requested (last page) + if len(jobs) < jobs_per_page: + break + + page += 1 + + return all_jobs + + def get_job_logs(self, job_id: int): + + logs_url = f"{self.base_url}/actions/jobs/{job_id}/logs" + response = requests.get(logs_url, headers=self.__headers) + response.raise_for_status() + + return response.content.decode() + + +def extract_model_export_times(log): + + duration = re.search(r"Model export completed .* Duration: (\d+)", log) + docker_image = re.search(r"DOCKER_IMAGE:\s*(.+?)(?:\s|$)", log) + dtype = re.search(r"DTYPE=(\w+)", log) + mode = re.search(r"MODE=(\S+)", log) + runner = re.search(r"runner:\s*(\S+)", log) + + log_extract = { + "duration": duration.group(1) if duration else None, + "docker_image": docker_image.group(1) if docker_image else None, + "dtype": dtype.group(1) if dtype else None, + "mode": mode.group(1) if mode else None, + "runner": runner.group(1) if runner else None, + } + + return log_extract + + +def extract_full_model_export_times(gha_client, filters=None, run_id=None): + + if run_id: + # run_id will be a list when using nargs='+' + if isinstance(run_id, list): + all_runs = [{"id": rid} for rid in run_id] + else: + # Fallback for single string + all_runs = [{"id": run_id}] + else: + # No run_id provided, fetch runs using filters + all_runs = gha_client.get_runs(params=filters) + + model_tracker = defaultdict(list) + + for idx, run in enumerate(all_runs, 1): + + run_id_val = run["id"] + print(f"Processing run {idx}/{len(all_runs)}: ID {run_id_val}") + + try: + jobs = gha_client.get_jobs(run_id_val) + + for job in jobs: + + if job["conclusion"] == "skipped": + continue + + if not ("test-llama" in job["name"]): + continue + + try: + log = gha_client.get_job_logs(job_id=job["id"]) + + extracted_config = extract_model_export_times(log) + extracted_config["job_name"] = job["name"] + + if extracted_config["duration"]: + model_tracker[run_id_val].append(extracted_config) + + except Exception as e: + print(f" Warning: Failed to get logs for job {job['id']}: {e}") + continue + + except Exception as e: + print(f" Error: 
+def extract_model_export_times(log):
+
+    duration = re.search(r"Model export completed .* Duration: (\d+)", log)
+    docker_image = re.search(r"DOCKER_IMAGE:\s*(.+?)(?:\s|$)", log)
+    dtype = re.search(r"DTYPE=(\w+)", log)
+    mode = re.search(r"MODE=(\S+)", log)
+    runner = re.search(r"runner:\s*(\S+)", log)
+
+    log_extract = {
+        "duration": duration.group(1) if duration else None,
+        "docker_image": docker_image.group(1) if docker_image else None,
+        "dtype": dtype.group(1) if dtype else None,
+        "mode": mode.group(1) if mode else None,
+        "runner": runner.group(1) if runner else None,
+    }
+
+    return log_extract
+
+
+def extract_full_model_export_times(gha_client, filters=None, run_id=None):
+
+    if run_id:
+        # run_id will be a list when using nargs='+'
+        if isinstance(run_id, list):
+            all_runs = [{"id": rid} for rid in run_id]
+        else:
+            # Fallback for a single string
+            all_runs = [{"id": run_id}]
+    else:
+        # No run_id provided, fetch runs using filters
+        all_runs = gha_client.get_runs(params=filters)
+
+    model_tracker = defaultdict(list)
+
+    for idx, run in enumerate(all_runs, 1):
+
+        run_id_val = run["id"]
+        print(f"Processing run {idx}/{len(all_runs)}: ID {run_id_val}")
+
+        try:
+            jobs = gha_client.get_jobs(run_id_val)
+
+            for job in jobs:
+
+                if job["conclusion"] == "skipped":
+                    continue
+
+                if "test-llama" not in job["name"]:
+                    continue
+
+                try:
+                    log = gha_client.get_job_logs(job_id=job["id"])
+
+                    extracted_config = extract_model_export_times(log)
+                    extracted_config["job_name"] = job["name"]
+
+                    if extracted_config["duration"]:
+                        model_tracker[run_id_val].append(extracted_config)
+
+                except Exception as e:
+                    print(f"  Warning: Failed to get logs for job {job['id']}: {e}")
+                    continue
+
+        except Exception as e:
+            print(f"  Error: Failed to get jobs for run {run_id_val}: {e}")
+            continue
+
+    return model_tracker
+
+
+def print_results_as_table(results_dict):
+    """Print results as a formatted markdown table."""
+
+    # Extract all jobs from the defaultdict
+    all_jobs = []
+    for run_id, jobs in results_dict.items():
+        for job in jobs:
+            job["run_id"] = run_id  # Add run_id to each job
+            all_jobs.append(job)
+
+    if not all_jobs:
+        print("No jobs found.")
+        return
+
+    # Print header
+    print("\n## Model Export Times\n")
+    print("| Run ID | Job Name | DType | Mode | Runner | Docker Image | Duration (s) |")
+    print("|--------|----------|-------|------|--------|--------------|--------------|")
+
+    # Print each job
+    for job in all_jobs:
+        run_id = job.get("run_id", "N/A")
+        job_name = job.get("job_name", "N/A")[:60]  # Truncate long names
+        dtype = job.get("dtype", "N/A")
+        mode = job.get("mode", "N/A")
+        runner = job.get("runner", "N/A")
+        docker_image = job.get("docker_image", "None")
+        duration = job.get("duration", "N/A")
+
+        # Truncate docker image if too long
+        if docker_image and len(docker_image) > 40:
+            docker_image = docker_image[:37] + "..."
+
+        print(
+            f"| {run_id} | {job_name} | {dtype} | {mode} | {runner} | {docker_image} | {duration} |"
+        )
+
+    # Print summary statistics
+    print(f"\n**Total Jobs:** {len(all_jobs)}")
+
+    # Calculate average duration (guard against a missing/None duration)
+    durations = [
+        int(job["duration"])
+        for job in all_jobs
+        if (job.get("duration") or "").isdigit()
+    ]
+    if durations:
+        avg_duration = sum(durations) / len(durations)
+        print(f"**Average Duration:** {avg_duration:.1f} seconds")
+        print(f"**Min Duration:** {min(durations)} seconds")
+        print(f"**Max Duration:** {max(durations)} seconds")
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description="A tool to collect model export times for the different configurations from GitHub Actions runs"
+    )
+
+    parser.add_argument(
+        "--github_token",
+        metavar="TOKEN",
+        type=str,
+        help="Your GitHub access token",
+        default="",
+    )
+
+    parser.add_argument(
+        "--created_time",
+        metavar="DATE",
+        type=str,
+        help="The date of the earliest GitHub Actions runs to include, in the format YYYY-MM-DD",
+        default=datetime.today().strftime("%Y-%m-%d"),
+    )
+
+    parser.add_argument(
+        "--run_id",
+        metavar="RUN_ID",
+        type=str,
+        nargs="+",  # Accept one or more arguments
+        help="One or more run IDs to extract model export times from",
+        default=None,
+    )
+
+    args = parser.parse_args()
+
+    gha_client = GithubActionsClient(token=args.github_token)
+
+    filters = {"created": f">={args.created_time}"}
+
+    model_tracker_output = extract_full_model_export_times(
+        gha_client, filters=filters, run_id=args.run_id
+    )
+
+    print_results_as_table(model_tracker_output)
+
+
+if __name__ == "__main__":
+    main()
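
For reference, a typical invocation of the new script (flag names as defined in main() above; the token is assumed to have permission to read Actions logs, and the run IDs shown are placeholders):

    # Summarize export times for all runs created on or after a given date
    python scripts/check_model_export_times.py --github_token "$GH_TOKEN" --created_time 2024-01-01

    # Or inspect one or more specific workflow runs by ID
    python scripts/check_model_export_times.py --github_token "$GH_TOKEN" --run_id 1234567890 9876543210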