# Patch content: new file src/madengine/deployment/k8s_pvc.py
"""
Kubernetes PVC lifecycle management mixin.

Handles PersistentVolumeClaim creation, deletion, and storage class
resolution for both per-job results and long-lived shared data volumes.

Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
"""

import time
from pathlib import Path
from typing import Any, Dict, Optional

# All three third-party packages are optional at import time so this module
# can be imported (e.g. for storage-class resolution) without them installed.
try:
    from jinja2 import Template

    JINJA2_AVAILABLE = True
except ImportError:
    Template = None
    JINJA2_AVAILABLE = False

try:
    from kubernetes.client.rest import ApiException

    KUBERNETES_AVAILABLE = True
except ImportError:
    KUBERNETES_AVAILABLE = False

    class ApiException(Exception):
        """Stand-in used only when the ``kubernetes`` package is missing.

        Keeps the ``except ApiException`` clauses below valid instead of
        failing with ``NameError`` while another exception is propagating.
        """

        def __init__(self, status=None, reason=None, body=None):
            super().__init__(reason)
            self.status = status
            self.reason = reason
            self.body = body


try:
    import yaml

    YAML_AVAILABLE = True
except ImportError:
    yaml = None
    YAML_AVAILABLE = False


class KubernetesPVCMixin:
    """PVC lifecycle management for Kubernetes deployments.

    The host class must provide the attributes these methods read:
    ``k8s_config`` (mapping with ``.get``), ``namespace``, ``job_name``,
    ``core_v1`` (object exposing the ``*_namespaced_persistent_volume_claim``
    calls used below) and ``console`` (object with ``print``).
    """

    def _k8s_data_storage_class(self) -> Optional[str]:
        """StorageClass for long-lived ``madengine-shared-data`` (NFS RWX recommended).

        Returns:
            The first truthy value among ``data_storage_class``,
            ``nfs_storage_class`` and ``storage_class``; otherwise whatever
            ``storage_class`` holds (``None`` when unset).
        """
        return (
            self.k8s_config.get("data_storage_class")
            or self.k8s_config.get("nfs_storage_class")
            or self.k8s_config.get("storage_class")
        )

    def _k8s_results_storage_class(self, nnodes: int) -> Optional[str]:
        """
        Per-job results: local-path (RWO) for single-node, NFS (RWX) for multi-node.

        Falls back to ``storage_class`` for backward compatibility.

        Args:
            nnodes: Number of nodes in the job; values above 1 select the
                multi-node key chain.

        Returns:
            The first truthy configured class for the selected chain, or the
            ``storage_class`` value (``None`` when unset).
        """
        if nnodes > 1:
            return (
                self.k8s_config.get("multi_node_results_storage_class")
                or self.k8s_config.get("nfs_storage_class")
                or self.k8s_config.get("storage_class")
            )
        return (
            self.k8s_config.get("single_node_results_storage_class")
            or self.k8s_config.get("local_path_storage_class")
            or self.k8s_config.get("storage_class")
        )

    def _render_pvc_manifest(self, template_name: str, **context: Any) -> Dict[str, Any]:
        """Render a PVC template from ``templates/kubernetes`` and parse it as YAML.

        Args:
            template_name: File name of the Jinja2 template next to this module.
            **context: Variables passed to the template.

        Returns:
            The parsed manifest, ready to pass as ``body`` to the API client.

        Raises:
            ImportError: If ``jinja2`` or ``yaml`` could not be imported.
            OSError: If the template file cannot be read.
        """
        if not (JINJA2_AVAILABLE and YAML_AVAILABLE):
            raise ImportError(
                "jinja2 and PyYAML are required to render PVC manifests"
            )
        template_path = Path(__file__).parent / "templates" / "kubernetes" / template_name
        template = Template(template_path.read_text(encoding="utf-8"))
        return yaml.safe_load(template.render(**context))

    @staticmethod
    def _is_pvc_terminating_conflict(exc: Exception) -> bool:
        """Return True for a 409 whose body says the object is being deleted."""
        if getattr(exc, "status", None) != 409:
            return False
        body = getattr(exc, "body", None) or ""
        if isinstance(body, bytes):
            # Defensive: a bytes body would make the substring test raise TypeError.
            body = body.decode("utf-8", errors="replace")
        return "object is being deleted" in body

    def _create_results_pvc(self, nnodes: int = 1) -> str:
        """
        Create a PersistentVolumeClaim for per-job results.

        Single-node uses ReadWriteOnce (typically local-path). Multi-node uses
        ReadWriteMany (typically nfs-banff or other RWX class).

        Args:
            nnodes: Number of nodes in the job.

        Returns:
            Name of the created PVC (``<job_name>-results``).

        Raises:
            ApiException: On any API error other than a 409 "object is being
                deleted", or when that 409 persists through the last retry.
            ImportError: If the template/YAML dependencies are missing.
        """
        pvc_name = f"{self.job_name}-results"
        access_mode = "ReadWriteMany" if nnodes > 1 else "ReadWriteOnce"
        storage_class = self._k8s_results_storage_class(nnodes)

        self.console.print(
            f"[dim] Results PVC: access={access_mode}, "
            f"storageClass={storage_class or '(cluster default)'}[/dim]"
        )
        if nnodes > 1 and not storage_class:
            self.console.print(
                "[yellow]⚠️ Multi-node: set k8s.nfs_storage_class or "
                "multi_node_results_storage_class to an RWX class (e.g. nfs-banff).[/yellow]"
            )

        pvc_dict = self._render_pvc_manifest(
            "pvc.yaml.j2",
            pvc_name=pvc_name,
            namespace=self.namespace,
            access_mode=access_mode,
            storage_size=self.k8s_config.get("results_storage_size", "10Gi"),
            storage_class=storage_class,
        )

        # Create PVC (retry on 409 "object is being deleted" until it is gone)
        max_create_retries = 6
        create_wait_seconds = 5
        for attempt in range(1, max_create_retries + 1):
            try:
                self.core_v1.create_namespaced_persistent_volume_claim(
                    namespace=self.namespace, body=pvc_dict
                )
                return pvc_name
            except ApiException as e:
                # Only a same-named PVC that is still terminating is worth
                # waiting for; everything else, and the final attempt, propagates.
                if not self._is_pvc_terminating_conflict(e) or attempt == max_create_retries:
                    raise
                self.console.print(
                    f"[dim]PVC still terminating, waiting {create_wait_seconds}s before retry ({attempt}/{max_create_retries})[/dim]"
                )
                time.sleep(create_wait_seconds)
        # Only reachable if the retry budget above were set to zero.
        raise RuntimeError(f"PVC {pvc_name} could not be created")

    def _wait_for_pvc_deleted(self, pvc_name: str, max_wait: int = 90) -> bool:
        """Block until the PVC is fully removed (or timeout).

        Polls once per iteration with a one-second sleep in between.

        Args:
            pvc_name: Name of the PVC to watch in ``self.namespace``.
            max_wait: Maximum number of polls.

        Returns:
            True once the API answers 404 for the PVC; False if it still
            exists after ``max_wait`` polls.

        Raises:
            ApiException: On any API error other than 404.
        """
        for elapsed in range(max_wait):
            try:
                self.core_v1.read_namespaced_persistent_volume_claim(
                    name=pvc_name, namespace=self.namespace
                )
            except ApiException as e:
                if e.status == 404:
                    return True
                raise
            if elapsed > 0 and elapsed % 10 == 0:
                self.console.print(
                    f"[dim]Waiting for PVC {pvc_name} to be removed... ({elapsed}s)[/dim]"
                )
            time.sleep(1)
        return False

    def _create_or_get_data_pvc(self, nnodes: int = 1) -> str:
        """
        Create or reuse ``madengine-shared-data`` for long-lived datasets (cache).

        Always uses ReadWriteMany + an NFS-style StorageClass so the same PVC
        works for single- and multi-pod jobs. Use ``data_storage_class`` or
        ``nfs_storage_class`` (e.g. nfs-banff), not local-path.

        Args:
            nnodes: Reserved for logging (shared-data access mode does not depend on it).

        Returns:
            Name of the PVC (existing or newly created)

        Raises:
            ApiException: On API errors other than the 404s handled here.
            ImportError: If a new PVC must be rendered and the template/YAML
                dependencies are missing.
        """
        pvc_name = "madengine-shared-data"

        if self.k8s_config.get("recreate_shared_data_pvc"):
            try:
                self.core_v1.delete_namespaced_persistent_volume_claim(
                    name=pvc_name, namespace=self.namespace
                )
            except ApiException as e:
                # 404: nothing to delete, fall through to creation.
                if e.status != 404:
                    raise
            else:
                self.console.print(
                    "[yellow]recreate_shared_data_pvc: deleted existing "
                    f"{pvc_name} (backup data first if needed)[/yellow]"
                )
                if not self._wait_for_pvc_deleted(pvc_name):
                    # Without this the terminating PVC would be silently
                    # picked up as "existing" by the lookup below.
                    self.console.print(
                        f"[yellow]⚠️ Warning: {pvc_name} is still terminating after waiting; "
                        "it may be reused while its deletion is pending.[/yellow]"
                    )

        try:
            existing_pvc = self.core_v1.read_namespaced_persistent_volume_claim(
                name=pvc_name,
                namespace=self.namespace,
            )
        except ApiException as e:
            if e.status != 404:
                raise
        else:
            self.console.print(f"[dim]✓ Using existing data PVC: {pvc_name}[/dim]")

            access_modes = existing_pvc.spec.access_modes or []
            if "ReadWriteMany" not in access_modes:
                self.console.print(
                    f"[yellow]⚠️ Warning: {pvc_name} is not ReadWriteMany "
                    f"(modes: {access_modes}).[/yellow]"
                )
                self.console.print(
                    "[yellow] For NFS-backed long-lived data, delete the PVC and re-run with "
                    "k8s.data_storage_class / nfs_storage_class set, or use "
                    "recreate_shared_data_pvc (after backup).[/yellow]"
                )
            return pvc_name

        access_mode = "ReadWriteMany"
        storage_class = self._k8s_data_storage_class()
        self.console.print(f"[blue]Creating shared data PVC: {pvc_name}...[/blue]")
        self.console.print(
            f"[dim] Access mode: {access_mode}; storageClass={storage_class or '(cluster default)'}; "
            f"nnodes={nnodes}[/dim]"
        )
        if not storage_class:
            self.console.print(
                "[yellow]⚠️ Set k8s.nfs_storage_class or data_storage_class to an RWX class "
                "(e.g. nfs-banff) for shared-data. Default SC may be local-path (RWO-only).[/yellow]"
            )

        pvc_dict = self._render_pvc_manifest(
            "pvc-data.yaml.j2",
            pvc_name=pvc_name,
            namespace=self.namespace,
            access_mode=access_mode,
            storage_size=self.k8s_config.get("data_storage_size", "100Gi"),
            storage_class=storage_class,
        )
        self.core_v1.create_namespaced_persistent_volume_claim(
            namespace=self.namespace, body=pvc_dict
        )

        self.console.print("[dim]Waiting for PVC to be bound...[/dim]")
        for _ in range(30):
            try:
                pvc = self.core_v1.read_namespaced_persistent_volume_claim(
                    name=pvc_name, namespace=self.namespace
                )
                if pvc.status.phase == "Bound":
                    self.console.print("[green]✓ PVC bound successfully[/green]")
                    break
            except ApiException:
                # Best-effort poll: a failed read is retried on the next tick.
                pass
            time.sleep(1)
        else:
            # Not fatal: the PVC exists, it just has not reached "Bound" yet.
            self.console.print(
                f"[yellow]⚠️ Warning: PVC created but not bound yet. "
                f"Check: kubectl describe pvc {pvc_name}[/yellow]"
            )

        return pvc_name


# ---------------------------------------------------------------------------
# Next file in this patch: src/madengine/deployment/k8s_results.py (new file,
# index 0cd1c8c6, 1389 lines). Its module docstring opens with:
#   Kubernetes results collection and performance reporting mixin.
#   Handles collecting pod logs, PVC artifacts, parsing performance metrics,
#   aggregating multi-node results, and writing to perf.csv / perf_super.
#   Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
# ---------------------------------------------------------------------------
+""" + +import json +import re +import subprocess +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from .common import normalize_launcher +from madengine.utils.path_utils import scripts_base_dir_from +from madengine.utils.run_details import flatten_tags_in_place, get_build_number, get_pipeline + +try: + from kubernetes.client.rest import ApiException + + KUBERNETES_AVAILABLE = True +except ImportError: + KUBERNETES_AVAILABLE = False + +try: + from madengine.reporting.update_perf_csv import update_perf_csv + from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv + REPORTING_AVAILABLE = True +except ImportError: + REPORTING_AVAILABLE = False + + +def _pod_job_name_label_selector(deployment_id: str) -> str: + """Selector for the ``job-name`` pod label; value must be a valid <=63-char label value.""" + from .k8s_names import sanitize_k8s_label_value + return f"job-name={sanitize_k8s_label_value(deployment_id)}" + + +class KubernetesResultsMixin: + """Results collection and performance reporting for Kubernetes deployments.""" + + # Standard perf.csv header (must match container_runner.ensure_perf_csv_exists) + _PERF_CSV_HEADER = ( + "model,n_gpus,nnodes,gpus_per_node,training_precision,pipeline,args,tags," + "docker_file,base_docker,docker_sha,docker_image,git_commit,machine_name," + "deployment_type,launcher,gpu_architecture,performance,metric,relative_change," + "status,build_duration,test_duration,dataname,data_provider_type,data_size," + "data_download_duration,build_number,additional_docker_run_options" + ) + + def collect_results(self, deployment_id: str) -> Dict[str, Any]: + """ + Enhanced results collection from K8s pods following vLLM multi-node best practices. 
+ + For Data Parallel deployments (vLLM, SGLang): + - Each pod runs an independent replica + - Only pod-0 reports metrics to avoid duplicates + - Total throughput = pod-0 throughput x num_replicas + + Collects: + 1. Pod logs (``k8s_results///pod.log``) + 2. PVC mirror per pod (``...//pvc/``), mapped from ``/results//`` + 3. File artifacts via kubectl cp when pods are still running (keep-alive path) + + Returns: + Dict with logs, artifacts, and performance results + """ + results = { + "job_name": deployment_id, + "namespace": self.namespace, + "logs": [], + "artifacts": [], + "successful_runs": [], + "failed_runs": [], + } + + # Create results directory for this deployment + results_dir = Path(f"./k8s_results/{deployment_id}") + results_dir.mkdir(parents=True, exist_ok=True) + + self.console.print(f"[cyan]📦 Collecting results from K8s job: {deployment_id}[/cyan]") + + try: + # Get pods for this job + pods = self.core_v1.list_namespaced_pod( + namespace=self.namespace, + label_selector=_pod_job_name_label_selector(deployment_id), + ) + + # Get model info and build info from manifest + model_keys = list(self.manifest["built_models"].keys()) + if model_keys: + model_key = model_keys[0] + model_info = self.manifest["built_models"][model_key] + else: + model_info = {} + + # Get build info from built_images + image_keys = list(self.manifest.get("built_images", {}).keys()) + if image_keys: + image_key = image_keys[0] + build_info = self.manifest["built_images"][image_key] + else: + build_info = {} + + # Check if this is a multi-node distributed job + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + is_distributed = distributed_config.get("enabled", False) + nnodes = distributed_config.get("nnodes", 1) + is_multinode = is_distributed and nnodes > 1 + + # Determine launcher_type the same way as _prepare_template_context does + # (deployment_config doesn't store launcher_type directly) + 
launcher_config = self.config.additional_context.get("launcher", {}) + launcher_type = ( + launcher_config.get("type") + if launcher_config.get("type") is not None + else distributed_config.get("launcher") + ) + + # Normalize launcher based on deployment type and validity + launcher_type = normalize_launcher(launcher_type, "kubernetes") + + is_ray_launcher = launcher_type in ["vllm", "sglang"] + + # Sort pods by name to ensure consistent ordering (pod-0 is master) + sorted_pods = sorted(pods.items, key=lambda p: p.metadata.name) + + # ======================================================================== + # Per-Node Collection Strategy + # Collect logs and artifacts from ALL nodes + # Parse performance from ALL nodes (each reports node-local metrics) + # Aggregate metrics based on type (sum for throughput, etc.) + # ======================================================================== + + per_node_metrics = [] # Store performance from each node + results["nodes"] = [] # Store per-node details for display + + # Special handling for Ray-based launchers (vLLM, SGLang) + # These report per-replica metrics, need scaling + if is_multinode and is_ray_launcher: + self.console.print( + f"[cyan]Multi-node Ray deployment: {nnodes} nodes (Data Parallel mode)[/cyan]" + ) + + # Collect from ALL pods + for pod_index, pod in enumerate(sorted_pods): + pod_name = pod.metadata.name + pod_dir = results_dir / pod_name + pod_dir.mkdir(exist_ok=True) + + # Extract node rank from pod name (e.g., madengine-dummy-torchrun-0 -> 0) + try: + node_rank = int(pod_name.rsplit('-', 1)[-1]) + except (ValueError, IndexError): + node_rank = pod_index + + self.console.print(f"[dim] Collecting from pod: {pod_name} (node-{node_rank})[/dim]") + + try: + # 1. 
Collect pod logs + log = self.core_v1.read_namespaced_pod_log( + name=pod_name, namespace=self.namespace + ) + log_file = pod_dir / "pod.log" + log_file.write_text(log) + results["logs"].append({ + "pod": pod_name, + "log": log, + "file": str(log_file) + }) + + # 2. Parse NODE-LOCAL performance from log + perf_data = self._parse_performance_from_log( + log, model_info.get("name", "") + ) + + # Pod phase/exit can lag right after Job success; poll until terminal or timeout + pod = self._refresh_pod_until_terminal_phase(pod_name) + pod_status = pod.status.phase if pod else "Unknown" + pod_exit_code = ( + self._primary_workload_container_exit_code(pod) if pod else -1 + ) + + # Store per-node info for display table + node_info = { + "node_id": node_rank, + "pod_name": pod_name, + "status": "SUCCESS" if pod_status == "Succeeded" and pod_exit_code == 0 else "FAILED", + "exit_code": pod_exit_code, + "performance": perf_data.get("performance") if perf_data else None, + "metric": perf_data.get("metric") if perf_data else None, + "duration": perf_data.get("duration") if perf_data else None, + "log_file": str(log_file) + } + results["nodes"].append(node_info) + + if perf_data: + # For Ray launchers, this is per-replica metric + if is_multinode and is_ray_launcher: + perf_data["is_per_replica"] = True + per_node_metrics.append(perf_data) + self.console.print( + f"[green] ✓ Parsed performance: {perf_data['performance']:.2f} " + f"{perf_data['metric']} (node-{node_rank})[/green]" + ) + else: + self.console.print( + f"[dim] No performance metric found in node-{node_rank} log[/dim]" + ) + + except ApiException as e: + self.console.print( + f"[red]✗ Failed to get logs for pod {pod_name}: {e.reason}[/red]" + ) + results["nodes"].append({ + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": f"Failed to get logs: {e.reason}" + }) + except Exception as e: + self.console.print( + f"[red]✗ Error 
collecting from pod {pod_name}: {e}[/red]" + ) + results["nodes"].append({ + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": str(e) + }) + + self.console.print( + f"[green]✓ Collected logs from {len(results['logs'])} pods[/green]" + ) + + # Collect artifacts from PVC before deciding success/failure (needed for multiple_results fallback) + k8s_pod_names = [p.metadata.name for p in sorted_pods] + self._collect_from_pvc(deployment_id, results_dir, results, pod_names=k8s_pod_names) + + # ======================================================================== + # Aggregate per-node metrics + # ======================================================================== + if per_node_metrics: + # Special handling for Ray launchers - multiply by nnodes + if is_multinode and is_ray_launcher: + original_perf = per_node_metrics[0]["performance"] + aggregated_perf = original_perf * nnodes + self.console.print( + f"[green] Per-replica: {original_perf:.1f} req/s[/green]" + ) + self.console.print( + f"[green] Total capacity: {aggregated_perf:.1f} req/s ({nnodes} nodes)[/green]" + ) + + # Create aggregated record manually for Ray + aggregated_record = { + "model": per_node_metrics[0]["model"], + "performance": aggregated_perf, + "metric": per_node_metrics[0]["metric"], + "status": "SUCCESS", + "topology": f"{nnodes}N×{per_node_metrics[0].get('local_gpus', 1)}G", + "nnodes": nnodes, + "launcher": launcher_type or "N/A", + "deployment_type": "kubernetes", + "gpu_architecture": per_node_metrics[0].get("gpu_architecture", "N/A"), + "duration": per_node_metrics[0].get("duration", "N/A"), + "data_name": per_node_metrics[0].get("data_name", "N/A"), + "data_provider": per_node_metrics[0].get("data_provider", "N/A"), + "aggregation_method": "scaled_by_nnodes", + "nodes_contributing": nnodes + } + else: + # Use new aggregation logic for other launchers + aggregated_record = self._aggregate_node_metrics( 
+ per_node_metrics, + nnodes, + launcher_type + ) + + if aggregated_record: + # Full reporting pipeline: perf_entry at project root, then update_* (same as local/SLURM) + self._ensure_perf_csv_exists() + run_details_dict = self._build_perf_entry_from_aggregated( + aggregated_record, model_info, build_info, deployment_id + ) + perf_entry_path = Path("perf_entry.json") + with open(perf_entry_path, "w", encoding="utf-8") as f: + json.dump(run_details_dict, f, indent=2) + if run_details_dict.get("status") == "SUCCESS": + update_perf_csv(perf_csv="perf.csv", single_result=str(perf_entry_path)) + else: + update_perf_csv(perf_csv="perf.csv", exception_result=str(perf_entry_path)) + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + try: + if run_details_dict.get("status") == "SUCCESS": + num_entries = update_perf_super_json( + single_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + else: + num_entries = update_perf_super_json( + exception_result=str(perf_entry_path), + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries, + ) + except Exception as e: + self.console.print(f"[yellow]⚠ Could not update perf_super: {e}[/yellow]") + results["successful_runs"].append({ + "model": model_info.get("name"), + "perf_data": aggregated_record, + "nodes": results["nodes"], + "per_node_metrics": per_node_metrics + }) + self.console.print( + f"[green]✓ Aggregated performance from {len(per_node_metrics)} nodes[/green]" + ) + self.console.print( + f"[green]✓ Updated perf_entry.json, perf.csv, perf_super.* (Docker-compatible)[/green]" + ) + else: + # No performance from log: try multiple_results CSV (same contract as local Docker) + # Resolve single CSV path (one pod) or merged CSV path (multi-pod with sum/avg rules) + resolved_csv_path 
= self._resolve_multiple_results_csv( + results_dir, results, model_info + ) + if resolved_csv_path and REPORTING_AVAILABLE: + # Docker-compatible flow: produce perf.csv, perf_entry.*, perf_super.* + gpu_arch = "N/A" + if results.get("logs"): + log_content = results["logs"][0].get("log", "") + m = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", log_content) + if m: + gpu_arch = m.group(1) + self._ensure_perf_csv_exists() + common_info = self._build_common_info_dict( + model_info, build_info, deployment_id, gpu_arch + ) + common_info_path = Path("common_info.json") + with open(common_info_path, "w", encoding="utf-8") as f: + json.dump(common_info, f, indent=2) + update_perf_csv( + perf_csv="perf.csv", + multiple_results=str(resolved_csv_path), + common_info=str(common_info_path), + model_name=model_info.get("name", ""), + ) + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + num_entries = update_perf_super_json( + perf_super_json="perf_super.json", + multiple_results=str(resolved_csv_path), + common_info=str(common_info_path), + model_name=model_info.get("name", ""), + scripts_base_dir=scripts_base_dir, + ) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries, + ) + # Build successful_runs for display (one entry per CSV row) + import csv as _csv + model_name = model_info.get("name", "") + with open(resolved_csv_path, "r", encoding="utf-8", errors="ignore") as f: + reader = _csv.DictReader(f) + for row in reader: + row = {k.strip(): v for k, v in row.items() if k} + if row.get("performance") and row.get("metric"): + display_model = f"{model_name}_{row.get('model', '')}" + record = self._create_multiple_result_row_record( + model_info, build_info, deployment_id, + { + "model": display_model, + "performance": row.get("performance"), + "metric": row.get("metric", ""), + "gpu_architecture": gpu_arch, + "duration": row.get("test_duration", "N/A"), + }, + 
) + if record: + results["successful_runs"].append({ + "model": display_model, + "perf_data": record, + "nodes": [], + "per_node_metrics": [{"model": display_model, "performance": row.get("performance"), "metric": row.get("metric", "")}], + }) + self.console.print( + f"[green]✓ Updated perf.csv, perf_entry.*, perf_super.* (Docker-compatible)[/green]" + ) + elif resolved_csv_path and not REPORTING_AVAILABLE: + # Fallback when reporting module not available: legacy row-by-row write + fallback_metrics = self._parse_multiple_results_from_artifacts( + results_dir, results, model_info, build_info + ) + if fallback_metrics: + for item in fallback_metrics: + record = self._create_multiple_result_row_record( + model_info, build_info, deployment_id, item + ) + if record: + self._write_to_perf_csv(record) + results["successful_runs"].append({ + "model": item["model"], + "perf_data": record, + "nodes": [], + "per_node_metrics": [item], + }) + self.console.print( + f"[green]✓ Wrote {len(fallback_metrics)} row(s) from multiple_results to perf.csv[/green]" + ) + if not resolved_csv_path: + # No multiple_results CSV found: record failure + error_msg = "No performance metrics found from any node" + failure_record = self._create_failure_record( + model_info, build_info, deployment_id, error_msg + ) + self._write_to_perf_csv(failure_record) + results["failed_runs"].append({ + "model": model_info.get("name", "Unknown"), + "error": error_msg, + "nodes": results["nodes"] + }) + self.console.print( + f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" + ) + elif resolved_csv_path and not REPORTING_AVAILABLE and not results.get("successful_runs"): + # Legacy path ran but produced no valid rows + error_msg = "No performance metrics found from any node" + failure_record = self._create_failure_record( + model_info, build_info, deployment_id, error_msg + ) + self._write_to_perf_csv(failure_record) + results["failed_runs"].append({ + "model": model_info.get("name", 
"Unknown"), + "error": error_msg, + "nodes": results["nodes"] + }) + self.console.print( + f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" + ) + + # 4. Generate summary + self._generate_results_summary(results, results_dir) + + except Exception as e: + self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") + + return results + + def _collect_artifacts_immediately(self, deployment_id: str, pod_name: str) -> None: + """ + Collect artifacts immediately from a running pod during the sleep period. + This is called when we detect the "Keeping pod alive" message in logs. + """ + try: + # Create results directory + results_dir = Path("k8s_results") / deployment_id + results_dir.mkdir(parents=True, exist_ok=True) + + pod_dir = results_dir / pod_name + pod_dir.mkdir(exist_ok=True) + + # Collect artifacts + artifacts = self._collect_pod_artifacts(pod_name, pod_dir) + + if artifacts: + self.console.print(f"[green]✓ Collected {len(artifacts)} artifacts from {pod_name}[/green]") + else: + self.console.print(f"[yellow]⚠ No artifacts collected from {pod_name}[/yellow]") + + except Exception as e: + self.console.print(f"[yellow]⚠ Error collecting artifacts: {e}[/yellow]") + + def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: + """ + Collect file artifacts from pod using kubectl cp. 
+ + Collects: + - perf.csv (performance results) + - *_env.csv (environment details from rocEnvTool) + - profiling outputs (rocprof*, results*, *.db) + - tracing outputs (*_output/ directories) + - tool-specific outputs + + Args: + pod_name: Name of the Kubernetes pod + dest_dir: Local directory to save artifacts + + Returns: + List of collected artifact metadata + """ + artifacts = [] + + # Define artifact patterns to collect + artifact_patterns = [ + {"pattern": "perf.csv", "type": "performance"}, + {"pattern": "*_env.csv", "type": "environment"}, + {"pattern": "results*", "type": "profiling"}, + {"pattern": "*.db", "type": "profiling"}, + {"pattern": "trace.*", "type": "tracing"}, + {"pattern": "prof.csv", "type": "profiling"}, # Raw profiler output before post-script renames it + {"pattern": "gpu_info_*.csv", "type": "profiling"}, + {"pattern": "library_trace.csv", "type": "tracing"}, + ] + + for artifact_def in artifact_patterns: + pattern = artifact_def["pattern"] + artifact_type = artifact_def["type"] + + try: + # Try direct kubectl cp without exec (works during the sleep period) + # For patterns with wildcards, try common specific filenames + if '*' in pattern: + # Expand pattern to specific known files + if pattern == "*_env.csv": + specific_files = ["dummy_prof_env.csv", "dummy_data_minio_env.csv"] + elif pattern == "gpu_info_*.csv": + specific_files = ["gpu_info_power_profiler_output.csv", "gpu_info_vram_profiler_output.csv"] + elif pattern == "results*": + specific_files = ["results.csv", "results.txt", "results.json"] + elif pattern == "trace.*": + specific_files = ["trace.txt", "trace.csv", "trace.json"] + else: + specific_files = [] + + for filename in specific_files: + local_path = dest_dir / filename + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{filename}", + str(local_path) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=30 + ) + + if cp_result.returncode == 0 and 
local_path.exists(): + artifacts.append({ + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{filename}", + "local_path": str(local_path), + "size": local_path.stat().st_size + }) + self.console.print( + f"[dim] ✓ Collected {artifact_type}: {filename}[/dim]" + ) + elif cp_result.stderr and "No such file" not in cp_result.stderr: + # Log unexpected errors (but not "file not found") + self.console.print( + f"[yellow] ⚠ Failed to collect {filename}: {cp_result.stderr.strip()}[/yellow]" + ) + else: + # Direct file - try to copy it + local_path = dest_dir / pattern + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{pattern}", + str(local_path) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=30 + ) + + if cp_result.returncode == 0 and local_path.exists(): + artifacts.append({ + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{pattern}", + "local_path": str(local_path), + "size": local_path.stat().st_size + }) + self.console.print( + f"[dim] ✓ Collected {artifact_type}: {pattern}[/dim]" + ) + elif cp_result.stderr and "No such file" not in cp_result.stderr: + # Log unexpected errors (but not "file not found") + self.console.print( + f"[yellow] ⚠ Failed to collect {pattern}: {cp_result.stderr.strip()}[/yellow]" + ) + + except subprocess.TimeoutExpired: + pass # Timeout - skip this file + except Exception: + pass # File not found or not accessible - this is expected + + # Try to collect known output directories using kubectl cp directly (during sleep period) + output_directories = ["rocprof_output", "rpd_output", "trace_output"] + for dir_name in output_directories: + try: + local_dir = dest_dir / dir_name + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{dir_name}", + str(local_dir) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=60 + ) + + if cp_result.returncode == 0 and local_dir.exists(): + # Count files in 
directory + file_count = sum(1 for _ in local_dir.rglob('*') if _.is_file()) + if file_count > 0: + total_size = sum(f.stat().st_size for f in local_dir.rglob('*') if f.is_file()) + artifacts.append({ + "pod": pod_name, + "type": "tool_output_directory", + "source": f"/workspace/{dir_name}", + "local_path": str(local_dir), + "file_count": file_count, + "size": total_size + }) + self.console.print( + f"[dim] ✓ Collected directory: {dir_name} ({file_count} files, {total_size} bytes)[/dim]" + ) + except Exception: + pass # Directory not found - this is expected + + return artifacts + + def _collect_from_pvc( + self, + deployment_id: str, + results_dir: Path, + results: Dict, + pod_names: Optional[List[str]] = None, + ): + """ + Collect all artifacts from the PVC using a temporary busybox pod. + + This is the best practice for collecting results from completed K8s jobs. + kubectl cp doesn't work on completed pods, so we use a helper pod. + + When ``pod_names`` is provided, each ``/results//`` is copied to + ``results_dir//pvc/`` by matching subdir to pod name (exact or + ``pod.startswith(subdir + "-")``). Unmatched subdirs go under + ``results_dir/pvc_unmapped//``. When ``pod_names`` is omitted, the + legacy layout ``results_dir//`` is used. 
+ + Args: + deployment_id: Job deployment ID + results_dir: Local directory to save results + results: Results dict to update + pod_names: Full Kubernetes pod names for this job (ordered) + """ + from .kubernetes import assign_pvc_subdirs_to_pods + + pvc_name = f"{deployment_id}-results" + + try: + # Create a temporary pod to access PVC + collector_pod_name = f"collector-{deployment_id[:15]}" + + self.console.print(f"[dim]📦 Collecting artifacts from PVC: {pvc_name}[/dim]") + + collector_spec: Dict[str, Any] = { + "restartPolicy": "Never", + "containers": [{ + "name": "collector", + "image": "busybox:latest", + "command": ["sh", "-c", "sleep 600"], + "volumeMounts": [{"name": "results", "mountPath": "/results"}] + }], + "volumes": [{"name": "results", "persistentVolumeClaim": {"claimName": pvc_name}}] + } + ips = getattr(self, "_image_pull_secrets_for_pods", None) or [] + if ips: + collector_spec["imagePullSecrets"] = ips + + collector_pod_spec = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": {"name": collector_pod_name, "namespace": self.namespace}, + "spec": collector_spec, + } + + # Delete existing collector pod if it exists (prevents 409 Conflict) + try: + self.core_v1.delete_namespaced_pod( + collector_pod_name, self.namespace, grace_period_seconds=0 + ) + time.sleep(2) # Wait for pod to be deleted + except ApiException as e: + if e.status != 404: # 404 means pod doesn't exist, which is fine + pass + + # Create collector pod + self.core_v1.create_namespaced_pod(self.namespace, collector_pod_spec) + + # Wait for pod to be ready + for _ in range(30): # Wait up to 30 seconds + try: + pod_status = self.core_v1.read_namespaced_pod_status( + collector_pod_name, self.namespace + ) + if pod_status.status.phase == "Running": + break + except ApiException as e: + # Pod not found yet or not ready - this is expected during startup + if e.status != 404: + self.console.print(f"[dim]Waiting for collector pod (status: {e.status})...[/dim]") + time.sleep(1) + else: + 
raise Exception("Collector pod did not start in time") + + # Mount / NFS may need a moment before another pod sees prior job writes. + time.sleep(2) + + # List pod result directories in PVC (retry: NFS can lag right after Job completion) + list_cmd = [ + "kubectl", + "exec", + collector_pod_name, + "-n", + self.namespace, + "-c", + "collector", + "--", + "ls", + "-1", + "/results/", + ] + list_result = subprocess.CompletedProcess( + args=list_cmd, returncode=-1, stdout="", stderr="" + ) + pod_dirs: List[str] = [] + for attempt in range(45): + list_result = subprocess.run( + list_cmd, capture_output=True, text=True, timeout=30 + ) + if list_result.returncode == 0 and list_result.stdout.strip(): + pod_dirs = [ + d + for d in list_result.stdout.strip().split("\n") + if d and d != "lost+found" + ] + if pod_dirs: + break + if list_result.stderr.strip(): + self.console.print( + f"[dim] PVC ls attempt {attempt + 1} (rc={list_result.returncode}): " + f"{list_result.stderr.strip()[:300]}[/dim]" + ) + time.sleep(1) + + if pod_dirs: + pvc_map: Dict[str, str] = {} + if pod_names: + pvc_map = assign_pvc_subdirs_to_pods(pod_dirs, pod_names) + + for pod_dir_name in pod_dirs: + if not pod_dir_name: + continue + + matched_pod = pvc_map.get(pod_dir_name) if pod_names else None + if pod_names: + if matched_pod: + local_pod_dir = results_dir / matched_pod / "pvc" + else: + local_pod_dir = results_dir / "pvc_unmapped" / pod_dir_name + else: + local_pod_dir = results_dir / pod_dir_name + + local_pod_dir.mkdir(parents=True, exist_ok=True) + + cp_cmd = [ + "kubectl", + "cp", + "-c", + "collector", + f"{self.namespace}/{collector_pod_name}:/results/{pod_dir_name}", + str(local_pod_dir), + ] + + cp_result = subprocess.run(cp_cmd, capture_output=True, text=True, timeout=60) + + if cp_result.returncode == 0: + # Count collected files + file_count = sum(1 for _ in local_pod_dir.rglob('*') if _.is_file()) + if file_count > 0: + art: Dict[str, Any] = { + "source": 
f"PVC:{pvc_name}/{pod_dir_name}", + "local_path": str(local_pod_dir), + "file_count": file_count, + "type": "pvc_collection", + "pvc_subdir": pod_dir_name, + } + if pod_names: + art["k8s_pod"] = matched_pod + results["artifacts"].append(art) + if matched_pod: + dest_hint = f"{matched_pod}/pvc" + elif pod_names: + dest_hint = f"pvc_unmapped/{pod_dir_name}" + else: + dest_hint = pod_dir_name + self.console.print( + f"[dim] ✓ Collected {file_count} files from {pod_dir_name} → {dest_hint}[/dim]" + ) + + self.console.print(f"[green]✓ Collected artifacts from PVC[/green]") + else: + hint = "" + if list_result.returncode != 0 or list_result.stderr.strip(): + hint = ( + f" (kubectl exec rc={list_result.returncode}" + + ( + f", stderr={list_result.stderr.strip()[:400]!r}" + if list_result.stderr.strip() + else "" + ) + + ")" + ) + self.console.print( + f"[yellow]⚠ No results found in PVC after retries{hint}[/yellow]" + ) + + # Cleanup collector pod + self.core_v1.delete_namespaced_pod( + collector_pod_name, self.namespace, grace_period_seconds=0 + ) + + except Exception as e: + self.console.print(f"[yellow]⚠ Could not collect from PVC: {e}[/yellow]") + + def _generate_results_summary(self, results: Dict, results_dir: Path): + """ + Generate a summary JSON of all collected artifacts. + + Args: + results: Results dict with logs and artifacts + results_dir: Directory where results are saved + """ + summary = { + "job_name": results["job_name"], + "namespace": results["namespace"], + "collected_at": datetime.now().isoformat(), + "k8s_results_layout": ( + "Per pod: //pod.log (API log) and " + "//pvc/ (mirror of /results//). " + "Unmatched PVC subdirs: /pvc_unmapped//." 
+ ), + "layout_version": 2, + "pods": len(results["logs"]), + "total_artifacts": len(results["artifacts"]), + "artifacts_by_type": {}, + "artifacts": results["artifacts"], + "successful_runs": len(results["successful_runs"]), + "failed_runs": len(results["failed_runs"]), + } + + # Group artifacts by type + for artifact in results["artifacts"]: + artifact_type = artifact.get("type", "unknown") + summary["artifacts_by_type"][artifact_type] = summary["artifacts_by_type"].get(artifact_type, 0) + 1 + + summary_file = results_dir / "results_summary.json" + summary_file.write_text(json.dumps(summary, indent=2)) + + self.console.print(f"[green]✓ Results summary: {summary_file}[/green]") + + # Print summary table if artifacts were collected + if summary["artifacts_by_type"]: + from rich.table import Table + table = Table(title="Collected Artifacts") + table.add_column("Type", style="cyan") + table.add_column("Count", justify="right", style="green") + + for artifact_type, count in sorted(summary["artifacts_by_type"].items()): + table.add_row(artifact_type, str(count)) + + self.console.print(table) + + def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: str, error_msg: str) -> Dict: + """ + Create a failure record for perf.csv when performance metrics are missing. 
+ + Args: + model_info: Model information from manifest + build_info: Build information from manifest + pod_name: Kubernetes pod name + error_msg: Error message describing the failure + + Returns: + Dict with all perf.csv fields marked as FAILED + """ + # Get topology information for failure record + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + # Launcher: use distributed.launcher when set, otherwise "native" for k8s + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") + + # Create a record with the same structure as successful runs + # but with performance=0, metric="", and status="FAILED" + result = { + # Core identification + "model": model_info.get("name", ""), + "n_gpus": str(nnodes * nproc_per_node), + "nnodes": str(nnodes), + "gpus_per_node": str(nproc_per_node), + + # Model configuration + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + + # Build information + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + + # Runtime information + "git_commit": "", + "machine_name": pod_name, + "deployment_type": "kubernetes", + "launcher": launcher, + "gpu_architecture": "", + + # Performance metrics - FAILED + "performance": "0", + "metric": error_msg, # Store error message in metric field + "relative_change": "", + "status": "FAILURE", # Use "FAILURE" to match CSV schema + + # Timing + "build_duration": build_info.get("build_duration", ""), + "test_duration": "", + + # Data information + 
"dataname": model_info.get("data", ""), + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + + # Build tracking + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags_in_place(result) + return result + + def _ensure_perf_csv_exists(self) -> None: + """Ensure perf.csv exists with standard header (same as Docker container_runner).""" + perf_csv_path = Path("perf.csv") + if not perf_csv_path.exists(): + perf_csv_path.write_text(self._PERF_CSV_HEADER + "\n", encoding="utf-8") + self.console.print("[dim]Created perf.csv with standard header[/dim]") + + def _build_perf_entry_from_aggregated( + self, + aggregated_record: Dict[str, Any], + model_info: Dict[str, Any], + build_info: Dict[str, Any], + deployment_id: str, + ) -> Dict[str, Any]: + """Build full run_details dict from aggregated record for perf_entry and update_* pipeline.""" + from madengine.utils.config_parser import ConfigParser + + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") + test_duration = aggregated_record.get("test_duration") or aggregated_record.get("duration", "") + run_details = { + "model": model_info.get("name", aggregated_record.get("model", "")), + "n_gpus": str(aggregated_record.get("n_gpus", nnodes * nproc_per_node)), + "nnodes": str(aggregated_record.get("nnodes", nnodes)), + "gpus_per_node": str(aggregated_record.get("gpus_per_node", nproc_per_node)), + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + 
"docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": "kubernetes", + "launcher": launcher, + "gpu_architecture": aggregated_record.get("gpu_architecture", ""), + "performance": str(aggregated_record.get("performance", "")), + "metric": aggregated_record.get("metric", ""), + "relative_change": "", + "status": aggregated_record.get("status", "SUCCESS"), + "build_duration": build_info.get("build_duration", ""), + "test_duration": test_duration, + "dataname": aggregated_record.get("data_name", model_info.get("data", "")), + "data_provider_type": aggregated_record.get("data_provider", ""), + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags_in_place(run_details) + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = scripts_base_dir_from(scripts_path) + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + run_details["configs"] = config_parser.parse_and_load( + model_info.get("args", ""), scripts_path + ) + except Exception: + run_details["configs"] = None + return run_details + + def _build_common_info_dict( + self, + model_info: Dict, + build_info: Dict, + deployment_id: str, + gpu_architecture: str = "", + ) -> Dict: + """ + Build common_info dict for update_perf_csv / update_perf_super (Docker-compatible). + Same shape as container_runner create_run_details_dict; model/performance/metric + are omitted so they are filled from the multiple_results CSV. 
+ """ + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + total_gpus = nnodes * nproc_per_node + gpus_per_node = str(nproc_per_node) + nnodes_str = str(nnodes) + # Launcher: use distributed.launcher when set, otherwise "native" for k8s + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") + result = { + "n_gpus": str(total_gpus), + "nnodes": nnodes_str, + "gpus_per_node": gpus_per_node, + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": "kubernetes", + "launcher": launcher, + "gpu_architecture": gpu_architecture, + "relative_change": "", + "build_duration": build_info.get("build_duration", ""), + "test_duration": "", + "dataname": model_info.get("data", ""), + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags_in_place(result) + return result + + def _create_multiple_result_row_record( + self, + model_info: Dict, + build_info: Dict, + deployment_id: str, + item: Dict, + ) -> Dict: + """ + Build one perf.csv row for a single row from a multiple_results CSV. + Same shape as _create_failure_record but with SUCCESS and item's performance/metric/model. 
+ """ + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + + # Launcher: use distributed.launcher when set, otherwise "native" for k8s + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") + result = { + "model": item.get("model", model_info.get("name", "")), + "n_gpus": str(nnodes * nproc_per_node), + "nnodes": str(nnodes), + "gpus_per_node": str(nproc_per_node), + "training_precision": model_info.get("training_precision", ""), + "pipeline": get_pipeline(), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": "", + "machine_name": deployment_id, + "deployment_type": "kubernetes", + "launcher": launcher, + "gpu_architecture": item.get("gpu_architecture", ""), + "performance": str(item.get("performance", "")), + "metric": item.get("metric", ""), + "relative_change": "", + "status": "SUCCESS", + "build_duration": build_info.get("build_duration", ""), + "test_duration": item.get("duration", ""), + "dataname": model_info.get("data", ""), + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": get_build_number(), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + flatten_tags_in_place(result) + return result + + def _parse_multiple_results_from_artifacts( + self, + results_dir: Path, + results: Dict, + model_info: Dict, + build_info: Dict, + ) -> List[Dict]: + """ + Parse performance from a multiple_results CSV (e.g. perf_dummy.csv) collected from PVC. 
+ Used when the model only writes CSV and does not print 'performance: X Y' to the log + (same contract as local container_runner multiple_results handling). + + Returns: + List of perf_data dicts (same shape as _parse_node_performance), or empty list. + """ + import csv as csv_module + multiple_results_file = model_info.get("multiple_results") + filename = Path(multiple_results_file).name if multiple_results_file else None + # Try to get gpu_architecture from first pod log + gpu_arch = "N/A" + if results.get("logs"): + log_content = results["logs"][0].get("log", "") + gpu_arch_match = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", log_content) + if gpu_arch_match: + gpu_arch = gpu_arch_match.group(1) + parsed_list = [] + for art in results.get("artifacts", []): + if art.get("type") != "pvc_collection": + continue + local_path = Path(art.get("local_path", "")) + if not local_path.is_dir(): + continue + # Prefer exact filename (same as Docker multiple_results); fallback to any perf_*.csv + csv_path = (local_path / filename) if filename else None + if not csv_path or not csv_path.is_file(): + perf_csvs = sorted(local_path.glob("perf_*.csv")) + csv_path = perf_csvs[0] if perf_csvs else None + if not csv_path or not csv_path.is_file(): + continue + try: + with open(csv_path, "r", encoding="utf-8", errors="ignore") as f: + reader = csv_module.DictReader(f) + reader.fieldnames = [f.strip() for f in (reader.fieldnames or [])] + if not reader.fieldnames or "performance" not in reader.fieldnames or "metric" not in reader.fieldnames: + continue + for row_idx, row in enumerate(reader): + perf_val = row.get("performance", "").strip() + metric_val = row.get("metric", "").strip() + if not perf_val or not metric_val: + continue + try: + perf_float = float(perf_val) + except (ValueError, TypeError): + continue + # Same model naming as local handle_multiple_results: model_name + "_" + str(model) + row_model = row.get("model", row_idx) + display_model = 
f"{model_info.get('name')}_{row_model}" + parsed_list.append({ + "model": display_model, + "performance": perf_float, + "metric": metric_val, + "node_id": row_idx, + "local_gpus": 1, + "duration": "N/A", + "gpu_architecture": gpu_arch, + "data_name": "N/A", + "data_provider": "N/A", + }) + if parsed_list: + self.console.print( + f"[green] ✓ Parsed performance from {csv_path.name} ({len(parsed_list)} row(s))[/green]" + ) + return parsed_list + except Exception as e: + self.console.print( + f"[dim] Could not parse {csv_path.name} from PVC: {e}[/dim]" + ) + return [] + + def _aggregation_for_extra_column(self, column_name: str) -> str: + """ + Return how to aggregate an extra CSV column when merging multi-node results. + Best practice: throughput/counts -> sum; latencies/utilization -> average; + duration/capacity -> max; identifiers -> first. + """ + col = column_name.lower().strip() + # Sum: counts, totals, throughput-like + if any(k in col for k in [ + "count", "total", "samples", "tokens", "throughput", + "requests", "images", "bandwidth", "ops" + ]): + return "sum" + # Average: rates per unit, utilization, ratios + if any(k in col for k in [ + "utilization", "usage", "percent", "ratio", "latency", + "time_ms", "ttft", "tpot", "accuracy", "loss" + ]): + return "average" + # Max: duration (slowest node), memory, capacity + if any(k in col for k in [ + "duration", "time", "seconds", "memory", "bytes", "mb", "gb" + ]): + return "max" + return "first" + + def _merge_multi_node_multiple_results_csv( + self, csv_paths: List[Path], output_path: Path + ) -> bool: + """ + Merge multiple pod multiple_results CSVs into one with sum/average rules. + Rows are aligned by index (row 0 from each pod -> one merged row 0). + - performance: aggregated by _determine_aggregation_method(metric) (sum or average). + - Other numeric columns: by _aggregation_for_extra_column (sum/average/max). + - model, metric: taken from first CSV. 
+ """ + import csv as csv_module + import statistics + + required = ["model", "performance", "metric"] + rows_by_index: Dict[int, List[Dict]] = {} + + for path in csv_paths: + try: + with open(path, "r", encoding="utf-8", errors="ignore") as f: + reader = csv_module.DictReader(f) + fieldnames = [c.strip() for c in (reader.fieldnames or [])] + if not all(h in fieldnames for h in required): + continue + for idx, row in enumerate(reader): + row = {k.strip(): v for k, v in row.items() if k} + if not row.get("performance") or not row.get("metric"): + continue + try: + float(str(row["performance"]).strip()) + except (ValueError, TypeError): + continue + if idx not in rows_by_index: + rows_by_index[idx] = [] + rows_by_index[idx].append(row) + except Exception as e: + self.console.print(f"[dim] Could not read {path.name}: {e}[/dim]") + continue + + if not rows_by_index: + return False + + # Build union of columns (required first, then rest) + extra_cols = set() + for group in rows_by_index.values(): + for row in group: + extra_cols.update(k for k in row if k not in required) + all_columns = list(required) + sorted(extra_cols) + merged_rows = [] + for idx in sorted(rows_by_index.keys()): + group = rows_by_index[idx] + first = group[0] + metric_name = (first.get("metric") or "").strip() + perf_agg = self._determine_aggregation_method(metric_name) + perf_values = [] + for r in group: + try: + perf_values.append(float(str(r.get("performance", "")).strip())) + except (ValueError, TypeError): + pass + if not perf_values: + continue + if perf_agg == "sum": + performance = sum(perf_values) + elif perf_agg == "average": + performance = statistics.mean(perf_values) + elif perf_agg == "max": + performance = max(perf_values) + else: + performance = sum(perf_values) + merged = { + "model": first.get("model", ""), + "performance": performance, + "metric": first.get("metric", ""), + } + for col in all_columns: + if col in merged: + continue + values = [r.get(col) for r in group] + try: + 
nums = [float(str(v).strip()) for v in values if v is not None and str(v).strip()] + except (ValueError, TypeError): + nums = [] + if nums: + extra_agg = self._aggregation_for_extra_column(col) + if extra_agg == "sum": + merged[col] = sum(nums) + elif extra_agg == "average": + merged[col] = statistics.mean(nums) + elif extra_agg == "max": + merged[col] = max(nums) + else: + merged[col] = first.get(col, "") + else: + merged[col] = first.get(col, "") + merged_rows.append(merged) + + if not merged_rows: + return False + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w", newline="", encoding="utf-8") as f: + writer = csv_module.DictWriter(f, fieldnames=all_columns, extrasaction="ignore") + writer.writeheader() + writer.writerows(merged_rows) + self.console.print( + f"[green] ✓ Merged {len(csv_paths)} pod CSV(s) into {len(merged_rows)} row(s) → {output_path.name}[/green]" + ) + return True + + def _resolve_multiple_results_csv( + self, results_dir: Path, results: Dict, model_info: Dict + ) -> Optional[Path]: + """ + Resolve path to a single multiple_results CSV for update_perf_csv. + Single pod: return that CSV path. Multi-pod: merge all pod CSVs with + sum/average rules and return path to merged file. 
+ """ + multiple_results_file = model_info.get("multiple_results") + filename = Path(multiple_results_file).name if multiple_results_file else None + csv_paths: List[Path] = [] + for art in results.get("artifacts", []): + if art.get("type") != "pvc_collection": + continue + local_path = Path(art.get("local_path", "")) + if not local_path.is_dir(): + continue + csv_path = (local_path / filename) if filename else None + if not csv_path or not csv_path.is_file(): + perf_csvs = sorted(local_path.glob("perf_*.csv")) + csv_path = perf_csvs[0] if perf_csvs else None + if csv_path and csv_path.is_file(): + csv_paths.append(csv_path) + if not csv_paths: + return None + if len(csv_paths) == 1: + return csv_paths[0] + merged_path = results_dir / "multiple_results_merged.csv" + if self._merge_multi_node_multiple_results_csv(csv_paths, merged_path): + return merged_path + return csv_paths[0] diff --git a/src/madengine/deployment/k8s_scripts.py b/src/madengine/deployment/k8s_scripts.py new file mode 100644 index 00000000..2583c822 --- /dev/null +++ b/src/madengine/deployment/k8s_scripts.py @@ -0,0 +1,352 @@ +""" +Kubernetes script and tool loading mixin. + +Handles loading madengine common scripts, tool wrapper scripts, and Primus +experiment files for embedding into Kubernetes ConfigMaps. Since madengine +is not installed inside model Docker images, these scripts must be bundled +into the ConfigMap so the init container can recreate the expected layout. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+""" + +import json +import re +from pathlib import Path +from typing import Dict, List + +from madengine.utils.path_utils import get_madengine_root + +from .primus_backend import ( + infer_primus_examples_overlay_subdirs, + merged_primus_config, +) + + +class KubernetesScriptsMixin: + """Script and tool loading for Kubernetes ConfigMap embedding.""" + + def gather_system_env_details( + self, pre_scripts: List[Dict], model_name: str + ) -> None: + """ + Gather system environment details by adding rocEnvTool to pre-scripts. + + This ensures K8s deployment collects the same system info as local execution. + + Args: + pre_scripts: List of pre-script configurations + model_name: The model name (used for output file naming) + """ + pre_env_details = { + "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", + "args": model_name.replace("/", "_") + "_env" + } + pre_scripts.append(pre_env_details) + self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]") + + def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: + """ + Add tool pre/post scripts to execution lists (similar to local execution). + + Extracts pre_scripts and post_scripts from tools.json definitions and adds them + to the pre_scripts and post_scripts lists for execution in K8s pods. 
+ + Args: + pre_scripts: List to append tool pre-scripts to + post_scripts: List to append tool post-scripts to + """ + tools_config = self._get_tools_config() + if not tools_config: + return + + tools_json_path = get_madengine_root() / "scripts" / "common" / "tools.json" + if not tools_json_path.exists(): + return + + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + + for tool in tools_config: + tool_name = tool.get("name") + if not tool_name or tool_name not in tools_definitions.get("tools", {}): + continue + + tool_def = tools_definitions["tools"][tool_name] + + if "pre_scripts" in tool_def: + pre_scripts[:0] = tool_def["pre_scripts"] + + if "post_scripts" in tool_def: + post_scripts.extend(tool_def["post_scripts"]) + + def _load_common_scripts(self, script_list: List[Dict]) -> Dict[str, str]: + """ + Load common script contents from madengine package for embedding in ConfigMap. + + Since madengine is not installed in model Docker images, we need to embed + the common scripts (pre_scripts, post_scripts, and tool wrapper scripts) in the ConfigMap. 
+ + Args: + script_list: List of script configurations with 'path' field + + Returns: + Dict mapping relative script paths to their contents + """ + script_contents = {} + madengine_root = get_madengine_root() + + for script_config in script_list: + script_path = script_config.get("path", "") + if not script_path: + continue + + abs_script_path = madengine_root / script_path + + if abs_script_path.exists() and abs_script_path.is_file(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded common script: {script_path}[/dim]") + + if "run_rocenv_tool.sh" in script_path: + rocenv_dir = abs_script_path.parent / "rocEnvTool" + if rocenv_dir.exists() and rocenv_dir.is_dir(): + for py_file in rocenv_dir.glob("*.py"): + rel_path = f"scripts/common/pre_scripts/rocEnvTool/{py_file.name}" + with open(py_file, "r") as f: + script_contents[rel_path] = f.read() + self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") + + for json_file in rocenv_dir.glob("*.json"): + rel_path = f"scripts/common/pre_scripts/rocEnvTool/{json_file.name}" + with open(json_file, "r") as f: + script_contents[rel_path] = f.read() + self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") + else: + self.console.print(f"[yellow]Warning: Script not found: {script_path} (at {abs_script_path})[/yellow]") + + tools_config = self._get_tools_config() + if tools_config: + self._load_tool_wrapper_scripts(script_contents, tools_config, madengine_root) + + return script_contents + + def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], + tools_config: List[Dict], madengine_root: Path) -> None: + """ + Load tool wrapper scripts and tools.json for K8s ConfigMap. + + This enables profiling tools like rocprof to work in K8s deployments. 
+ + Args: + script_contents: Dict to populate with script contents + tools_config: List of tool configurations from manifest + madengine_root: Path to madengine package root + """ + tools_json_path = madengine_root / "scripts" / "common" / "tools.json" + if tools_json_path.exists(): + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + script_contents["scripts/common/tools.json"] = json.dumps(tools_definitions, indent=2) + self.console.print(f"[dim]Loaded tools.json[/dim]") + else: + self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + return + + for tool in tools_config: + tool_name = tool.get("name") + if not tool_name: + continue + + if tool_name not in tools_definitions.get("tools", {}): + self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + continue + + tool_def = tools_definitions["tools"][tool_name] + + cmd = tool.get("cmd", tool_def.get("cmd", "")) + + if "scripts/common/tools/" in cmd: + parts = cmd.split() + for part in parts: + if "scripts/common/tools/" in part: + script_rel_path = part.replace("../", "") + abs_script_path = madengine_root / script_rel_path + + if abs_script_path.exists() and abs_script_path.is_file(): + with open(abs_script_path, "r") as f: + script_contents[script_rel_path] = f.read() + self.console.print(f"[dim]Loaded tool script: {script_rel_path}[/dim]") + + if script_rel_path.endswith('.py'): + tools_dir = abs_script_path.parent + utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + for util_file in utility_modules: + util_path = tools_dir / util_file + if util_path.exists(): + util_rel_path = f"scripts/common/tools/{util_file}" + if util_rel_path not in script_contents: + with open(util_path, "r") as f: + script_contents[util_rel_path] = f.read() + self.console.print(f"[dim]Loaded tool utility module: {util_rel_path}[/dim]") + else: + self.console.print(f"[yellow]Warning: Tool script not found: 
{script_rel_path} (at {abs_script_path})[/yellow]") + break + + for script_config in tool_def.get("pre_scripts", []): + script_path = script_config.get("path", "") + if script_path and script_path not in script_contents: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded tool pre-script: {script_path}[/dim]") + + for script_config in tool_def.get("post_scripts", []): + script_path = script_config.get("path", "") + if script_path and script_path not in script_contents: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded tool post-script: {script_path}[/dim]") + + for script_config in tool_def.get("pre_scripts", []): + script_path = script_config.get("path", "") + if script_path: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + with open(abs_script_path, "r") as f: + script_content = f.read() + tool_refs = re.findall(r'(?:\.\./)?scripts/common/tools/[\w_]+\.py', script_content) + for tool_ref in tool_refs: + tool_script_path = tool_ref.strip('"\'').replace("../", "") + abs_tool_path = madengine_root / tool_script_path + + if abs_tool_path.exists() and tool_script_path not in script_contents: + with open(abs_tool_path, "r") as tf: + script_contents[tool_script_path] = tf.read() + self.console.print(f"[dim]Loaded tool dependency: {tool_script_path}[/dim]") + + if tool_script_path.endswith('.py'): + tools_dir = abs_tool_path.parent + utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + for util_file in utility_modules: + util_path = tools_dir / util_file + if util_path.exists(): + util_rel_path = f"scripts/common/tools/{util_file}" + if util_rel_path not in script_contents: + with open(util_path, "r") as uf: + 
def _bundle_primus_k8s_examples_overlay(
    self, model_scripts_contents: Dict[str, str], model_name: str = ""
) -> None:
    """
    Copy Primus experiment files from ``./scripts/Primus`` into the ConfigMap payload.

    Every bundled file is stored in ``model_scripts_contents`` (mutated in place)
    under a key of the form ``Primus/<path relative to scripts/Primus>`` - not
    ``scripts/Primus/...``. The init container writes paths like
    ``/workspace/Primus/examples/...``, matching ``PRIMUS_ROOT=/workspace/Primus``
    in the Primus Dockerfile. The Job volume hides image layers under
    ``/workspace``, so this bundle is what makes K8s runs work.

    Bundled when present, in this order:

    - ``requirements.txt`` (repo root; ``pip install -r`` from ``run_pretrain.sh``)
    - ``examples/scripts/`` (``prepare_experiment.py``, NCCL helper shells, etc.)
    - ``examples/run_pretrain.sh``
    - each backend subtree inferred from ``distributed.primus.config_path``
      (torchtitan, megatron, MaxText, ...)

    Files that cannot be read as strict UTF-8 are skipped with a console note,
    because the ConfigMap payload is text.

    Args:
        model_scripts_contents: Mapping of ConfigMap key -> file text, extended in place.
        model_name: Model name forwarded to the subtree inference helper.
    """
    manifest = getattr(self, "manifest", None)
    if not isinstance(manifest, dict):
        manifest = None
    primus_cfg = merged_primus_config(manifest, self.config.additional_context)
    overlay_subdirs = infer_primus_examples_overlay_subdirs(
        primus_cfg.get("config_path") or "",
        backend_hint=(primus_cfg.get("backend") or "").strip(),
        model_name=model_name or "",
    )

    cwd = Path.cwd()
    primus_repo = cwd / "scripts" / "Primus"
    if not primus_repo.is_dir():
        self.console.print(
            f"[yellow]Primus K8s: {primus_repo} not found — skipping Primus ConfigMap bundle.[/yellow]"
        )
        return

    def bundle_file(host_file: Path) -> bool:
        # Returns True only when the file was actually added to the payload.
        try:
            text = host_file.read_text(encoding="utf-8", errors="strict")
        except (UnicodeDecodeError, OSError):
            self.console.print(
                f"[dim]Skipping non-text Primus file for K8s bundle: {host_file}[/dim]"
            )
            return False
        key = str(Path("Primus") / host_file.relative_to(primus_repo))
        model_scripts_contents[key] = text
        return True

    def bundle_tree(root: Path) -> int:
        # Recursively bundle every regular file under ``root``; returns how many were added.
        return sum(
            1 for entry in root.rglob("*") if entry.is_file() and bundle_file(entry)
        )

    requirements = primus_repo / "requirements.txt"
    if requirements.is_file() and bundle_file(requirements):
        self.console.print("[dim]Primus K8s: bundled Primus/requirements.txt[/dim]")

    examples_dir = primus_repo / "examples"

    helper_scripts = examples_dir / "scripts"
    if helper_scripts.is_dir():
        bundled = bundle_tree(helper_scripts)
        self.console.print(
            f"[dim]Primus K8s: bundled Primus/examples/scripts for ConfigMap ({bundled} files)[/dim]"
        )

    run_pretrain = examples_dir / "run_pretrain.sh"
    if run_pretrain.is_file() and bundle_file(run_pretrain):
        self.console.print("[dim]Primus K8s: bundled Primus/examples/run_pretrain.sh[/dim]")

    for sub in overlay_subdirs:
        subtree = examples_dir / sub
        if not subtree.is_dir():
            self.console.print(
                f"[yellow]Primus K8s: scripts/Primus/examples/{sub} not found under {cwd} — "
                "skipping that subtree.[/yellow]"
            )
            continue
        bundled = bundle_tree(subtree)
        self.console.print(
            f"[dim]Primus K8s: bundled Primus/examples/{sub} for ConfigMap ({bundled} files)[/dim]"
        )
def _load_k8s_tools(self, tools_file=None) -> Dict:
    """
    Load the K8s-specific tools configuration.

    Loading is best-effort: a missing, unreadable, malformed, or non-object
    JSON file produces a console warning and an empty configuration instead
    of an exception.

    Args:
        tools_file: Optional path to the tools JSON file. When omitted, the
            packaged ``scripts/k8s/tools.json`` (relative to the parent of
            this module's directory) is used.

    Returns:
        Dict with K8s tools configuration, or ``{}`` when it cannot be loaded.
    """
    if tools_file is None:
        k8s_tools_file = Path(__file__).parent.parent / "scripts" / "k8s" / "tools.json"
    else:
        k8s_tools_file = Path(tools_file)

    if not k8s_tools_file.exists():
        self.console.print(f"[yellow]Warning: K8s tools.json not found at {k8s_tools_file}[/yellow]")
        return {}

    try:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(k8s_tools_file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        self.console.print(f"[yellow]Warning: Failed to load K8s tools config: {e}[/yellow]")
        return {}

    if not isinstance(loaded, dict):
        # Callers use dict methods on the result; a list/scalar root would crash them.
        self.console.print(
            f"[yellow]Warning: K8s tools config at {k8s_tools_file} is not a JSON object; ignoring it[/yellow]"
        )
        return {}

    return loaded
def _prepare_template_context(
    self, model_info: Dict, image_info: Dict
) -> Dict[str, Any]:
    """
    Prepare context dictionary for Jinja2 template rendering.

    Besides building the context this method has side effects on ``self``:
    it stores ``self._data_config`` and ``self._nnodes`` for later use by
    ``deploy()``, and sets ``self.k8s_config["data_pvc"]`` to
    ``"madengine-shared-data"`` when a data provider is configured but no
    PVC name was given.

    Args:
        model_info: Model configuration from build_manifest.json
        image_info: Image information from build_manifest.json

    Returns:
        Context dictionary with all template variables

    Raises:
        ValueError: If ``nnodes`` / ``nproc_per_node`` are not positive integers
            for a launcher that requires them, or if the SGLang disaggregated
            launcher is requested with fewer than 3 nodes.
        ConfigurationError: If the estimated ConfigMap payload exceeds
            ``CONFIGMAP_MAX_BYTES``.
    """
    # Use hierarchical GPU resolution: runtime > deployment > model > default
    additional_context = self.config.additional_context.copy()
    additional_context["k8s"] = self.k8s_config
    gpu_count = resolve_runtime_gpus(model_info, additional_context)
    model_name = model_info["name"]

    # Load manifest and credential content for ConfigMap
    with open(self.config.manifest_file, "r") as f:
        manifest_content = f.read()

    credential_content = "{}"
    credential_path = Path("credential.json")
    if credential_path.exists():
        with open(credential_path, "r") as f:
            credential_content = f.read()

    # Load data.json content if exists
    data_json_content = None
    data_path = Path("data.json")
    if data_path.exists():
        with open(data_path, "r") as f:
            data_json_content = f.read()
        self.console.print("[dim]Loaded data.json[/dim]")

    # Load model scripts directory content (entire folder, not just one file).
    # This matches local execution which mounts the entire MODEL_DIR/scripts folder.
    model_script_path = model_info.get("scripts")  # e.g., "scripts/dummy/run_data_minio.sh"
    model_script_dir = None
    model_script_filename = None
    model_scripts_contents = {}  # ConfigMap key -> file text for every bundled script

    if model_script_path:
        script_file = Path(model_script_path)
        model_script_dir = str(script_file.parent)  # e.g., "scripts/dummy"
        model_script_filename = script_file.name  # e.g., "run_data_minio.sh"

        # Bundle the whole directory recursively so models with extra files or
        # sub-directories (vllm, sglang, ...) work the same as locally.
        scripts_dir_path = Path(model_script_dir)
        if scripts_dir_path.exists() and scripts_dir_path.is_dir():
            cwd = Path.cwd()
            for entry in scripts_dir_path.rglob("*"):
                if not entry.is_file():
                    continue
                try:
                    content = entry.read_text(encoding="utf-8", errors="strict")
                except (UnicodeDecodeError, OSError):
                    # Skip binary or unreadable files (ConfigMap is text-only)
                    self.console.print(
                        f"[dim]Skipping non-text file: {entry.relative_to(scripts_dir_path)}[/dim]"
                    )
                    continue
                relative_path = (
                    str(entry.relative_to(cwd)) if entry.is_absolute() else str(entry)
                )
                model_scripts_contents[relative_path] = content
            self.console.print(
                f"[dim]Loaded {len(model_scripts_contents)} file(s) from {model_script_dir}[/dim]"
            )
        elif script_file.exists():
            # Fallback: load single file if directory doesn't exist
            with open(script_file, "r") as f:
                model_scripts_contents[model_script_path] = f.read()
            self.console.print(f"[dim]Loaded single script: {model_script_path}[/dim]")
        else:
            self.console.print(f"[yellow]Warning: Script not found: {model_script_path}[/yellow]")

    # Load K8s tools configuration
    k8s_tools_config = self._load_k8s_tools()

    # Prepare data configuration first and keep it for deploy()
    data_config = self._prepare_data_config(model_info)
    self._data_config = data_config

    # K8s best practice: separate compute (pods) from storage (PVC). When the
    # model needs data and no PVC was named, a shared one is auto-created at
    # deploy time under a stable name so it can be reused across runs.
    if data_config and not self.k8s_config.get("data_pvc"):
        self.console.print(
            "[cyan]📦 Data provider detected: Will auto-create shared data PVC[/cyan]"
        )
        self.console.print(
            "[dim] PVC name: madengine-shared-data (reusable across runs)[/dim]"
        )
        self.console.print(
            "[dim] Access mode: RWO for single-node, RWX for multi-node (auto-selected)[/dim]"
        )
        self.console.print(
            "[dim] To use existing PVC, add 'data_pvc' to your K8s config[/dim]"
        )
        # Set PVC name now so templates are rendered with correct value
        self.k8s_config["data_pvc"] = "madengine-shared-data"

    # Determine data provider script if model needs data
    data_provider_script = None
    data_provider_script_content = None
    if data_config:
        provider_type = data_config.get("provider_type", "local")
        if provider_type in k8s_tools_config.get("data_providers", {}):
            data_provider_script = k8s_tools_config["data_providers"][provider_type]

            # Load K8s data provider script content
            k8s_script_path = get_madengine_root() / data_provider_script["script"]
            if k8s_script_path.exists():
                with open(k8s_script_path, "r") as f:
                    data_provider_script_content = f.read()
                self.console.print(f"[dim]Loaded K8s data provider: {data_provider_script['script']}[/dim]")
            else:
                self.console.print(f"[yellow]Warning: K8s script not found: {k8s_script_path}[/yellow]")

    # Launcher settings: runtime additional_context["launcher"] overrides the
    # manifest's deployment_config["distributed"]. Explicit None checks are
    # used so that falsy-but-set values are not silently replaced.
    deployment_config = self.manifest.get("deployment_config", {})
    distributed_config = deployment_config.get("distributed", {})
    launcher_config = self.config.additional_context.get("launcher", {})

    launcher_type = launcher_config.get("type")
    if launcher_type is None:
        launcher_type = distributed_config.get("launcher")

    nnodes = launcher_config.get("nnodes")
    if nnodes is None:
        nnodes = distributed_config.get("nnodes", 1)

    # Store for use in deploy() method
    self._nnodes = nnodes

    nproc_per_node = launcher_config.get("nproc_per_node")
    if nproc_per_node is None:
        nproc_per_node = distributed_config.get("nproc_per_node")
    if nproc_per_node is None:
        nproc_per_node = int(model_info.get("n_gpus", 1))

    master_port = launcher_config.get("master_port", 29500)
    if master_port is None:
        # An explicit null in the config means "unset"; fall back to the default.
        master_port = 29500

    # Launchers whose topology is validated up front, with their display names.
    validated_launchers = {
        "torchrun": "torchrun",
        "deepspeed": "DeepSpeed",
        "torchtitan": "TorchTitan",
        "vllm": "vLLM",
        "sglang": "SGLang",
        "megatron": "Megatron-LM",
        "primus": "Primus",
    }
    launcher_label = validated_launchers.get(launcher_type)
    if launcher_label is not None:
        if not isinstance(nnodes, int) or nnodes < 1:
            raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1")
        if not isinstance(nproc_per_node, int) or nproc_per_node < 1:
            raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1")

        self.console.print(
            f"[cyan]Configuring {launcher_label}: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]"
        )
        if launcher_type == "primus":
            # Primus needs its example/config files in the ConfigMap as well.
            self._bundle_primus_k8s_examples_overlay(model_scripts_contents, model_name)

    # Determine if we need multi-node setup
    create_headless_service = False
    launcher_command = None

    # Message printed when a multi-node run of the launcher needs a headless service.
    headless_messages = {
        "torchrun": "[dim]Multi-node detected: Creating headless service for pod discovery[/dim]",
        "deepspeed": "[dim]Multi-node DeepSpeed: Creating headless service for pod discovery[/dim]",
        "torchtitan": "[dim]Multi-node TorchTitan: Creating headless service for pod discovery[/dim]",
        "vllm": "[dim]Multi-node vLLM: Creating headless service for Ray cluster[/dim]",
        "sglang": "[dim]Multi-node SGLang: Creating headless service for Ray cluster[/dim]",
        "megatron": "[dim]Multi-node Megatron-LM: Creating headless service for pod discovery[/dim]",
        "primus": "[dim]Multi-node Primus: Creating headless service for pod discovery[/dim]",
    }
    if launcher_type in headless_messages and nnodes > 1:
        create_headless_service = True
        self.console.print(headless_messages[launcher_type])

    model_script = model_info.get("scripts", "run.sh")
    # Arguments shared by every launcher command generator.
    launch_args = {
        "nnodes": nnodes,
        "nproc_per_node": nproc_per_node,
        "master_port": master_port,
        "model_script": model_script,
    }

    if launcher_type == "torchrun":
        launcher_command = self._generate_torchrun_command(**launch_args)

    elif launcher_type == "deepspeed":
        if model_script.endswith('.sh'):
            # A bash script handles the launcher internally - execute it directly.
            self.console.print(f"[dim]Detected bash script ({model_script}), will execute directly[/dim]")
            launcher_command = self._generate_bash_script_command(**launch_args)
        else:
            # Python script - use DeepSpeed launcher
            launcher_command = self._generate_deepspeed_command(**launch_args)

    elif launcher_type == "torchtitan":
        launcher_command = self._generate_torchtitan_command(**launch_args)

    elif launcher_type == "vllm":
        # Pass model args so run.sh gets --model_repo etc.
        launcher_command = self._generate_vllm_command(
            model_args=model_info.get("args", ""), **launch_args
        )

    elif launcher_type == "sglang":
        # Pass model args so run.sh gets CLI args
        launcher_command = self._generate_sglang_command(
            model_args=model_info.get("args", ""), **launch_args
        )

    elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg":
        if nnodes < 3:
            raise ValueError(
                f"SGLang Disaggregated requires minimum 3 nodes "
                f"(1 proxy + 1 prefill + 1 decode), got {nnodes}"
            )

        # Always create headless service for disaggregated architecture
        create_headless_service = True
        self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]")
        self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]")

        launcher_command = self._generate_sglang_disagg_command(**launch_args)

    elif launcher_type == "megatron":
        launcher_command = self._generate_megatron_command(**launch_args)

    elif launcher_type == "primus":
        # Primus launcher command is env-only: PRIMUS_CONFIG_PATH, PRIMUS_CLI_EXTRA
        launcher_command = self._generate_primus_command(
            model_args=model_info.get("args", "") or "",
            model_name=model_info.get("name", "") or "",
            **launch_args,
        )
        primus_cfg = merged_primus_config(self.manifest, self.config.additional_context)
        backend_hint = (primus_cfg.get("backend") or "").strip().lower()
        inferred_backend = infer_primus_backend_from_model_name(
            model_info.get("name", "") or ""
        )
        config_path_lower = (primus_cfg.get("config_path") or "").lower()
        looks_maxtext = (
            backend_hint == "maxtext"
            or inferred_backend == "MaxText"
            or "maxtext" in config_path_lower
        )
        if looks_maxtext and nnodes > 1:
            self.console.print(
                "[yellow]Warning: Primus MaxText multi-node may run in-container apt installs "
                "(InfiniBand-related packages) inside run_pretrain.sh. Ensure your image or "
                "cluster policy allows this, or use a pre-baked image.[/yellow]"
            )

    # Prepare pre/post scripts (similar to local execution)
    pre_scripts = []
    post_scripts = []

    # Get pre/post scripts from manifest context if available
    if "context" in self.manifest:
        manifest_context = self.manifest["context"]
        if "pre_scripts" in manifest_context:
            pre_scripts.extend(manifest_context["pre_scripts"])
        if "post_scripts" in manifest_context:
            post_scripts.extend(manifest_context["post_scripts"])

    # Add system environment collection (rocEnvTool) - same as local execution.
    # Controlled by the generate_sys_env_details flag (default: True).
    if self.config.additional_context.get("generate_sys_env_details", True):
        self.gather_system_env_details(pre_scripts, model_info["name"])

    # Add tool pre/post scripts to the execution lists (like local execution)
    self._add_tool_scripts(pre_scripts, post_scripts)

    # Load pre/post script contents for ConfigMap (since madengine not installed in container)
    pre_post_script_contents = self._load_common_scripts(pre_scripts + post_scripts)

    merged_sec = merge_secrets_config(self.k8s_config)
    strategy = merged_sec.get("strategy", SECRETS_STRATEGY_FROM_LOCAL)
    from_local = strategy == SECRETS_STRATEGY_FROM_LOCAL
    cred_path = Path("credential.json")
    cred_exists = cred_path.exists()

    # Names of registry pull secrets that will be created from credential.json.
    created_pull_preview: List[str] = []
    if cred_exists and from_local:
        try:
            parsed = json.loads(cred_path.read_text(encoding="utf-8"))
            if build_registry_secret_data(parsed):
                created_pull_preview.append(f"{self.job_name}-registry-pull")
        except (json.JSONDecodeError, OSError):
            # Best-effort preview: an unreadable/invalid credential.json simply
            # yields no generated pull secret here.
            pass

    # Embed credentials in the ConfigMap only when using the local strategy
    # and there is no credential.json to build a Secret from.
    include_credential_in_configmap = from_local and not cred_exists

    created_runtime_name: Optional[str] = (
        f"{self.job_name}-runtime" if from_local and cred_exists else None
    )
    runtime_credentials_secret_name = resolve_runtime_secret_name(
        strategy, merged_sec, created_runtime_name
    )

    image_pull_secrets = resolve_image_pull_secret_refs(
        strategy, merged_sec, created_pull_preview
    )

    # Unless explicitly configured, privileged mode follows whether any tools are configured.
    ap_prof = self.k8s_config.get("allow_privileged_profiling")
    if ap_prof is None:
        privileged_profiling = bool(self._get_tools_config())
    else:
        privileged_profiling = bool(ap_prof)

    _pytorch_native = frozenset(
        {"torchrun", "deepspeed", "torchtitan", "megatron", "primus"}
    )
    subdomain_val = (
        self.service_name
        if nnodes > 1 and launcher_type in _pytorch_native
        else None
    )

    # Build complete context
    context = {
        # Job metadata
        "job_name": self.job_name,
        "job_label": self.job_label,
        "main_container_name": getattr(
            self, "main_container_name", None
        )
        or sanitize_k8s_container_name(self.job_name),
        "namespace": self.namespace,
        "model_name": model_name,
        "model_label": sanitize_k8s_label_value(model_name),
        # ConfigMap
        "configmap_name": self.configmap_name,
        "manifest_content": manifest_content,
        "credential_content": credential_content,
        "include_credential_in_configmap": include_credential_in_configmap,
        "runtime_credentials_secret_name": runtime_credentials_secret_name,
        "image_pull_secrets": image_pull_secrets,
        "privileged_profiling": privileged_profiling,
        "ttl_seconds_after_finished": self.k8s_config.get(
            "ttl_seconds_after_finished"
        ),
        "data_json_content": data_json_content,
        "model_scripts_contents": model_scripts_contents,  # All scripts in directory
        "model_script_path": model_script_path,
        "model_script_dir": model_script_dir,
        "model_script_filename": model_script_filename,
        # K8s tools
        "data_provider_script": data_provider_script,
        "data_provider_script_content": data_provider_script_content,
        # Image
        "image": image_info["registry_image"],
        "image_pull_policy": self.k8s_config.get("image_pull_policy", "Always"),
        # Resources
        "gpu_resource_name": self.gpu_resource_name,
        "gpu_count": gpu_count,
        "memory": self.k8s_config.get("memory", "128Gi"),
        "memory_limit": self.k8s_config.get("memory_limit", "256Gi"),
        "cpu": self.k8s_config.get("cpu", "32"),
        "cpu_limit": self.k8s_config.get("cpu_limit", "64"),
        # Job spec
        "completions": nnodes,
        "parallelism": nnodes,
        "completion_mode": "Indexed" if nnodes > 1 else None,
        "backoff_limit": self.k8s_config.get("backoff_limit", 3),
        # Pod spec
        "node_selector": self.k8s_config.get("node_selector", {}),
        "tolerations": self.k8s_config.get("tolerations", []),
        "host_ipc": nnodes > 1,  # Enable for multi-node
        "subdomain": subdomain_val,
        # Execution
        "gpu_visibility": ",".join(str(i) for i in range(gpu_count)),  # e.g., "0" for 1 GPU, "0,1" for 2 GPUs
        "gpu_architecture": self.manifest.get("context", {}).get(
            "gpu_architecture", "gfx90a"
        ),
        "model_script": f"{model_info.get('scripts', 'run.sh')} {model_info.get('args', '')}".strip(),
        "launcher_type": launcher_type,
        "launcher_command": launcher_command,
        "nnodes": nnodes,
        "nproc_per_node": nproc_per_node,
        "master_port": master_port,
        "timeout": self.config.timeout,
        # Environment - Merge base env vars with data/tools env vars
        "env_vars": self._prepare_env_vars(model_info),
        # Volumes
        "results_pvc": f"{self.job_name}-results",  # Always create a PVC for results
        "pvc_name": f"{self.job_name}-results",  # PVC name for template
        "data_pvc": self.k8s_config.get("data_pvc"),
        # Multi-node: the service must expose the rendezvous port the launcher
        # commands were generated with, not a fixed default.
        "create_headless_service": create_headless_service,
        "service_name": self.service_name,
        "ports": [master_port] if create_headless_service else [],
        # Data provider configuration (already prepared above)
        "data_config": data_config,
        # Tools configuration - from manifest.context or additional_context
        "tools_config": self._get_tools_config(),
        # Tool command chains (pre-built for template)
        "launcher_tool_chain": self._build_tool_command_chain(
            self._get_tools_config(), "bash /tmp/run_launcher.sh"
        ) if launcher_command else None,
        "direct_script_tool_chain": self._build_tool_command_chain(
            self._get_tools_config(), f"bash {model_info.get('scripts', 'run.sh')}"
        ),
        # Pre/Post scripts - includes rocEnvTool and any user-defined scripts
        "pre_scripts": pre_scripts,
        "post_scripts": post_scripts,
        # Common script contents for ConfigMap (embedded since madengine not in container)
        "common_script_contents": pre_post_script_contents,
        # Multiple results file (e.g. perf_dummy.csv) - copied to PVC for K8s result collection
        "multiple_results": model_info.get("multiple_results") or "",
    }

    est = estimate_configmap_payload_bytes(context)
    if est > CONFIGMAP_MAX_BYTES:
        raise ConfigurationError(
            f"ConfigMap payload would be ~{est} bytes; Kubernetes limit is ~1 MiB. "
            "Reduce embedded scripts or use a smaller scripts directory."
        )

    return context
def _get_tools_config(self) -> List[Dict]:
    """
    Get tools configuration from manifest.context or additional_context.

    Runtime ``additional_context["tools"]`` takes priority; when it is empty
    the manifest's ``context["tools"]`` is used.

    For multi-node runs with tools configured, the list is passed through
    ``configure_multi_node_profiling``; if that reports profiling as not
    enabled, tools are dropped entirely for the run.

    The node count is resolved from the first source that provides a value:

    1. ``additional_context["distributed"]["nnodes"]``
    2. ``self._nnodes`` (set by ``_prepare_template_context``)
    3. ``additional_context["launcher"]["nnodes"]``
    4. the manifest's ``deployment_config["distributed"]["nnodes"]``

    falling back to 1. Sources 2-4 keep this method consistent with the node
    count the Job is actually rendered with.

    The result is cached on the instance so the checks and any warnings run
    only once.

    Returns:
        List of tool configurations (enriched with cmd from tools.json)
    """
    # Cache the result to avoid repeated expensive checks and duplicate warnings
    if hasattr(self, '_cached_tools_config'):
        return self._cached_tools_config

    runtime_context = self.config.additional_context

    # Check runtime additional_context first (allows runtime override)
    tools = runtime_context.get("tools", [])

    # Fall back to manifest.context if no runtime tools
    if not tools and "context" in self.manifest:
        tools = self.manifest["context"].get("tools", [])

    nnodes_candidates = (
        (runtime_context.get("distributed") or {}).get("nnodes"),
        getattr(self, "_nnodes", None),
        (runtime_context.get("launcher") or {}).get("nnodes"),
        ((self.manifest.get("deployment_config") or {}).get("distributed") or {}).get("nnodes"),
    )
    nnodes = next((value for value in nnodes_candidates if value is not None), 1)

    if nnodes > 1 and tools:
        # Adapter exposing the logger interface expected by
        # configure_multi_node_profiling on top of the rich console.
        class ConsoleLogger:
            def __init__(self, console):
                self.console = console

            def info(self, msg):
                self.console.print(f"[cyan]{msg}[/cyan]")

            def warning(self, msg):
                self.console.print(f"[yellow]{msg}[/yellow]")

            def debug(self, msg):
                pass  # Skip debug messages in console

        profiling_config = configure_multi_node_profiling(
            nnodes=nnodes,
            tools_config=tools,
            logger=ConsoleLogger(self.console)
        )

        # When profiling is not enabled for this multi-node run, skip tools entirely.
        tools = profiling_config["tools"] if profiling_config["enabled"] else []

    # Enrich tools with cmd from tools.json for K8s template usage
    result = self._enrich_tools_with_cmd(tools)

    # Cache the result for subsequent calls
    self._cached_tools_config = result
    return result
def _build_tool_command_chain(self, tools_config: List[Dict], base_command: str) -> str:
    """
    Build a command chain from multiple tools, wrapping the base command.

    Each tool's ``cmd`` is prepended to the command, so the first tool in
    ``tools_config`` ends up outermost and the last tool sits directly in
    front of the base command::

        tool_1 tool_2 ... tool_n base_command

    Tools without a ``cmd`` are ignored. ``../scripts/common/`` in a tool
    command is rewritten to ``scripts/common/``. If a tool defines
    ``OUTPUT_FILE`` in its ``env_vars``, the assignment is placed inline in
    front of that tool's command only, so tools do not overwrite each
    other's output file.

    Args:
        tools_config: List of enriched tool configurations
        base_command: The base command to wrap (e.g., "bash /tmp/run_launcher.sh")

    Returns:
        Complete command chain string (``base_command`` unchanged when no
        tool provides a ``cmd``)
    """
    if not tools_config:
        return base_command

    # Build from the inside out: the last tool wraps the base command first.
    cmd_chain = base_command
    for tool in reversed(tools_config):
        tool_cmd = tool.get("cmd")
        if not tool_cmd:
            continue
        tool_cmd = tool_cmd.replace("../scripts/common/", "scripts/common/")

        # ``env_vars`` may be absent or explicitly null in the configuration.
        tool_env_vars = tool.get("env_vars") or {}
        if "OUTPUT_FILE" in tool_env_vars:
            output_file = tool_env_vars["OUTPUT_FILE"]
            # Prepend OUTPUT_FILE=value to this tool's command only
            cmd_chain = f"OUTPUT_FILE={output_file} {tool_cmd} {cmd_chain}"
        else:
            cmd_chain = f"{tool_cmd} {cmd_chain}"

    return cmd_chain
from multiple tools, wrapping the base command. + + Tools are chained from outermost to innermost: + tool_n wraps tool_2 wraps tool_1 wraps base_command + + Each tool's OUTPUT_FILE env var is set inline to avoid conflicts. + + Args: + tools_config: List of enriched tool configurations + base_command: The base command to wrap (e.g., "bash /tmp/run_launcher.sh") + + Returns: + Complete command chain string + """ + if not tools_config: + return base_command + + # Filter tools that have a cmd field + tools_with_cmd = [t for t in tools_config if t.get("cmd")] + + if not tools_with_cmd: + return base_command + + # Build command chain from inside out (reverse order) + cmd_chain = base_command + for tool in reversed(tools_with_cmd): + tool_cmd = tool["cmd"].replace("../scripts/common/", "scripts/common/") + + # Set OUTPUT_FILE inline for this specific tool (if defined in tool's env_vars) + tool_env_vars = tool.get("env_vars", {}) + if "OUTPUT_FILE" in tool_env_vars: + output_file = tool_env_vars["OUTPUT_FILE"] + # Prepend OUTPUT_FILE=value to this tool's command only + cmd_chain = f"OUTPUT_FILE={output_file} {tool_cmd} {cmd_chain}" + else: + cmd_chain = f"{tool_cmd} {cmd_chain}" + + return cmd_chain + + def _enrich_tools_with_cmd(self, tools: List[Dict]) -> List[Dict]: + """ + Enrich tools configuration with cmd field from tools.json. + + This is needed for K8s template to generate the correct encapsulation command. 
+ + Args: + tools: List of tool configurations (may only have 'name' field) + + Returns: + Enriched list with 'cmd' field added from tools.json + """ + if not tools: + return tools + + # Load tools.json + tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + if not tools_json_path.exists(): + self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + return tools + + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + + enriched_tools = [] + for tool in tools: + tool_name = tool.get("name") + if not tool_name: + enriched_tools.append(tool) + continue + + # Get tool definition from tools.json + if tool_name not in tools_definitions.get("tools", {}): + self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + enriched_tools.append(tool) + continue + + tool_def = tools_definitions["tools"][tool_name] + + # Create enriched tool config with cmd + enriched_tool = tool.copy() + if "cmd" not in enriched_tool and "cmd" in tool_def: + enriched_tool["cmd"] = tool_def["cmd"] + + # Also copy env_vars if present + if "env_vars" not in enriched_tool and "env_vars" in tool_def: + enriched_tool["env_vars"] = tool_def["env_vars"] + + enriched_tools.append(enriched_tool) + + return enriched_tools + + def _prepare_env_vars(self, model_info: Dict) -> Dict[str, str]: + """ + Prepare environment variables from multiple sources. + + Merges env vars from: + 1. Base additional_context + 2. Data provider + 3. Tools configuration + + Args: + model_info: Model configuration + + Returns: + Merged environment variables dict + """ + env_vars = {} + + # 1. Base environment variables from additional_context + base_env = self.config.additional_context.get("env_vars", {}) + env_vars.update(base_env) + + # 1b. 
Critical ROCm environment variable (if not already set) + # HSA_NO_SCRATCH_RECLAIM=1 required for AMD MI300X and newer GPUs + # Prevents performance degradation and NCCL errors + if "HSA_NO_SCRATCH_RECLAIM" not in env_vars: + env_vars["HSA_NO_SCRATCH_RECLAIM"] = "1" + + # 2. Data provider environment variables + data_config = self._prepare_data_config(model_info) + if data_config: + if "env_vars" in data_config: + # Exclude MAD_DATAHOME from data provider's env vars (we set it explicitly below for K8s) + data_provider_env = {k: v for k, v in data_config["env_vars"].items() if k != "MAD_DATAHOME"} + env_vars.update(data_provider_env) + # Always set MAD_DATAHOME for K8s (PVC mount point /data, not /data_dlm_0) + if "datahome" in data_config: + env_vars["MAD_DATAHOME"] = data_config["datahome"] + + # 3. Tools configuration environment variables + # Check both additional_context and manifest.context for tools + tools_config = self.config.additional_context.get("tools", []) + if not tools_config and "context" in self.manifest: + tools_config = self.manifest["context"].get("tools", []) + + for tool in tools_config: + if "env_vars" in tool: + # Skip OUTPUT_FILE as it's set inline in command chain to avoid conflicts + tool_env_vars = {k: v for k, v in tool["env_vars"].items() if k != "OUTPUT_FILE"} + env_vars.update(tool_env_vars) + + return env_vars + + def _prepare_data_config(self, model_info: Dict) -> Optional[Dict]: + """ + Prepare data provider configuration for K8s pod. 
+ + Args: + model_info: Model configuration + + Returns: + Data configuration dict or None + """ + if "data" not in model_info or not model_info["data"]: + return None + + # Initialize data provider if needed + if not self.data: + try: + # Create minimal context for data provider + # We only need the data.json file to be present + data_json_file = "data.json" + if os.path.exists(data_json_file): + # Import Context and create minimal instance + # Data provider needs this to function + self.context_for_data = type('obj', (object,), { + 'ctx': {}, + 'sh': lambda cmd: os.popen(cmd).read().strip() + })() + self.data = Data( + self.context_for_data, + filename=data_json_file, + force_mirrorlocal=False + ) + else: + self.console.print("[yellow]Warning: data.json not found, data provider unavailable[/yellow]") + return None + except Exception as e: + self.console.print(f"[yellow]Warning: Could not initialize data provider: {e}[/yellow]") + return None + + try: + # Get data environment variables + data_env = self.data.get_env(model_info["data"]) + + # Find data provider for this data + dp = self.data.find_dataprovider(model_info["data"]) + if not dp: + self.console.print(f"[yellow]Warning: Data provider not found for {model_info['data']}[/yellow]") + return None + + # Get provider type and source path + provider_type = dp.provider_type if hasattr(dp, 'provider_type') else "local" + source_url = dp.config.get("path", "") if hasattr(dp, 'config') else "" + + # K8s best practice: Always use /data (PVC mount point) + # PVC provides persistent, shared storage across all pods/nodes + # Separation of storage (PVC) from compute (pods) is K8s standard + # FORCE datahome to /data for K8s (override data provider's default /data_dlm_0) + + # Filter out MAD_DATAHOME from data provider env vars (will be set explicitly below) + filtered_data_env = {k: v for k, v in (data_env or {}).items() if k != "MAD_DATAHOME"} + # Add MAD_DATAHOME with correct K8s value + 
filtered_data_env["MAD_DATAHOME"] = "/data" + + return { + "data_name": model_info["data"], + "env_vars": filtered_data_env, + "provider_type": provider_type, + "source_url": source_url, + "datahome": "/data", # Always use PVC mount point for K8s + } + except Exception as e: + self.console.print(f"[yellow]Warning: Could not prepare data config: {e}[/yellow]") + return None diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 927ec878..212fe941 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -9,11 +9,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -import json -import os -import subprocess import time -from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional @@ -33,56 +29,29 @@ except ImportError: YAML_AVAILABLE = False -from jinja2 import Template - from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env -from .common import ( - configure_multi_node_profiling, - normalize_launcher, -) from .config_loader import ConfigLoader, apply_deployment_config +from .k8s_names import ( + sanitize_k8s_container_name, + sanitize_k8s_label_value, + sanitize_k8s_object_name, +) +from .k8s_pvc import KubernetesPVCMixin +from .k8s_results import KubernetesResultsMixin +from .k8s_scripts import KubernetesScriptsMixin from .k8s_secrets import ( - CONFIGMAP_MAX_BYTES, SECRETS_STRATEGY_FROM_LOCAL, create_or_update_secrets_from_credentials, delete_job_secrets_if_exist, - estimate_configmap_payload_bytes, merge_secrets_config, - resolve_image_pull_secret_refs, - resolve_runtime_secret_name, - build_registry_secret_data, -) -from madengine.core.dataprovider import Data -from madengine.core.context import Context -from madengine.core.errors import ConfigurationError -from madengine.utils.gpu_config import resolve_runtime_gpus -from madengine.utils.path_utils import 
get_madengine_root, scripts_base_dir_from -from madengine.utils.run_details import flatten_tags_in_place, get_build_number, get_pipeline - -try: - from madengine.reporting.update_perf_csv import update_perf_csv - from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv - REPORTING_AVAILABLE = True -except ImportError: - REPORTING_AVAILABLE = False - - -from .k8s_names import ( - sanitize_k8s_container_name, - sanitize_k8s_label_value, - sanitize_k8s_object_name, ) +from .k8s_template_context import KubernetesTemplateContextMixin from .kubernetes_launcher_mixin import KubernetesLauncherMixin def _pod_job_name_label_selector(deployment_id: str) -> str: """Selector for the ``job-name`` pod label; value must be a valid ≤63-char label value.""" return f"job-name={sanitize_k8s_label_value(deployment_id)}" -from .primus_backend import ( - infer_primus_backend_from_model_name, - infer_primus_examples_overlay_subdirs, - merged_primus_config, -) def match_pvc_subdir_to_k8s_pod( @@ -123,7 +92,14 @@ def assign_pvc_subdirs_to_pods(pod_dirs: List[str], pod_names: List[str]) -> Dic return mapping -class KubernetesDeployment(KubernetesLauncherMixin, BaseDeployment): +class KubernetesDeployment( + KubernetesLauncherMixin, + KubernetesResultsMixin, + KubernetesTemplateContextMixin, + KubernetesScriptsMixin, + KubernetesPVCMixin, + BaseDeployment, +): """ Kubernetes cluster deployment using Python client library. @@ -342,1183 +318,6 @@ def prepare(self) -> bool: traceback.print_exc() return False - def gather_system_env_details( - self, pre_scripts: List[Dict], model_name: str - ) -> None: - """ - Gather system environment details by adding rocEnvTool to pre-scripts. - - This ensures K8s deployment collects the same system info as local execution. 
- - Args: - pre_scripts: List of pre-script configurations - model_name: The model name (used for output file naming) - """ - # Add rocEnvTool pre-script with model-specific output name - pre_env_details = { - "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", - "args": model_name.replace("/", "_") + "_env" - } - pre_scripts.append(pre_env_details) - self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]") - - def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: - """ - Add tool pre/post scripts to execution lists (similar to local execution). - - Extracts pre_scripts and post_scripts from tools.json definitions and adds them - to the pre_scripts and post_scripts lists for execution in K8s pods. - - Args: - pre_scripts: List to append tool pre-scripts to - post_scripts: List to append tool post-scripts to - """ - tools_config = self._get_tools_config() - if not tools_config: - return - - # Load tools.json to get pre/post script definitions - tools_json_path = get_madengine_root() / "scripts" / "common" / "tools.json" - if not tools_json_path.exists(): - return - - with open(tools_json_path, "r") as f: - tools_definitions = json.load(f) - - # Add pre/post scripts from each configured tool - for tool in tools_config: - tool_name = tool.get("name") - if not tool_name or tool_name not in tools_definitions.get("tools", {}): - continue - - tool_def = tools_definitions["tools"][tool_name] - - # Add pre-scripts (at beginning, like local execution) - if "pre_scripts" in tool_def: - pre_scripts[:0] = tool_def["pre_scripts"] - - # Add post-scripts (at end, like local execution) - if "post_scripts" in tool_def: - post_scripts.extend(tool_def["post_scripts"]) - - def _load_common_scripts(self, script_list: List[Dict]) -> Dict[str, str]: - """ - Load common script contents from madengine package for embedding in ConfigMap. 
- - Since madengine is not installed in model Docker images, we need to embed - the common scripts (pre_scripts, post_scripts, and tool wrapper scripts) in the ConfigMap. - - Args: - script_list: List of script configurations with 'path' field - - Returns: - Dict mapping relative script paths to their contents - """ - import os - script_contents = {} - madengine_root = get_madengine_root() - - for script_config in script_list: - script_path = script_config.get("path", "") - if not script_path: - continue - - # Convert to absolute path from madengine root - abs_script_path = madengine_root / script_path - - if abs_script_path.exists() and abs_script_path.is_file(): - with open(abs_script_path, "r") as f: - script_contents[script_path] = f.read() - self.console.print(f"[dim]Loaded common script: {script_path}[/dim]") - - # If it's run_rocenv_tool.sh, also load the entire rocEnvTool directory - if "run_rocenv_tool.sh" in script_path: - rocenv_dir = abs_script_path.parent / "rocEnvTool" - if rocenv_dir.exists() and rocenv_dir.is_dir(): - # Load all Python files - for py_file in rocenv_dir.glob("*.py"): - rel_path = f"scripts/common/pre_scripts/rocEnvTool/{py_file.name}" - with open(py_file, "r") as f: - script_contents[rel_path] = f.read() - self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") - - # Load all JSON files (e.g., env_tags.json) - for json_file in rocenv_dir.glob("*.json"): - rel_path = f"scripts/common/pre_scripts/rocEnvTool/{json_file.name}" - with open(json_file, "r") as f: - script_contents[rel_path] = f.read() - self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") - else: - self.console.print(f"[yellow]Warning: Script not found: {script_path} (at {abs_script_path})[/yellow]") - - # Load tool wrapper scripts if tools are configured - tools_config = self._get_tools_config() - if tools_config: - self._load_tool_wrapper_scripts(script_contents, tools_config, madengine_root) - - return script_contents - - def 
_load_tool_wrapper_scripts(self, script_contents: Dict[str, str], - tools_config: List[Dict], madengine_root: Path) -> None: - """ - Load tool wrapper scripts and tools.json for K8s ConfigMap. - - This enables profiling tools like rocprof to work in K8s deployments. - - Args: - script_contents: Dict to populate with script contents - tools_config: List of tool configurations from manifest - madengine_root: Path to madengine package root - """ - # Load tools.json first - tools_json_path = madengine_root / "scripts" / "common" / "tools.json" - if tools_json_path.exists(): - with open(tools_json_path, "r") as f: - tools_definitions = json.load(f) - script_contents["scripts/common/tools.json"] = json.dumps(tools_definitions, indent=2) - self.console.print(f"[dim]Loaded tools.json[/dim]") - else: - self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") - return - - # Extract and load wrapper scripts referenced in tool commands - for tool in tools_config: - tool_name = tool.get("name") - if not tool_name: - continue - - # Get tool definition from tools.json - if tool_name not in tools_definitions.get("tools", {}): - self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") - continue - - tool_def = tools_definitions["tools"][tool_name] - - # Extract cmd - could be from tool config override or tool definition - cmd = tool.get("cmd", tool_def.get("cmd", "")) - - # Check if cmd references a script in scripts/common/tools/ - if "scripts/common/tools/" in cmd: - # Parse script path from command (e.g., "bash ../scripts/common/tools/rocprof_wrapper.sh --runtime-trace") - # or "python3 ../scripts/common/tools/gpu_info_profiler.py" - # Extract the path portion - parts = cmd.split() - for part in parts: - if "scripts/common/tools/" in part: - # Remove ../ prefix if present - script_rel_path = part.replace("../", "") - abs_script_path = madengine_root / script_rel_path - - if abs_script_path.exists() and 
abs_script_path.is_file(): - with open(abs_script_path, "r") as f: - script_contents[script_rel_path] = f.read() - self.console.print(f"[dim]Loaded tool script: {script_rel_path}[/dim]") - - # If it's a Python script, also load utility modules it might depend on - if script_rel_path.endswith('.py'): - tools_dir = abs_script_path.parent - # Load common utility modules that profiling tools depend on - utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] - for util_file in utility_modules: - util_path = tools_dir / util_file - if util_path.exists(): - util_rel_path = f"scripts/common/tools/{util_file}" - if util_rel_path not in script_contents: - with open(util_path, "r") as f: - script_contents[util_rel_path] = f.read() - self.console.print(f"[dim]Loaded tool utility module: {util_rel_path}[/dim]") - else: - self.console.print(f"[yellow]Warning: Tool script not found: {script_rel_path} (at {abs_script_path})[/yellow]") - break - - # Also load any tool-specific pre_scripts and post_scripts - for script_config in tool_def.get("pre_scripts", []): - script_path = script_config.get("path", "") - if script_path and script_path not in script_contents: - abs_script_path = madengine_root / script_path - if abs_script_path.exists(): - with open(abs_script_path, "r") as f: - script_contents[script_path] = f.read() - self.console.print(f"[dim]Loaded tool pre-script: {script_path}[/dim]") - - for script_config in tool_def.get("post_scripts", []): - script_path = script_config.get("path", "") - if script_path and script_path not in script_contents: - abs_script_path = madengine_root / script_path - if abs_script_path.exists(): - with open(abs_script_path, "r") as f: - script_contents[script_path] = f.read() - self.console.print(f"[dim]Loaded tool post-script: {script_path}[/dim]") - - # NEW: Scan pre-scripts for dependencies on scripts/common/tools/ files - # This handles cases like gpu_info_vram_profiler where the pre-script - # calls python3 
scripts/common/tools/gpu_info_profiler.py but the tool - # definition has an empty cmd field - for script_config in tool_def.get("pre_scripts", []): - script_path = script_config.get("path", "") - if script_path: - abs_script_path = madengine_root / script_path - if abs_script_path.exists(): - # Read the pre-script to find any tool script references - with open(abs_script_path, "r") as f: - script_content = f.read() - # Look for references to scripts/common/tools/ in the pre-script - import re - # Use non-capturing group (?:...) to avoid capturing just the ../ part - tool_refs = re.findall(r'(?:\.\./)?scripts/common/tools/[\w_]+\.py', script_content) - for tool_ref in tool_refs: - # Clean up the path - tool_script_path = tool_ref.strip('"\'').replace("../", "") - abs_tool_path = madengine_root / tool_script_path - - if abs_tool_path.exists() and tool_script_path not in script_contents: - with open(abs_tool_path, "r") as tf: - script_contents[tool_script_path] = tf.read() - self.console.print(f"[dim]Loaded tool dependency: {tool_script_path}[/dim]") - - # Also load utility modules for this Python script - if tool_script_path.endswith('.py'): - tools_dir = abs_tool_path.parent - utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] - for util_file in utility_modules: - util_path = tools_dir / util_file - if util_path.exists(): - util_rel_path = f"scripts/common/tools/{util_file}" - if util_rel_path not in script_contents: - with open(util_path, "r") as uf: - script_contents[util_rel_path] = uf.read() - self.console.print(f"[dim]Loaded utility module (from dependency): {util_rel_path}[/dim]") - - def _bundle_primus_k8s_examples_overlay( - self, model_scripts_contents: Dict[str, str], model_name: str = "" - ) -> None: - """ - Add Primus experiment files from ``scripts/Primus`` into ``model_scripts_contents`` - using ConfigMap keys under ``Primus/...`` (not ``scripts/Primus/...``). 
- - The init container writes paths like ``/workspace/Primus/examples/...``, matching - ``PRIMUS_ROOT=/workspace/Primus`` in the Primus Dockerfile. The Job volume hides - image layers under ``/workspace``, so this bundle is what makes K8s runs work. - - Always includes when present: - - - ``requirements.txt`` (repo root; ``pip install -r`` from ``run_pretrain.sh``) - - ``examples/scripts/`` (``prepare_experiment.py``, NCCL helper shells, etc.) - - ``examples/run_pretrain.sh`` - - The backend subtree from ``distributed.primus.config_path`` (torchtitan, - megatron, MaxText, …). - """ - manifest = getattr(self, "manifest", None) - primus_cfg = merged_primus_config( - manifest if isinstance(manifest, dict) else None, - self.config.additional_context, - ) - config_path = primus_cfg.get("config_path") or "" - backend_hint = (primus_cfg.get("backend") or "").strip() - subdirs = infer_primus_examples_overlay_subdirs( - config_path, - backend_hint=backend_hint, - model_name=model_name or "", - ) - cwd = Path.cwd() - primus_repo = cwd / "scripts" / "Primus" - if not primus_repo.is_dir(): - self.console.print( - f"[yellow]Primus K8s: {primus_repo} not found — skipping Primus ConfigMap bundle.[/yellow]" - ) - return - - def _add_primus_file(host_file: Path) -> bool: - try: - content = host_file.read_text(encoding="utf-8", errors="strict") - except (UnicodeDecodeError, OSError): - self.console.print( - f"[dim]Skipping non-text Primus file for K8s bundle: {host_file}[/dim]" - ) - return False - rel_under_repo = host_file.relative_to(primus_repo) - key = str(Path("Primus") / rel_under_repo) - model_scripts_contents[key] = content - return True - - req = primus_repo / "requirements.txt" - if req.is_file(): - if _add_primus_file(req): - self.console.print("[dim]Primus K8s: bundled Primus/requirements.txt[/dim]") - - ex_scripts = primus_repo / "examples" / "scripts" - if ex_scripts.is_dir(): - n_scripts = 0 - for f in ex_scripts.rglob("*"): - if not f.is_file(): - continue - if 
_add_primus_file(f): - n_scripts += 1 - self.console.print( - f"[dim]Primus K8s: bundled Primus/examples/scripts for ConfigMap ({n_scripts} files)[/dim]" - ) - - run_pre = primus_repo / "examples" / "run_pretrain.sh" - if run_pre.is_file(): - if _add_primus_file(run_pre): - self.console.print("[dim]Primus K8s: bundled Primus/examples/run_pretrain.sh[/dim]") - - for sub in subdirs: - base = primus_repo / "examples" / sub - if not base.is_dir(): - self.console.print( - f"[yellow]Primus K8s: scripts/Primus/examples/{sub} not found under {cwd} — " - "skipping that subtree.[/yellow]" - ) - continue - n = 0 - for f in base.rglob("*"): - if not f.is_file(): - continue - if _add_primus_file(f): - n += 1 - self.console.print( - f"[dim]Primus K8s: bundled Primus/examples/{sub} for ConfigMap ({n} files)[/dim]" - ) - - def _prepare_template_context( - self, model_info: Dict, image_info: Dict - ) -> Dict[str, Any]: - """ - Prepare context dictionary for Jinja2 template rendering. - - Args: - model_info: Model configuration from build_manifest.json - image_info: Image information from build_manifest.json - - Returns: - Context dictionary with all template variables - """ - # Use hierarchical GPU resolution: runtime > deployment > model > default - additional_context = self.config.additional_context.copy() - additional_context["k8s"] = self.k8s_config - gpu_count = resolve_runtime_gpus(model_info, additional_context) - model_name = model_info["name"] - - # Load manifest and credential content for ConfigMap - with open(self.config.manifest_file, "r") as f: - manifest_content = f.read() - - credential_content = "{}" - credential_path = Path("credential.json") - if credential_path.exists(): - with open(credential_path, "r") as f: - credential_content = f.read() - - # Load data.json content if exists - data_json_content = None - data_path = Path("data.json") - if data_path.exists(): - with open(data_path, "r") as f: - data_json_content = f.read() - self.console.print(f"[dim]Loaded 
data.json[/dim]") - - # Load model scripts directory content (entire folder, not just one file) - # This matches local execution which mounts the entire MODEL_DIR/scripts folder - model_script_path = model_info.get("scripts") # e.g., "scripts/dummy/run_data_minio.sh" - model_script_dir = None - model_script_filename = None - model_scripts_contents = {} # Store all scripts in the directory - - if model_script_path: - script_file = Path(model_script_path) - # Extract directory and filename - model_script_dir = str(script_file.parent) # e.g., "scripts/dummy" - model_script_filename = script_file.name # e.g., "run_data_minio.sh" - - # Bundle entire scripts/ directory recursively for reliability across - # different model types (vllm, sglang, etc.) with varying file types and subdirs - scripts_dir_path = Path(model_script_dir) - if scripts_dir_path.exists() and scripts_dir_path.is_dir(): - cwd = Path.cwd() - for f in scripts_dir_path.rglob("*"): - if not f.is_file(): - continue - try: - content = f.read_text(encoding="utf-8", errors="strict") - except (UnicodeDecodeError, OSError): - # Skip binary or unreadable files (ConfigMap is text-only) - self.console.print( - f"[dim]Skipping non-text file: {f.relative_to(scripts_dir_path)}[/dim]" - ) - continue - relative_path = ( - str(f.relative_to(cwd)) if f.is_absolute() else str(f) - ) - model_scripts_contents[relative_path] = content - self.console.print( - f"[dim]Loaded {len(model_scripts_contents)} file(s) from {model_script_dir}[/dim]" - ) - elif script_file.exists(): - # Fallback: load single file if directory doesn't exist - with open(script_file, "r") as f: - model_scripts_contents[model_script_path] = f.read() - self.console.print(f"[dim]Loaded single script: {model_script_path}[/dim]") - else: - self.console.print(f"[yellow]Warning: Script not found: {model_script_path}[/yellow]") - - # Load K8s tools configuration - k8s_tools_config = self._load_k8s_tools() - - # Prepare data configuration first - data_config = 
self._prepare_data_config(model_info) - - # Store for use in deploy() method - self._data_config = data_config - - # K8s best practice: Auto-create shared data PVC if needed - # K8s philosophy: Separate compute (pods) from storage (PVC) - if data_config and not self.k8s_config.get("data_pvc"): - # PVC will be auto-created during deployment - # Use consistent name for reusability across training runs - self.console.print( - f"[cyan]📦 Data provider detected: Will auto-create shared data PVC[/cyan]" - ) - self.console.print( - f"[dim] PVC name: madengine-shared-data (reusable across runs)[/dim]" - ) - self.console.print( - f"[dim] Access mode: RWO for single-node, RWX for multi-node (auto-selected)[/dim]" - ) - self.console.print( - f"[dim] To use existing PVC, add 'data_pvc' to your K8s config[/dim]" - ) - # Set PVC name now so templates are rendered with correct value - self.k8s_config["data_pvc"] = "madengine-shared-data" - - # Determine data provider script if model needs data - data_provider_script = None - data_provider_script_content = None - if data_config: - provider_type = data_config.get("provider_type", "local") - if provider_type in k8s_tools_config.get("data_providers", {}): - data_provider_script = k8s_tools_config["data_providers"][provider_type] - - # Load K8s data provider script content - k8s_script_path = get_madengine_root() / data_provider_script["script"] - if k8s_script_path.exists(): - with open(k8s_script_path, "r") as f: - data_provider_script_content = f.read() - self.console.print(f"[dim]Loaded K8s data provider: {data_provider_script['script']}[/dim]") - else: - self.console.print(f"[yellow]Warning: K8s script not found: {k8s_script_path}[/yellow]") - - # Get launcher configuration from manifest's deployment_config or additional_context - deployment_config = self.manifest.get("deployment_config", {}) - distributed_config = deployment_config.get("distributed", {}) - launcher_config = self.config.additional_context.get("launcher", {}) - - # 
Merge manifest and runtime launcher config (runtime overrides) - # Use explicit None checking to handle 0 values correctly - launcher_type = ( - launcher_config.get("type") - if launcher_config.get("type") is not None - else distributed_config.get("launcher") - ) - - nnodes = ( - launcher_config.get("nnodes") - if launcher_config.get("nnodes") is not None - else distributed_config.get("nnodes", 1) - ) - - # Store for use in deploy() method - self._nnodes = nnodes - - nproc_per_node = ( - launcher_config.get("nproc_per_node") - if launcher_config.get("nproc_per_node") is not None - else distributed_config.get("nproc_per_node") - if distributed_config.get("nproc_per_node") is not None - else int(model_info.get("n_gpus", 1)) - ) - - master_port = launcher_config.get("master_port", 29500) - - # Validate configuration - if launcher_type == "torchrun": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring torchrun: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "deepspeed": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring DeepSpeed: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "torchtitan": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. 
Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring TorchTitan: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "vllm": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring vLLM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "sglang": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring SGLang: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "megatron": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring Megatron-LM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - - elif launcher_type == "primus": - if not isinstance(nnodes, int) or nnodes < 1: - raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") - if not isinstance(nproc_per_node, int) or nproc_per_node < 1: - raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. 
Must be positive integer >= 1") - - self.console.print(f"[cyan]Configuring Primus: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") - self._bundle_primus_k8s_examples_overlay(model_scripts_contents, model_name) - - # Determine if we need multi-node setup - create_headless_service = False - launcher_command = None - - if launcher_type == "torchrun": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node detected: Creating headless service for pod discovery[/dim]") - - # Generate torchrun launcher command - launcher_command = self._generate_torchrun_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh") - ) - - elif launcher_type == "deepspeed": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node DeepSpeed: Creating headless service for pod discovery[/dim]") - - model_script = model_info.get("scripts", "run.sh") - - # Check if script is a bash script - if so, execute it directly - # as it will handle the launcher internally - if model_script.endswith('.sh'): - self.console.print(f"[dim]Detected bash script ({model_script}), will execute directly[/dim]") - launcher_command = self._generate_bash_script_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_script - ) - else: - # Python script - use DeepSpeed launcher - launcher_command = self._generate_deepspeed_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_script - ) - - elif launcher_type == "torchtitan": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node TorchTitan: Creating headless service for pod discovery[/dim]") - - # Generate TorchTitan launcher command - launcher_command = self._generate_torchtitan_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - 
model_script=model_info.get("scripts", "run.sh") - ) - - elif launcher_type == "vllm": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node vLLM: Creating headless service for Ray cluster[/dim]") - - # Generate vLLM launcher command (pass model args so run.sh gets --model_repo etc.) - launcher_command = self._generate_vllm_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh"), - model_args=model_info.get("args", ""), - ) - - elif launcher_type == "sglang": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node SGLang: Creating headless service for Ray cluster[/dim]") - - # Generate SGLang launcher command (pass model args so run.sh gets CLI args) - launcher_command = self._generate_sglang_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh"), - model_args=model_info.get("args", ""), - ) - - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": - if nnodes < 3: - raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " - f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" - ) - - # Always create headless service for disaggregated architecture - create_headless_service = True - self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") - self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") - - # Generate SGLang Disaggregated launcher command - launcher_command = self._generate_sglang_disagg_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh") - ) - - elif launcher_type == "megatron": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node Megatron-LM: Creating 
headless service for pod discovery[/dim]") - - # Generate Megatron-LM launcher command - launcher_command = self._generate_megatron_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh") - ) - - elif launcher_type == "primus": - if nnodes > 1: - create_headless_service = True - self.console.print(f"[dim]Multi-node Primus: Creating headless service for pod discovery[/dim]") - - # Generate Primus launcher command (env-only: PRIMUS_CONFIG_PATH, PRIMUS_CLI_EXTRA) - launcher_command = self._generate_primus_command( - nnodes=nnodes, - nproc_per_node=nproc_per_node, - master_port=master_port, - model_script=model_info.get("scripts", "run.sh"), - model_args=model_info.get("args", "") or "", - model_name=model_info.get("name", "") or "", - ) - primus_cfg = merged_primus_config(self.manifest, self.config.additional_context) - backend_hint = (primus_cfg.get("backend") or "").strip().lower() - inferred_backend = infer_primus_backend_from_model_name( - model_info.get("name", "") or "" - ) - config_path_lower = (primus_cfg.get("config_path") or "").lower() - looks_maxtext = ( - backend_hint == "maxtext" - or inferred_backend == "MaxText" - or "maxtext" in config_path_lower - ) - if looks_maxtext and nnodes > 1: - self.console.print( - "[yellow]Warning: Primus MaxText multi-node may run in-container apt installs " - "(InfiniBand-related packages) inside run_pretrain.sh. 
Ensure your image or " - "cluster policy allows this, or use a pre-baked image.[/yellow]" - ) - - # Prepare pre/post scripts (similar to local execution) - pre_scripts = [] - post_scripts = [] - - # Get pre/post scripts from manifest context if available - if "context" in self.manifest: - if "pre_scripts" in self.manifest["context"]: - pre_scripts.extend(self.manifest["context"]["pre_scripts"]) - if "post_scripts" in self.manifest["context"]: - post_scripts.extend(self.manifest["context"]["post_scripts"]) - - # Add system environment collection (rocEnvTool) - same as local execution - # This is controlled by generate_sys_env_details flag (default: True) - generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) - if generate_sys_env_details: - self.gather_system_env_details(pre_scripts, model_info["name"]) - - # Add tool pre/post scripts to the execution lists (like local execution) - self._add_tool_scripts(pre_scripts, post_scripts) - - # Load pre/post script contents for ConfigMap (since madengine not installed in container) - pre_post_script_contents = self._load_common_scripts(pre_scripts + post_scripts) - - merged_sec = merge_secrets_config(self.k8s_config) - strategy = merged_sec.get("strategy", SECRETS_STRATEGY_FROM_LOCAL) - cred_path = Path("credential.json") - cred_exists = cred_path.exists() - - created_pull_preview: List[str] = [] - if cred_exists and strategy == SECRETS_STRATEGY_FROM_LOCAL: - try: - parsed = json.loads(cred_path.read_text(encoding="utf-8")) - if build_registry_secret_data(parsed): - created_pull_preview.append(f"{self.job_name}-registry-pull") - except (json.JSONDecodeError, OSError): - pass - - if strategy == SECRETS_STRATEGY_FROM_LOCAL: - include_credential_in_configmap = not cred_exists - else: - include_credential_in_configmap = False - - created_runtime_name: Optional[str] = ( - f"{self.job_name}-runtime" - if strategy == SECRETS_STRATEGY_FROM_LOCAL and cred_exists - else None - ) - 
runtime_credentials_secret_name = resolve_runtime_secret_name( - strategy, merged_sec, created_runtime_name - ) - - image_pull_secrets = resolve_image_pull_secret_refs( - strategy, merged_sec, created_pull_preview - ) - - ap_prof = self.k8s_config.get("allow_privileged_profiling") - if ap_prof is None: - privileged_profiling = bool(self._get_tools_config()) - else: - privileged_profiling = bool(ap_prof) - - _pytorch_native = frozenset( - {"torchrun", "deepspeed", "torchtitan", "megatron", "primus"} - ) - subdomain_val = ( - self.service_name - if nnodes > 1 and launcher_type in _pytorch_native - else None - ) - - # Build complete context - context = { - # Job metadata - "job_name": self.job_name, - "job_label": self.job_label, - "main_container_name": getattr( - self, "main_container_name", None - ) - or sanitize_k8s_container_name(self.job_name), - "namespace": self.namespace, - "model_name": model_name, - "model_label": sanitize_k8s_label_value(model_name), - # ConfigMap - "configmap_name": self.configmap_name, - "manifest_content": manifest_content, - "credential_content": credential_content, - "include_credential_in_configmap": include_credential_in_configmap, - "runtime_credentials_secret_name": runtime_credentials_secret_name, - "image_pull_secrets": image_pull_secrets, - "privileged_profiling": privileged_profiling, - "ttl_seconds_after_finished": self.k8s_config.get( - "ttl_seconds_after_finished" - ), - "data_json_content": data_json_content, - "model_scripts_contents": model_scripts_contents, # All scripts in directory - "model_script_path": model_script_path, - "model_script_dir": model_script_dir, - "model_script_filename": model_script_filename, - # K8s tools - "data_provider_script": data_provider_script, - "data_provider_script_content": data_provider_script_content, - # Image - "image": image_info["registry_image"], - "image_pull_policy": self.k8s_config.get("image_pull_policy", "Always"), - # Resources - "gpu_resource_name": self.gpu_resource_name, 
- "gpu_count": gpu_count, - "memory": self.k8s_config.get("memory", "128Gi"), - "memory_limit": self.k8s_config.get("memory_limit", "256Gi"), - "cpu": self.k8s_config.get("cpu", "32"), - "cpu_limit": self.k8s_config.get("cpu_limit", "64"), - # Job spec - "completions": nnodes, - "parallelism": nnodes, - "completion_mode": "Indexed" if nnodes > 1 else None, - "backoff_limit": self.k8s_config.get("backoff_limit", 3), - # Pod spec - "node_selector": self.k8s_config.get("node_selector", {}), - "tolerations": self.k8s_config.get("tolerations", []), - "host_ipc": nnodes > 1, # Enable for multi-node - "subdomain": subdomain_val, - # Execution - "gpu_visibility": ",".join(str(i) for i in range(gpu_count)), # e.g., "0" for 1 GPU, "0,1" for 2 GPUs - "gpu_architecture": self.manifest.get("context", {}).get( - "gpu_architecture", "gfx90a" - ), - "model_script": f"{model_info.get('scripts', 'run.sh')} {model_info.get('args', '')}".strip(), - "launcher_type": launcher_type, - "launcher_command": launcher_command, - "nnodes": nnodes, - "nproc_per_node": nproc_per_node, - "master_port": master_port, - "timeout": self.config.timeout, - # Environment - Merge base env vars with data/tools env vars - "env_vars": self._prepare_env_vars(model_info), - # Volumes - "results_pvc": f"{self.job_name}-results", # Always create a PVC for results - "pvc_name": f"{self.job_name}-results", # PVC name for template - "data_pvc": self.k8s_config.get("data_pvc"), - # Multi-node - "create_headless_service": create_headless_service, - "service_name": self.service_name, - "ports": [29500] if create_headless_service else [], - # Data provider configuration (already prepared above) - "data_config": data_config, - # Tools configuration - from manifest.context or additional_context - "tools_config": self._get_tools_config(), - # Tool command chains (pre-built for template) - "launcher_tool_chain": self._build_tool_command_chain( - self._get_tools_config(), "bash /tmp/run_launcher.sh" - ) if launcher_command 
else None, - "direct_script_tool_chain": self._build_tool_command_chain( - self._get_tools_config(), f"bash {model_info.get('scripts', 'run.sh')}" - ), - # Pre/Post scripts - includes rocEnvTool and any user-defined scripts - "pre_scripts": pre_scripts, - "post_scripts": post_scripts, - # Common script contents for ConfigMap (embedded since madengine not in container) - "common_script_contents": pre_post_script_contents, - # Multiple results file (e.g. perf_dummy.csv) - copied to PVC for K8s result collection - "multiple_results": model_info.get("multiple_results") or "", - } - - est = estimate_configmap_payload_bytes(context) - if est > CONFIGMAP_MAX_BYTES: - raise ConfigurationError( - f"ConfigMap payload would be ~{est} bytes; Kubernetes limit is ~1 MiB. " - "Reduce embedded scripts or use a smaller scripts directory." - ) - - return context - - def _get_tools_config(self) -> List[Dict]: - """ - Get tools configuration from manifest.context or additional_context. - - Prioritizes runtime additional_context, falls back to manifest.context. 
- - For multi-node runs: - - Checks rocprofv3 availability (required for MPI profiling) - - Upgrades "rocprof" to "rocprofv3" for multi-node compatibility - - Logs warnings if rocprofv3 not available - - Returns: - List of tool configurations (enriched with cmd from tools.json) - """ - # Cache the result to avoid repeated expensive checks and duplicate warnings - if hasattr(self, '_cached_tools_config'): - return self._cached_tools_config - - # Check runtime additional_context first (allows runtime override) - tools = self.config.additional_context.get("tools", []) - - # Fall back to manifest.context if no runtime tools - if not tools and "context" in self.manifest: - tools = self.manifest["context"].get("tools", []) - - # Apply multi-node profiling logic if applicable - distributed_config = self.config.additional_context.get("distributed", {}) - nnodes = distributed_config.get("nnodes", 1) - - if nnodes > 1 and tools: - # Configure multi-node profiling (handles rocprofv3 detection and tool upgrades) - # Create a simple logger wrapper for configure_multi_node_profiling - class ConsoleLogger: - def __init__(self, console): - self.console = console - def info(self, msg): - self.console.print(f"[cyan]{msg}[/cyan]") - def warning(self, msg): - self.console.print(f"[yellow]{msg}[/yellow]") - def debug(self, msg): - pass # Skip debug messages in console - - profiling_config = configure_multi_node_profiling( - nnodes=nnodes, - tools_config=tools, - logger=ConsoleLogger(self.console) - ) - - if profiling_config["enabled"]: - tools = profiling_config["tools"] - else: - # rocprofv3 not available - skip profiling for multi-node - tools = [] - - # Enrich tools with cmd from tools.json for K8s template usage - result = self._enrich_tools_with_cmd(tools) - - # Cache the result for subsequent calls - self._cached_tools_config = result - return result - - def _build_tool_command_chain(self, tools_config: List[Dict], base_command: str) -> str: - """ - Build a command chain from 
multiple tools, wrapping the base command. - - Tools are chained from outermost to innermost: - tool_n wraps tool_2 wraps tool_1 wraps base_command - - Each tool's OUTPUT_FILE env var is set inline to avoid conflicts. - - Args: - tools_config: List of enriched tool configurations - base_command: The base command to wrap (e.g., "bash /tmp/run_launcher.sh") - - Returns: - Complete command chain string - """ - if not tools_config: - return base_command - - # Filter tools that have a cmd field - tools_with_cmd = [t for t in tools_config if t.get("cmd")] - - if not tools_with_cmd: - return base_command - - # Build command chain from inside out (reverse order) - cmd_chain = base_command - for tool in reversed(tools_with_cmd): - tool_cmd = tool["cmd"].replace("../scripts/common/", "scripts/common/") - - # Set OUTPUT_FILE inline for this specific tool (if defined in tool's env_vars) - tool_env_vars = tool.get("env_vars", {}) - if "OUTPUT_FILE" in tool_env_vars: - output_file = tool_env_vars["OUTPUT_FILE"] - # Prepend OUTPUT_FILE=value to this tool's command only - cmd_chain = f"OUTPUT_FILE={output_file} {tool_cmd} {cmd_chain}" - else: - cmd_chain = f"{tool_cmd} {cmd_chain}" - - return cmd_chain - - def _enrich_tools_with_cmd(self, tools: List[Dict]) -> List[Dict]: - """ - Enrich tools configuration with cmd field from tools.json. - - This is needed for K8s template to generate the correct encapsulation command. 
- - Args: - tools: List of tool configurations (may only have 'name' field) - - Returns: - Enriched list with 'cmd' field added from tools.json - """ - if not tools: - return tools - - # Load tools.json - tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" - if not tools_json_path.exists(): - self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") - return tools - - with open(tools_json_path, "r") as f: - tools_definitions = json.load(f) - - enriched_tools = [] - for tool in tools: - tool_name = tool.get("name") - if not tool_name: - enriched_tools.append(tool) - continue - - # Get tool definition from tools.json - if tool_name not in tools_definitions.get("tools", {}): - self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") - enriched_tools.append(tool) - continue - - tool_def = tools_definitions["tools"][tool_name] - - # Create enriched tool config with cmd - enriched_tool = tool.copy() - if "cmd" not in enriched_tool and "cmd" in tool_def: - enriched_tool["cmd"] = tool_def["cmd"] - - # Also copy env_vars if present - if "env_vars" not in enriched_tool and "env_vars" in tool_def: - enriched_tool["env_vars"] = tool_def["env_vars"] - - enriched_tools.append(enriched_tool) - - return enriched_tools - - def _load_k8s_tools(self) -> Dict: - """ - Load K8s-specific tools configuration. 
- - Returns: - Dict with K8s tools configuration - """ - k8s_tools_file = Path(__file__).parent.parent / "scripts" / "k8s" / "tools.json" - - if k8s_tools_file.exists(): - try: - with open(k8s_tools_file, "r") as f: - return json.load(f) - except Exception as e: - self.console.print(f"[yellow]Warning: Failed to load K8s tools config: {e}[/yellow]") - return {} - else: - self.console.print(f"[yellow]Warning: K8s tools.json not found at {k8s_tools_file}[/yellow]") - return {} - - def _prepare_env_vars(self, model_info: Dict) -> Dict[str, str]: - """ - Prepare environment variables from multiple sources. - - Merges env vars from: - 1. Base additional_context - 2. Data provider - 3. Tools configuration - - Args: - model_info: Model configuration - - Returns: - Merged environment variables dict - """ - env_vars = {} - - # 1. Base environment variables from additional_context - base_env = self.config.additional_context.get("env_vars", {}) - env_vars.update(base_env) - - # 1b. Critical ROCm environment variable (if not already set) - # HSA_NO_SCRATCH_RECLAIM=1 required for AMD MI300X and newer GPUs - # Prevents performance degradation and NCCL errors - if "HSA_NO_SCRATCH_RECLAIM" not in env_vars: - env_vars["HSA_NO_SCRATCH_RECLAIM"] = "1" - - # 2. Data provider environment variables - data_config = self._prepare_data_config(model_info) - if data_config: - if "env_vars" in data_config: - # Exclude MAD_DATAHOME from data provider's env vars (we set it explicitly below for K8s) - data_provider_env = {k: v for k, v in data_config["env_vars"].items() if k != "MAD_DATAHOME"} - env_vars.update(data_provider_env) - # Always set MAD_DATAHOME for K8s (PVC mount point /data, not /data_dlm_0) - if "datahome" in data_config: - env_vars["MAD_DATAHOME"] = data_config["datahome"] - - # 3. 
Tools configuration environment variables - # Check both additional_context and manifest.context for tools - tools_config = self.config.additional_context.get("tools", []) - if not tools_config and "context" in self.manifest: - tools_config = self.manifest["context"].get("tools", []) - - for tool in tools_config: - if "env_vars" in tool: - # Skip OUTPUT_FILE as it's set inline in command chain to avoid conflicts - tool_env_vars = {k: v for k, v in tool["env_vars"].items() if k != "OUTPUT_FILE"} - env_vars.update(tool_env_vars) - - return env_vars - - def _prepare_data_config(self, model_info: Dict) -> Optional[Dict]: - """ - Prepare data provider configuration for K8s pod. - - Args: - model_info: Model configuration - - Returns: - Data configuration dict or None - """ - if "data" not in model_info or not model_info["data"]: - return None - - # Initialize data provider if needed - if not self.data: - try: - # Create minimal context for data provider - # We only need the data.json file to be present - import os - data_json_file = "data.json" - if os.path.exists(data_json_file): - # Import Context and create minimal instance - # Data provider needs this to function - self.context_for_data = type('obj', (object,), { - 'ctx': {}, - 'sh': lambda cmd: os.popen(cmd).read().strip() - })() - self.data = Data( - self.context_for_data, - filename=data_json_file, - force_mirrorlocal=False - ) - else: - self.console.print("[yellow]Warning: data.json not found, data provider unavailable[/yellow]") - return None - except Exception as e: - self.console.print(f"[yellow]Warning: Could not initialize data provider: {e}[/yellow]") - return None - - try: - # Get data environment variables - data_env = self.data.get_env(model_info["data"]) - - # Find data provider for this data - dp = self.data.find_dataprovider(model_info["data"]) - if not dp: - self.console.print(f"[yellow]Warning: Data provider not found for {model_info['data']}[/yellow]") - return None - - # Get provider type and 
source path - provider_type = dp.provider_type if hasattr(dp, 'provider_type') else "local" - source_url = dp.config.get("path", "") if hasattr(dp, 'config') else "" - - # K8s best practice: Always use /data (PVC mount point) - # PVC provides persistent, shared storage across all pods/nodes - # Separation of storage (PVC) from compute (pods) is K8s standard - # FORCE datahome to /data for K8s (override data provider's default /data_dlm_0) - - # Filter out MAD_DATAHOME from data provider env vars (will be set explicitly below) - filtered_data_env = {k: v for k, v in (data_env or {}).items() if k != "MAD_DATAHOME"} - # Add MAD_DATAHOME with correct K8s value - filtered_data_env["MAD_DATAHOME"] = "/data" - - return { - "data_name": model_info["data"], - "env_vars": filtered_data_env, - "provider_type": provider_type, - "source_url": source_url, - "datahome": "/data", # Always use PVC mount point for K8s - } - except Exception as e: - self.console.print(f"[yellow]Warning: Could not prepare data config: {e}[/yellow]") - return None - def _save_debug_manifests(self): """Save rendered manifests to disk for debugging.""" output_dir = Path(self.k8s_config.get("output_dir", "./k8s_manifests")) @@ -1538,213 +337,7 @@ def _save_debug_manifests(self): f"[yellow]Debug: Manifests saved to {output_dir}[/yellow]" ) - def _k8s_data_storage_class(self) -> Optional[str]: - """StorageClass for long-lived ``madengine-shared-data`` (NFS RWX recommended).""" - return ( - self.k8s_config.get("data_storage_class") - or self.k8s_config.get("nfs_storage_class") - or self.k8s_config.get("storage_class") - ) - - def _k8s_results_storage_class(self, nnodes: int) -> Optional[str]: - """ - Per-job results: local-path (RWO) for single-node, NFS (RWX) for multi-node. - - Falls back to ``storage_class`` for backward compatibility. 
- """ - if nnodes > 1: - return ( - self.k8s_config.get("multi_node_results_storage_class") - or self.k8s_config.get("nfs_storage_class") - or self.k8s_config.get("storage_class") - ) - return ( - self.k8s_config.get("single_node_results_storage_class") - or self.k8s_config.get("local_path_storage_class") - or self.k8s_config.get("storage_class") - ) - - def _create_results_pvc(self, nnodes: int = 1) -> str: - """ - Create a PersistentVolumeClaim for per-job results. - - Single-node uses ReadWriteOnce (typically local-path). Multi-node uses - ReadWriteMany (typically nfs-banff or other RWX class). - """ - pvc_name = f"{self.job_name}-results" - access_mode = "ReadWriteMany" if nnodes > 1 else "ReadWriteOnce" - storage_class = self._k8s_results_storage_class(nnodes) - - template_dir = Path(__file__).parent / "templates" / "kubernetes" - pvc_template = template_dir / "pvc.yaml.j2" - - with open(pvc_template, "r") as f: - pvc_template_str = f.read() - - template = Template(pvc_template_str) - self.console.print( - f"[dim] Results PVC: access={access_mode}, " - f"storageClass={storage_class or '(cluster default)'}[/dim]" - ) - if nnodes > 1 and not storage_class: - self.console.print( - "[yellow]⚠️ Multi-node: set k8s.nfs_storage_class or " - "multi_node_results_storage_class to an RWX class (e.g. 
nfs-banff).[/yellow]" - ) - pvc_yaml = template.render( - pvc_name=pvc_name, - namespace=self.namespace, - access_mode=access_mode, - storage_size=self.k8s_config.get("results_storage_size", "10Gi"), - storage_class=storage_class, - ) - - # Create PVC (retry on 409 "object is being deleted" until it is gone) - pvc_dict = yaml.safe_load(pvc_yaml) - max_create_retries = 6 - create_wait_seconds = 5 - for attempt in range(max_create_retries): - try: - self.core_v1.create_namespaced_persistent_volume_claim( - namespace=self.namespace, body=pvc_dict - ) - return pvc_name - except ApiException as e: - if e.status == 409 and e.body and "object is being deleted" in (e.body or ""): - if attempt < max_create_retries - 1: - self.console.print( - f"[dim]PVC still terminating, waiting {create_wait_seconds}s before retry ({attempt + 1}/{max_create_retries})[/dim]" - ) - time.sleep(create_wait_seconds) - else: - raise - else: - raise - - def _wait_for_pvc_deleted(self, pvc_name: str, max_wait: int = 90) -> None: - """Block until the PVC is fully removed (or timeout).""" - for i in range(max_wait): - try: - self.core_v1.read_namespaced_persistent_volume_claim( - name=pvc_name, namespace=self.namespace - ) - if i > 0 and i % 10 == 0: - self.console.print( - f"[dim]Waiting for PVC {pvc_name} to be removed... ({i}s)[/dim]" - ) - time.sleep(1) - except ApiException as e: - if e.status == 404: - return - raise - - def _create_or_get_data_pvc(self, nnodes: int = 1) -> str: - """ - Create or reuse ``madengine-shared-data`` for long-lived datasets (cache). - - Always uses ReadWriteMany + an NFS-style StorageClass so the same PVC - works for single- and multi-pod jobs. Use ``data_storage_class`` or - ``nfs_storage_class`` (e.g. nfs-banff), not local-path. - Args: - nnodes: Reserved for logging (shared-data access mode does not depend on it). 
- - Returns: - Name of the PVC (existing or newly created) - """ - pvc_name = "madengine-shared-data" - - if self.k8s_config.get("recreate_shared_data_pvc"): - try: - self.core_v1.delete_namespaced_persistent_volume_claim( - name=pvc_name, namespace=self.namespace - ) - self.console.print( - "[yellow]recreate_shared_data_pvc: deleted existing " - f"{pvc_name} (backup data first if needed)[/yellow]" - ) - self._wait_for_pvc_deleted(pvc_name) - except ApiException as e: - if e.status != 404: - raise - - try: - existing_pvc = self.core_v1.read_namespaced_persistent_volume_claim( - name=pvc_name, - namespace=self.namespace, - ) - self.console.print(f"[dim]✓ Using existing data PVC: {pvc_name}[/dim]") - - access_modes = existing_pvc.spec.access_modes or [] - if "ReadWriteMany" not in access_modes: - self.console.print( - f"[yellow]⚠️ Warning: {pvc_name} is not ReadWriteMany " - f"(modes: {access_modes}).[/yellow]" - ) - self.console.print( - "[yellow] For NFS-backed long-lived data, delete the PVC and re-run with " - "k8s.data_storage_class / nfs_storage_class set, or use " - "recreate_shared_data_pvc (after backup).[/yellow]" - ) - return pvc_name - - except ApiException as e: - if e.status != 404: - raise - - access_mode = "ReadWriteMany" - storage_class = self._k8s_data_storage_class() - self.console.print(f"[blue]Creating shared data PVC: {pvc_name}...[/blue]") - self.console.print( - f"[dim] Access mode: {access_mode}; storageClass={storage_class or '(cluster default)'}; " - f"nnodes={nnodes}[/dim]" - ) - if not storage_class: - self.console.print( - "[yellow]⚠️ Set k8s.nfs_storage_class or data_storage_class to an RWX class " - "(e.g. nfs-banff) for shared-data. 
Default SC may be local-path (RWO-only).[/yellow]" - ) - - template_dir = Path(__file__).parent / "templates" / "kubernetes" - pvc_template = template_dir / "pvc-data.yaml.j2" - - with open(pvc_template, "r") as f: - pvc_template_str = f.read() - - template = Template(pvc_template_str) - pvc_yaml = template.render( - pvc_name=pvc_name, - namespace=self.namespace, - access_mode=access_mode, - storage_size=self.k8s_config.get("data_storage_size", "100Gi"), - storage_class=storage_class, - ) - - pvc_dict = yaml.safe_load(pvc_yaml) - self.core_v1.create_namespaced_persistent_volume_claim( - namespace=self.namespace, body=pvc_dict - ) - - self.console.print("[dim]Waiting for PVC to be bound...[/dim]") - for _ in range(30): - try: - pvc = self.core_v1.read_namespaced_persistent_volume_claim( - name=pvc_name, namespace=self.namespace - ) - if pvc.status.phase == "Bound": - self.console.print("[green]✓ PVC bound successfully[/green]") - break - except ApiException: - pass - time.sleep(1) - else: - self.console.print( - f"[yellow]⚠️ Warning: PVC created but not bound yet. 
" - f"Check: kubectl describe pvc {pvc_name}[/yellow]" - ) - - return pvc_name - def _cleanup_existing_resources(self): """Delete existing Job, ConfigMap, and Service if they exist.""" # Delete existing Job @@ -1766,7 +359,7 @@ def _cleanup_existing_resources(self): ) except ApiException: pass - + # Delete existing ConfigMap try: self.core_v1.delete_namespaced_config_map( @@ -1777,7 +370,7 @@ def _cleanup_existing_resources(self): except ApiException as e: if e.status != 404: pass - + # Delete existing Service if hasattr(self, 'service_yaml') and self.service_yaml: try: @@ -1789,7 +382,7 @@ def _cleanup_existing_resources(self): except ApiException as e: if e.status != 404: pass - + # Delete existing collector pod (must be done before PVC to allow PVC deletion) collector_pod_name = f"collector-{self.job_name}" try: @@ -1804,7 +397,7 @@ def _cleanup_existing_resources(self): except ApiException as e: if e.status != 404: pass - + # Delete existing PVC pvc_name = f"{self.job_name}-results" try: @@ -1813,7 +406,7 @@ def _cleanup_existing_resources(self): namespace=self.namespace ) self.console.print(f"[dim]Deleted existing PVC: {pvc_name}[/dim]") - + # Wait for PVC to be fully deleted (not just marked for deletion) max_wait = 90 # Maximum 90 seconds (PV can take time to detach) wait_interval = 1 # Check every 1 second @@ -1835,7 +428,7 @@ def _cleanup_existing_resources(self): except ApiException as e: if e.status != 404: pass - + # Wait a moment for other resources to be deleted time.sleep(1) @@ -1844,13 +437,13 @@ def deploy(self) -> DeploymentResult: try: # Clean up any existing resources first self._cleanup_existing_resources() - + # 1. Create PVC for results storage self.console.print("[blue]Creating PVC for results storage...[/blue]") nnodes_deploy = getattr(self, "_nnodes", 1) pvc_name = self._create_results_pvc(nnodes=nnodes_deploy) self.console.print(f"[green]✓ Created PVC: {pvc_name}[/green]") - + # 1b. 
Create or reuse data PVC if data provider is configured and auto-creation was flagged if hasattr(self, '_data_config') and self._data_config: # Check if we set the PVC name during prepare (auto-creation case) @@ -1859,7 +452,7 @@ def deploy(self) -> DeploymentResult: # Auto-creation mode: create/reuse the PVC nnodes = getattr(self, '_nnodes', 1) self._create_or_get_data_pvc(nnodes=nnodes) - + # 2. Create Secrets from local credential.json (strategy: from_local_credentials) merged_sec = merge_secrets_config(self.k8s_config) strategy = merged_sec.get("strategy", SECRETS_STRATEGY_FROM_LOCAL) @@ -1928,18 +521,18 @@ def deploy(self) -> DeploymentResult: def monitor(self, deployment_id: str) -> DeploymentResult: """ Monitor Job status using Python API. - + If live_output is enabled, streams pod logs in real-time. Otherwise, polls status periodically. """ # Check if live output is requested live_output = self.config.additional_context.get("live_output", False) - + if live_output: return self._monitor_with_live_logs(deployment_id) else: return self._monitor_status_only(deployment_id) - + def _monitor_status_only(self, deployment_id: str) -> DeploymentResult: """Monitor Job status without streaming logs.""" try: @@ -1985,21 +578,21 @@ def _monitor_status_only(self, deployment_id: str) -> DeploymentResult: message=f"Job {deployment_id} not found", ) raise - + def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: """Monitor Job and stream logs in real-time.""" self.console.print(f"\n[cyan]═══ Streaming pod logs (--live-output) ═══[/cyan]\n") - + pod_name = None log_position = 0 - + while True: try: # Check job status job = self.batch_v1.read_namespaced_job_status( name=deployment_id, namespace=self.namespace ) - + # Get pod if we don't have it yet if not pod_name: pods = self.core_v1.list_namespaced_pod( @@ -2009,7 +602,7 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: if pods.items: pod_name = pods.items[0].metadata.name 
self.console.print(f"[dim]Following logs from pod: {pod_name}[/dim]\n") - + # Stream logs if we have a pod if pod_name: try: @@ -2019,7 +612,7 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: namespace=self.namespace, tail_lines=100 if log_position == 0 else None ) - + # Print new log lines and trigger artifact collection if logs: log_lines = logs.split('\n') @@ -2028,11 +621,11 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: if line.strip(): print(line) log_position = len(log_lines) - + except ApiException as e: if e.status != 400: # Ignore "container not ready" errors pass - + # Check if job completed if job.status.succeeded: self.console.print(f"\n[green]✓ Job {deployment_id} completed successfully[/green]\n") @@ -2041,7 +634,7 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: deployment_id=deployment_id, message=f"Job {deployment_id} completed successfully", ) - + if job.status.failed: self.console.print(f"\n[red]✗ Job {deployment_id} failed[/red]\n") # Print final logs @@ -2052,9 +645,9 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: deployment_id=deployment_id, message=f"Job {deployment_id} failed", ) - + time.sleep(2) # Poll every 2 seconds - + except ApiException as e: if e.status == 404: return DeploymentResult( @@ -2063,17 +656,17 @@ def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: message=f"Job {deployment_id} not found", ) raise - + def _print_pod_logs_on_failure(self, deployment_id: str): """Print pod logs when job fails (for debugging).""" try: self.console.print(f"\n[yellow]═══ Pod logs (last 50 lines) ═══[/yellow]\n") - + pods = self.core_v1.list_namespaced_pod( namespace=self.namespace, label_selector=_pod_job_name_label_selector(deployment_id), ) - + for pod in pods.items: pod_name = pod.metadata.name try: @@ -2132,1355 +725,6 @@ def _refresh_pod_until_terminal_phase( time.sleep(interval_seconds) return 
last - def collect_results(self, deployment_id: str) -> Dict[str, Any]: - """ - Enhanced results collection from K8s pods following vLLM multi-node best practices. - - For Data Parallel deployments (vLLM, SGLang): - - Each pod runs an independent replica - - Only pod-0 reports metrics to avoid duplicates - - Total throughput = pod-0 throughput × num_replicas - - Collects: - 1. Pod logs (``k8s_results///pod.log``) - 2. PVC mirror per pod (``...//pvc/``), mapped from ``/results//`` - 3. File artifacts via kubectl cp when pods are still running (keep-alive path) - - Returns: - Dict with logs, artifacts, and performance results - """ - results = { - "job_name": deployment_id, - "namespace": self.namespace, - "logs": [], - "artifacts": [], - "successful_runs": [], - "failed_runs": [], - } - - # Create results directory for this deployment - results_dir = Path(f"./k8s_results/{deployment_id}") - results_dir.mkdir(parents=True, exist_ok=True) - - self.console.print(f"[cyan]📦 Collecting results from K8s job: {deployment_id}[/cyan]") - - try: - # Get pods for this job - pods = self.core_v1.list_namespaced_pod( - namespace=self.namespace, - label_selector=_pod_job_name_label_selector(deployment_id), - ) - - # Get model info and build info from manifest - model_keys = list(self.manifest["built_models"].keys()) - if model_keys: - model_key = model_keys[0] - model_info = self.manifest["built_models"][model_key] - else: - model_info = {} - - # Get build info from built_images - image_keys = list(self.manifest.get("built_images", {}).keys()) - if image_keys: - image_key = image_keys[0] - build_info = self.manifest["built_images"][image_key] - else: - build_info = {} - - # Check if this is a multi-node distributed job - deployment_config = self.manifest.get("deployment_config", {}) - distributed_config = deployment_config.get("distributed", {}) - is_distributed = distributed_config.get("enabled", False) - nnodes = distributed_config.get("nnodes", 1) - is_multinode = is_distributed 
and nnodes > 1 - - # Determine launcher_type the same way as _prepare_template_context does - # (deployment_config doesn't store launcher_type directly) - launcher_config = self.config.additional_context.get("launcher", {}) - launcher_type = ( - launcher_config.get("type") - if launcher_config.get("type") is not None - else distributed_config.get("launcher") - ) - - # Normalize launcher based on deployment type and validity - launcher_type = normalize_launcher(launcher_type, "kubernetes") - - is_ray_launcher = launcher_type in ["vllm", "sglang"] - - # Sort pods by name to ensure consistent ordering (pod-0 is master) - sorted_pods = sorted(pods.items, key=lambda p: p.metadata.name) - - # ======================================================================== - # NEW: Per-Node Collection Strategy - # Collect logs and artifacts from ALL nodes - # Parse performance from ALL nodes (each reports node-local metrics) - # Aggregate metrics based on type (sum for throughput, etc.) - # ======================================================================== - - per_node_metrics = [] # Store performance from each node - results["nodes"] = [] # Store per-node details for display - - # Special handling for Ray-based launchers (vLLM, SGLang) - # These report per-replica metrics, need scaling - if is_multinode and is_ray_launcher: - self.console.print( - f"[cyan]Multi-node Ray deployment: {nnodes} nodes (Data Parallel mode)[/cyan]" - ) - - # Collect from ALL pods - for pod_index, pod in enumerate(sorted_pods): - pod_name = pod.metadata.name - pod_dir = results_dir / pod_name - pod_dir.mkdir(exist_ok=True) - - # Extract node rank from pod name (e.g., madengine-dummy-torchrun-0 -> 0) - try: - node_rank = int(pod_name.rsplit('-', 1)[-1]) - except (ValueError, IndexError): - node_rank = pod_index - - self.console.print(f"[dim] Collecting from pod: {pod_name} (node-{node_rank})[/dim]") - - try: - # 1. 
Collect pod logs - log = self.core_v1.read_namespaced_pod_log( - name=pod_name, namespace=self.namespace - ) - log_file = pod_dir / "pod.log" - log_file.write_text(log) - results["logs"].append({ - "pod": pod_name, - "log": log, - "file": str(log_file) - }) - - # 2. Parse NODE-LOCAL performance from log - perf_data = self._parse_performance_from_log( - log, model_info.get("name", "") - ) - - # Pod phase/exit can lag right after Job success; poll until terminal or timeout - pod = self._refresh_pod_until_terminal_phase(pod_name) - pod_status = pod.status.phase if pod else "Unknown" - pod_exit_code = ( - self._primary_workload_container_exit_code(pod) if pod else -1 - ) - - # Store per-node info for display table - node_info = { - "node_id": node_rank, - "pod_name": pod_name, - "status": "SUCCESS" if pod_status == "Succeeded" and pod_exit_code == 0 else "FAILED", - "exit_code": pod_exit_code, - "performance": perf_data.get("performance") if perf_data else None, - "metric": perf_data.get("metric") if perf_data else None, - "duration": perf_data.get("duration") if perf_data else None, - "log_file": str(log_file) - } - results["nodes"].append(node_info) - - if perf_data: - # For Ray launchers, this is per-replica metric - if is_multinode and is_ray_launcher: - perf_data["is_per_replica"] = True - per_node_metrics.append(perf_data) - self.console.print( - f"[green] ✓ Parsed performance: {perf_data['performance']:.2f} " - f"{perf_data['metric']} (node-{node_rank})[/green]" - ) - else: - self.console.print( - f"[dim] No performance metric found in node-{node_rank} log[/dim]" - ) - - except ApiException as e: - self.console.print( - f"[red]✗ Failed to get logs for pod {pod_name}: {e.reason}[/red]" - ) - results["nodes"].append({ - "node_id": node_rank, - "pod_name": pod_name, - "status": "FAILED", - "exit_code": -1, - "performance": None, - "metric": None, - "error": f"Failed to get logs: {e.reason}" - }) - except Exception as e: - self.console.print( - f"[red]✗ Error 
collecting from pod {pod_name}: {e}[/red]" - ) - results["nodes"].append({ - "node_id": node_rank, - "pod_name": pod_name, - "status": "FAILED", - "exit_code": -1, - "performance": None, - "metric": None, - "error": str(e) - }) - - self.console.print( - f"[green]✓ Collected logs from {len(results['logs'])} pods[/green]" - ) - - # Collect artifacts from PVC before deciding success/failure (needed for multiple_results fallback) - k8s_pod_names = [p.metadata.name for p in sorted_pods] - self._collect_from_pvc(deployment_id, results_dir, results, pod_names=k8s_pod_names) - - # ======================================================================== - # Aggregate per-node metrics - # ======================================================================== - if per_node_metrics: - # Special handling for Ray launchers - multiply by nnodes - if is_multinode and is_ray_launcher: - original_perf = per_node_metrics[0]["performance"] - aggregated_perf = original_perf * nnodes - self.console.print( - f"[green] Per-replica: {original_perf:.1f} req/s[/green]" - ) - self.console.print( - f"[green] Total capacity: {aggregated_perf:.1f} req/s ({nnodes} nodes)[/green]" - ) - - # Create aggregated record manually for Ray - aggregated_record = { - "model": per_node_metrics[0]["model"], - "performance": aggregated_perf, - "metric": per_node_metrics[0]["metric"], - "status": "SUCCESS", - "topology": f"{nnodes}N×{per_node_metrics[0].get('local_gpus', 1)}G", - "nnodes": nnodes, - "launcher": launcher_type or "N/A", - "deployment_type": "kubernetes", - "gpu_architecture": per_node_metrics[0].get("gpu_architecture", "N/A"), - "duration": per_node_metrics[0].get("duration", "N/A"), - "data_name": per_node_metrics[0].get("data_name", "N/A"), - "data_provider": per_node_metrics[0].get("data_provider", "N/A"), - "aggregation_method": "scaled_by_nnodes", - "nodes_contributing": nnodes - } - else: - # Use new aggregation logic for other launchers - aggregated_record = self._aggregate_node_metrics( 
- per_node_metrics, - nnodes, - launcher_type - ) - - if aggregated_record: - # Full reporting pipeline: perf_entry at project root, then update_* (same as local/SLURM) - self._ensure_perf_csv_exists() - run_details_dict = self._build_perf_entry_from_aggregated( - aggregated_record, model_info, build_info, deployment_id - ) - perf_entry_path = Path("perf_entry.json") - with open(perf_entry_path, "w", encoding="utf-8") as f: - json.dump(run_details_dict, f, indent=2) - if run_details_dict.get("status") == "SUCCESS": - update_perf_csv(perf_csv="perf.csv", single_result=str(perf_entry_path)) - else: - update_perf_csv(perf_csv="perf.csv", exception_result=str(perf_entry_path)) - scripts_path = model_info.get("scripts", "") - scripts_base_dir = scripts_base_dir_from(scripts_path) - try: - if run_details_dict.get("status") == "SUCCESS": - num_entries = update_perf_super_json( - single_result=str(perf_entry_path), - perf_super_json="perf_super.json", - scripts_base_dir=scripts_base_dir, - ) - else: - num_entries = update_perf_super_json( - exception_result=str(perf_entry_path), - perf_super_json="perf_super.json", - scripts_base_dir=scripts_base_dir, - ) - update_perf_super_csv( - perf_super_json="perf_super.json", - perf_super_csv="perf_super.csv", - num_entries=num_entries, - ) - except Exception as e: - self.console.print(f"[yellow]⚠ Could not update perf_super: {e}[/yellow]") - results["successful_runs"].append({ - "model": model_info.get("name"), - "perf_data": aggregated_record, - "nodes": results["nodes"], - "per_node_metrics": per_node_metrics - }) - self.console.print( - f"[green]✓ Aggregated performance from {len(per_node_metrics)} nodes[/green]" - ) - self.console.print( - f"[green]✓ Updated perf_entry.json, perf.csv, perf_super.* (Docker-compatible)[/green]" - ) - else: - # No performance from log: try multiple_results CSV (same contract as local Docker) - # Resolve single CSV path (one pod) or merged CSV path (multi-pod with sum/avg rules) - resolved_csv_path 
= self._resolve_multiple_results_csv( - results_dir, results, model_info - ) - if resolved_csv_path and REPORTING_AVAILABLE: - # Docker-compatible flow: produce perf.csv, perf_entry.*, perf_super.* - gpu_arch = "N/A" - if results.get("logs"): - import re - log_content = results["logs"][0].get("log", "") - m = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", log_content) - if m: - gpu_arch = m.group(1) - self._ensure_perf_csv_exists() - common_info = self._build_common_info_dict( - model_info, build_info, deployment_id, gpu_arch - ) - common_info_path = Path("common_info.json") - with open(common_info_path, "w", encoding="utf-8") as f: - json.dump(common_info, f, indent=2) - update_perf_csv( - perf_csv="perf.csv", - multiple_results=str(resolved_csv_path), - common_info=str(common_info_path), - model_name=model_info.get("name", ""), - ) - scripts_path = model_info.get("scripts", "") - scripts_base_dir = scripts_base_dir_from(scripts_path) - num_entries = update_perf_super_json( - perf_super_json="perf_super.json", - multiple_results=str(resolved_csv_path), - common_info=str(common_info_path), - model_name=model_info.get("name", ""), - scripts_base_dir=scripts_base_dir, - ) - update_perf_super_csv( - perf_super_json="perf_super.json", - perf_super_csv="perf_super.csv", - num_entries=num_entries, - ) - # Build successful_runs for display (one entry per CSV row) - import csv as _csv - model_name = model_info.get("name", "") - with open(resolved_csv_path, "r", encoding="utf-8", errors="ignore") as f: - reader = _csv.DictReader(f) - for row in reader: - row = {k.strip(): v for k, v in row.items() if k} - if row.get("performance") and row.get("metric"): - display_model = f"{model_name}_{row.get('model', '')}" - record = self._create_multiple_result_row_record( - model_info, build_info, deployment_id, - { - "model": display_model, - "performance": row.get("performance"), - "metric": row.get("metric", ""), - "gpu_architecture": gpu_arch, - "duration": row.get("test_duration", 
"N/A"), - }, - ) - if record: - results["successful_runs"].append({ - "model": display_model, - "perf_data": record, - "nodes": [], - "per_node_metrics": [{"model": display_model, "performance": row.get("performance"), "metric": row.get("metric", "")}], - }) - self.console.print( - f"[green]✓ Updated perf.csv, perf_entry.*, perf_super.* (Docker-compatible)[/green]" - ) - elif resolved_csv_path and not REPORTING_AVAILABLE: - # Fallback when reporting module not available: legacy row-by-row write - fallback_metrics = self._parse_multiple_results_from_artifacts( - results_dir, results, model_info, build_info - ) - if fallback_metrics: - for item in fallback_metrics: - record = self._create_multiple_result_row_record( - model_info, build_info, deployment_id, item - ) - if record: - self._write_to_perf_csv(record) - results["successful_runs"].append({ - "model": item["model"], - "perf_data": record, - "nodes": [], - "per_node_metrics": [item], - }) - self.console.print( - f"[green]✓ Wrote {len(fallback_metrics)} row(s) from multiple_results to perf.csv[/green]" - ) - if not resolved_csv_path: - # No multiple_results CSV found: record failure - error_msg = "No performance metrics found from any node" - failure_record = self._create_failure_record( - model_info, build_info, deployment_id, error_msg - ) - self._write_to_perf_csv(failure_record) - results["failed_runs"].append({ - "model": model_info.get("name", "Unknown"), - "error": error_msg, - "nodes": results["nodes"] - }) - self.console.print( - f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" - ) - elif resolved_csv_path and not REPORTING_AVAILABLE and not results.get("successful_runs"): - # Legacy path ran but produced no valid rows - error_msg = "No performance metrics found from any node" - failure_record = self._create_failure_record( - model_info, build_info, deployment_id, error_msg - ) - self._write_to_perf_csv(failure_record) - results["failed_runs"].append({ - "model": 
model_info.get("name", "Unknown"), - "error": error_msg, - "nodes": results["nodes"] - }) - self.console.print( - f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" - ) - - # 4. Generate summary - self._generate_results_summary(results, results_dir) - - except Exception as e: - self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") - - return results - - def _collect_artifacts_immediately(self, deployment_id: str, pod_name: str) -> None: - """ - Collect artifacts immediately from a running pod during the sleep period. - This is called when we detect the "Keeping pod alive" message in logs. - """ - try: - # Create results directory - results_dir = Path("k8s_results") / deployment_id - results_dir.mkdir(parents=True, exist_ok=True) - - pod_dir = results_dir / pod_name - pod_dir.mkdir(exist_ok=True) - - # Collect artifacts - artifacts = self._collect_pod_artifacts(pod_name, pod_dir) - - if artifacts: - self.console.print(f"[green]✓ Collected {len(artifacts)} artifacts from {pod_name}[/green]") - else: - self.console.print(f"[yellow]⚠ No artifacts collected from {pod_name}[/yellow]") - - except Exception as e: - self.console.print(f"[yellow]⚠ Error collecting artifacts: {e}[/yellow]") - - def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: - """ - Collect file artifacts from pod using kubectl cp. 
- - Collects: - - perf.csv (performance results) - - *_env.csv (environment details from rocEnvTool) - - profiling outputs (rocprof*, results*, *.db) - - tracing outputs (*_output/ directories) - - tool-specific outputs - - Args: - pod_name: Name of the Kubernetes pod - dest_dir: Local directory to save artifacts - - Returns: - List of collected artifact metadata - """ - artifacts = [] - - # Define artifact patterns to collect - artifact_patterns = [ - {"pattern": "perf.csv", "type": "performance"}, - {"pattern": "*_env.csv", "type": "environment"}, - {"pattern": "results*", "type": "profiling"}, - {"pattern": "*.db", "type": "profiling"}, - {"pattern": "trace.*", "type": "tracing"}, - {"pattern": "prof.csv", "type": "profiling"}, # Raw profiler output before post-script renames it - {"pattern": "gpu_info_*.csv", "type": "profiling"}, - {"pattern": "library_trace.csv", "type": "tracing"}, - ] - - for artifact_def in artifact_patterns: - pattern = artifact_def["pattern"] - artifact_type = artifact_def["type"] - - try: - # Try direct kubectl cp without exec (works during the sleep period) - # For patterns with wildcards, try common specific filenames - if '*' in pattern: - # Expand pattern to specific known files - if pattern == "*_env.csv": - specific_files = ["dummy_prof_env.csv", "dummy_data_minio_env.csv"] - elif pattern == "gpu_info_*.csv": - specific_files = ["gpu_info_power_profiler_output.csv", "gpu_info_vram_profiler_output.csv"] - elif pattern == "results*": - specific_files = ["results.csv", "results.txt", "results.json"] - elif pattern == "trace.*": - specific_files = ["trace.txt", "trace.csv", "trace.json"] - else: - specific_files = [] - - for filename in specific_files: - local_path = dest_dir / filename - cp_cmd = [ - "kubectl", "cp", - f"{self.namespace}/{pod_name}:/workspace/{filename}", - str(local_path) - ] - - cp_result = subprocess.run( - cp_cmd, capture_output=True, text=True, timeout=30 - ) - - if cp_result.returncode == 0 and 
local_path.exists(): - artifacts.append({ - "pod": pod_name, - "type": artifact_type, - "source": f"/workspace/{filename}", - "local_path": str(local_path), - "size": local_path.stat().st_size - }) - self.console.print( - f"[dim] ✓ Collected {artifact_type}: {filename}[/dim]" - ) - elif cp_result.stderr and "No such file" not in cp_result.stderr: - # Log unexpected errors (but not "file not found") - self.console.print( - f"[yellow] ⚠ Failed to collect {filename}: {cp_result.stderr.strip()}[/yellow]" - ) - else: - # Direct file - try to copy it - local_path = dest_dir / pattern - cp_cmd = [ - "kubectl", "cp", - f"{self.namespace}/{pod_name}:/workspace/{pattern}", - str(local_path) - ] - - cp_result = subprocess.run( - cp_cmd, capture_output=True, text=True, timeout=30 - ) - - if cp_result.returncode == 0 and local_path.exists(): - artifacts.append({ - "pod": pod_name, - "type": artifact_type, - "source": f"/workspace/{pattern}", - "local_path": str(local_path), - "size": local_path.stat().st_size - }) - self.console.print( - f"[dim] ✓ Collected {artifact_type}: {pattern}[/dim]" - ) - elif cp_result.stderr and "No such file" not in cp_result.stderr: - # Log unexpected errors (but not "file not found") - self.console.print( - f"[yellow] ⚠ Failed to collect {pattern}: {cp_result.stderr.strip()}[/yellow]" - ) - - except subprocess.TimeoutExpired: - pass # Timeout - skip this file - except Exception: - pass # File not found or not accessible - this is expected - - # Try to collect known output directories using kubectl cp directly (during sleep period) - output_directories = ["rocprof_output", "rpd_output", "trace_output"] - for dir_name in output_directories: - try: - local_dir = dest_dir / dir_name - cp_cmd = [ - "kubectl", "cp", - f"{self.namespace}/{pod_name}:/workspace/{dir_name}", - str(local_dir) - ] - - cp_result = subprocess.run( - cp_cmd, capture_output=True, text=True, timeout=60 - ) - - if cp_result.returncode == 0 and local_dir.exists(): - # Count files in 
directory - file_count = sum(1 for _ in local_dir.rglob('*') if _.is_file()) - if file_count > 0: - total_size = sum(f.stat().st_size for f in local_dir.rglob('*') if f.is_file()) - artifacts.append({ - "pod": pod_name, - "type": "tool_output_directory", - "source": f"/workspace/{dir_name}", - "local_path": str(local_dir), - "file_count": file_count, - "size": total_size - }) - self.console.print( - f"[dim] ✓ Collected directory: {dir_name} ({file_count} files, {total_size} bytes)[/dim]" - ) - except Exception: - pass # Directory not found - this is expected - - return artifacts - - def _collect_from_pvc( - self, - deployment_id: str, - results_dir: Path, - results: Dict, - pod_names: Optional[List[str]] = None, - ): - """ - Collect all artifacts from the PVC using a temporary busybox pod. - - This is the best practice for collecting results from completed K8s jobs. - kubectl cp doesn't work on completed pods, so we use a helper pod. - - When ``pod_names`` is provided, each ``/results//`` is copied to - ``results_dir//pvc/`` by matching subdir to pod name (exact or - ``pod.startswith(subdir + "-")``). Unmatched subdirs go under - ``results_dir/pvc_unmapped//``. When ``pod_names`` is omitted, the - legacy layout ``results_dir//`` is used. 
- - Args: - deployment_id: Job deployment ID - results_dir: Local directory to save results - results: Results dict to update - pod_names: Full Kubernetes pod names for this job (ordered) - """ - pvc_name = f"{deployment_id}-results" - - try: - # Create a temporary pod to access PVC - collector_pod_name = f"collector-{deployment_id[:15]}" - - self.console.print(f"[dim]📦 Collecting artifacts from PVC: {pvc_name}[/dim]") - - collector_spec: Dict[str, Any] = { - "restartPolicy": "Never", - "containers": [{ - "name": "collector", - "image": "busybox:latest", - "command": ["sh", "-c", "sleep 600"], - "volumeMounts": [{"name": "results", "mountPath": "/results"}] - }], - "volumes": [{"name": "results", "persistentVolumeClaim": {"claimName": pvc_name}}] - } - ips = getattr(self, "_image_pull_secrets_for_pods", None) or [] - if ips: - collector_spec["imagePullSecrets"] = ips - - collector_pod_spec = { - "apiVersion": "v1", - "kind": "Pod", - "metadata": {"name": collector_pod_name, "namespace": self.namespace}, - "spec": collector_spec, - } - - # Delete existing collector pod if it exists (prevents 409 Conflict) - try: - self.core_v1.delete_namespaced_pod( - collector_pod_name, self.namespace, grace_period_seconds=0 - ) - time.sleep(2) # Wait for pod to be deleted - except ApiException as e: - if e.status != 404: # 404 means pod doesn't exist, which is fine - pass - - # Create collector pod - self.core_v1.create_namespaced_pod(self.namespace, collector_pod_spec) - - # Wait for pod to be ready - for _ in range(30): # Wait up to 30 seconds - try: - pod_status = self.core_v1.read_namespaced_pod_status( - collector_pod_name, self.namespace - ) - if pod_status.status.phase == "Running": - break - except ApiException as e: - # Pod not found yet or not ready - this is expected during startup - if e.status != 404: - self.console.print(f"[dim]Waiting for collector pod (status: {e.status})...[/dim]") - time.sleep(1) - else: - raise Exception("Collector pod did not start in time") - 
- # Mount / NFS may need a moment before another pod sees prior job writes. - time.sleep(2) - - # List pod result directories in PVC (retry: NFS can lag right after Job completion) - list_cmd = [ - "kubectl", - "exec", - collector_pod_name, - "-n", - self.namespace, - "-c", - "collector", - "--", - "ls", - "-1", - "/results/", - ] - list_result = subprocess.CompletedProcess( - args=list_cmd, returncode=-1, stdout="", stderr="" - ) - pod_dirs: List[str] = [] - for attempt in range(45): - list_result = subprocess.run( - list_cmd, capture_output=True, text=True, timeout=30 - ) - if list_result.returncode == 0 and list_result.stdout.strip(): - pod_dirs = [ - d - for d in list_result.stdout.strip().split("\n") - if d and d != "lost+found" - ] - if pod_dirs: - break - if list_result.stderr.strip(): - self.console.print( - f"[dim] PVC ls attempt {attempt + 1} (rc={list_result.returncode}): " - f"{list_result.stderr.strip()[:300]}[/dim]" - ) - time.sleep(1) - - if pod_dirs: - pvc_map: Dict[str, str] = {} - if pod_names: - pvc_map = assign_pvc_subdirs_to_pods(pod_dirs, pod_names) - - for pod_dir_name in pod_dirs: - if not pod_dir_name: - continue - - matched_pod = pvc_map.get(pod_dir_name) if pod_names else None - if pod_names: - if matched_pod: - local_pod_dir = results_dir / matched_pod / "pvc" - else: - local_pod_dir = results_dir / "pvc_unmapped" / pod_dir_name - else: - local_pod_dir = results_dir / pod_dir_name - - local_pod_dir.mkdir(parents=True, exist_ok=True) - - cp_cmd = [ - "kubectl", - "cp", - "-c", - "collector", - f"{self.namespace}/{collector_pod_name}:/results/{pod_dir_name}", - str(local_pod_dir), - ] - - cp_result = subprocess.run(cp_cmd, capture_output=True, text=True, timeout=60) - - if cp_result.returncode == 0: - # Count collected files - file_count = sum(1 for _ in local_pod_dir.rglob('*') if _.is_file()) - if file_count > 0: - art: Dict[str, Any] = { - "source": f"PVC:{pvc_name}/{pod_dir_name}", - "local_path": str(local_pod_dir), - "file_count": 
file_count, - "type": "pvc_collection", - "pvc_subdir": pod_dir_name, - } - if pod_names: - art["k8s_pod"] = matched_pod - results["artifacts"].append(art) - if matched_pod: - dest_hint = f"{matched_pod}/pvc" - elif pod_names: - dest_hint = f"pvc_unmapped/{pod_dir_name}" - else: - dest_hint = pod_dir_name - self.console.print( - f"[dim] ✓ Collected {file_count} files from {pod_dir_name} → {dest_hint}[/dim]" - ) - - self.console.print(f"[green]✓ Collected artifacts from PVC[/green]") - else: - hint = "" - if list_result.returncode != 0 or list_result.stderr.strip(): - hint = ( - f" (kubectl exec rc={list_result.returncode}" - + ( - f", stderr={list_result.stderr.strip()[:400]!r}" - if list_result.stderr.strip() - else "" - ) - + ")" - ) - self.console.print( - f"[yellow]⚠ No results found in PVC after retries{hint}[/yellow]" - ) - - # Cleanup collector pod - self.core_v1.delete_namespaced_pod( - collector_pod_name, self.namespace, grace_period_seconds=0 - ) - - except Exception as e: - self.console.print(f"[yellow]⚠ Could not collect from PVC: {e}[/yellow]") - - def _generate_results_summary(self, results: Dict, results_dir: Path): - """ - Generate a summary JSON of all collected artifacts. - - Args: - results: Results dict with logs and artifacts - results_dir: Directory where results are saved - """ - summary = { - "job_name": results["job_name"], - "namespace": results["namespace"], - "collected_at": datetime.now().isoformat(), - "k8s_results_layout": ( - "Per pod: //pod.log (API log) and " - "//pvc/ (mirror of /results//). " - "Unmatched PVC subdirs: /pvc_unmapped//." 
- ), - "layout_version": 2, - "pods": len(results["logs"]), - "total_artifacts": len(results["artifacts"]), - "artifacts_by_type": {}, - "artifacts": results["artifacts"], - "successful_runs": len(results["successful_runs"]), - "failed_runs": len(results["failed_runs"]), - } - - # Group artifacts by type - for artifact in results["artifacts"]: - artifact_type = artifact.get("type", "unknown") - summary["artifacts_by_type"][artifact_type] = summary["artifacts_by_type"].get(artifact_type, 0) + 1 - - summary_file = results_dir / "results_summary.json" - summary_file.write_text(json.dumps(summary, indent=2)) - - self.console.print(f"[green]✓ Results summary: {summary_file}[/green]") - - # Print summary table if artifacts were collected - if summary["artifacts_by_type"]: - from rich.table import Table - table = Table(title="Collected Artifacts") - table.add_column("Type", style="cyan") - table.add_column("Count", justify="right", style="green") - - for artifact_type, count in sorted(summary["artifacts_by_type"].items()): - table.add_row(artifact_type, str(count)) - - self.console.print(table) - - def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: str, error_msg: str) -> Dict: - """ - Create a failure record for perf.csv when performance metrics are missing. 
- - Args: - model_info: Model information from manifest - build_info: Build information from manifest - pod_name: Kubernetes pod name - error_msg: Error message describing the failure - - Returns: - Dict with all perf.csv fields marked as FAILED - """ - import os - - # Get topology information for failure record - deployment_config = self.manifest.get("deployment_config", {}) - distributed_config = deployment_config.get("distributed", {}) - nnodes = distributed_config.get("nnodes", 1) - nproc_per_node = distributed_config.get("nproc_per_node") - if nproc_per_node is None: - nproc_per_node = int(model_info.get("n_gpus", 1)) - # Launcher: use distributed.launcher when set, otherwise "native" for k8s - launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") - - # Create a record with the same structure as successful runs - # but with performance=0, metric="", and status="FAILED" - result = { - # Core identification - "model": model_info.get("name", ""), - "n_gpus": str(nnodes * nproc_per_node), - "nnodes": str(nnodes), - "gpus_per_node": str(nproc_per_node), - - # Model configuration - "training_precision": model_info.get("training_precision", ""), - "pipeline": get_pipeline(), - "args": model_info.get("args", ""), - "tags": model_info.get("tags", ""), - - # Build information - "docker_file": build_info.get("dockerfile", ""), - "base_docker": build_info.get("base_docker", ""), - "docker_sha": build_info.get("docker_sha", ""), - "docker_image": build_info.get("docker_image", ""), - - # Runtime information - "git_commit": "", - "machine_name": pod_name, - "deployment_type": "kubernetes", - "launcher": launcher, - "gpu_architecture": "", - - # Performance metrics - FAILED - "performance": "0", - "metric": error_msg, # Store error message in metric field - "relative_change": "", - "status": "FAILURE", # Use "FAILURE" to match CSV schema - - # Timing - "build_duration": build_info.get("build_duration", ""), - "test_duration": "", - - # Data 
information - "dataname": model_info.get("data", ""), - "data_provider_type": "", - "data_size": "", - "data_download_duration": "", - - # Build tracking - "build_number": get_build_number(), - "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), - } - flatten_tags_in_place(result) - return result - - # Standard perf.csv header (must match container_runner.ensure_perf_csv_exists) - _PERF_CSV_HEADER = ( - "model,n_gpus,nnodes,gpus_per_node,training_precision,pipeline,args,tags," - "docker_file,base_docker,docker_sha,docker_image,git_commit,machine_name," - "deployment_type,launcher,gpu_architecture,performance,metric,relative_change," - "status,build_duration,test_duration,dataname,data_provider_type,data_size," - "data_download_duration,build_number,additional_docker_run_options" - ) - - def _ensure_perf_csv_exists(self) -> None: - """Ensure perf.csv exists with standard header (same as Docker container_runner).""" - perf_csv_path = Path("perf.csv") - if not perf_csv_path.exists(): - perf_csv_path.write_text(self._PERF_CSV_HEADER + "\n", encoding="utf-8") - self.console.print("[dim]Created perf.csv with standard header[/dim]") - - def _build_perf_entry_from_aggregated( - self, - aggregated_record: Dict[str, Any], - model_info: Dict[str, Any], - build_info: Dict[str, Any], - deployment_id: str, - ) -> Dict[str, Any]: - """Build full run_details dict from aggregated record for perf_entry and update_* pipeline.""" - from madengine.utils.config_parser import ConfigParser - - deployment_config = self.manifest.get("deployment_config", {}) - distributed_config = deployment_config.get("distributed", {}) - nnodes = distributed_config.get("nnodes", 1) - nproc_per_node = distributed_config.get("nproc_per_node") - if nproc_per_node is None: - nproc_per_node = int(model_info.get("n_gpus", 1)) - launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") - test_duration = aggregated_record.get("test_duration") or 
def _build_perf_entry_from_aggregated(
    self,
    aggregated_record: Dict[str, Any],
    model_info: Dict[str, Any],
    build_info: Dict[str, Any],
    deployment_id: str,
) -> Dict[str, Any]:
    """Build the full run_details dict for one aggregated result.

    Topology values (``n_gpus``, ``nnodes``, ``gpus_per_node``) come from
    ``aggregated_record`` when present, otherwise from the manifest's
    ``deployment_config.distributed`` section. ``nproc_per_node`` falls back
    to the model's ``n_gpus`` when the manifest does not set it.

    Args:
        aggregated_record: Aggregated performance record for the run.
        model_info: Model information from the manifest.
        build_info: Build information from the manifest.
        deployment_id: Identifier stored in the ``machine_name`` field.

    Returns:
        Dict with the perf.csv fields plus a ``configs`` entry. ``configs``
        is None when the config could not be loaded.
    """
    from madengine.utils.config_parser import ConfigParser

    deployment_config = self.manifest.get("deployment_config", {})
    distributed_config = deployment_config.get("distributed", {})
    nnodes = distributed_config.get("nnodes", 1)
    nproc_per_node = distributed_config.get("nproc_per_node")
    if nproc_per_node is None:
        nproc_per_node = int(model_info.get("n_gpus", 1))
    launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")
    # Prefer an explicit test_duration; fall back to the generic duration field.
    test_duration = aggregated_record.get("test_duration") or aggregated_record.get("duration", "")

    run_details = {
        "model": model_info.get("name", aggregated_record.get("model", "")),
        "n_gpus": str(aggregated_record.get("n_gpus", nnodes * nproc_per_node)),
        "nnodes": str(aggregated_record.get("nnodes", nnodes)),
        "gpus_per_node": str(aggregated_record.get("gpus_per_node", nproc_per_node)),
        "training_precision": model_info.get("training_precision", ""),
        "pipeline": get_pipeline(),
        "args": model_info.get("args", ""),
        "tags": model_info.get("tags", ""),
        "docker_file": build_info.get("dockerfile", ""),
        "base_docker": build_info.get("base_docker", ""),
        "docker_sha": build_info.get("docker_sha", ""),
        "docker_image": build_info.get("docker_image", ""),
        "git_commit": "",
        "machine_name": deployment_id,
        "deployment_type": "kubernetes",
        "launcher": launcher,
        "gpu_architecture": aggregated_record.get("gpu_architecture", ""),
        "performance": str(aggregated_record.get("performance", "")),
        "metric": aggregated_record.get("metric", ""),
        "relative_change": "",
        "status": aggregated_record.get("status", "SUCCESS"),
        "build_duration": build_info.get("build_duration", ""),
        "test_duration": test_duration,
        "dataname": aggregated_record.get("data_name", model_info.get("data", "")),
        "data_provider_type": aggregated_record.get("data_provider", ""),
        "data_size": "",
        "data_download_duration": "",
        "build_number": get_build_number(),
        "additional_docker_run_options": model_info.get("additional_docker_run_options", ""),
    }
    flatten_tags_in_place(run_details)

    try:
        scripts_path = model_info.get("scripts", "")
        scripts_base_dir = scripts_base_dir_from(scripts_path)
        config_parser = ConfigParser(scripts_base_dir=scripts_base_dir)
        run_details["configs"] = config_parser.parse_and_load(
            model_info.get("args", ""), scripts_path
        )
    except Exception as exc:
        # Config loading is best-effort: the perf entry is still valid without
        # it, but say why it is missing instead of failing silently.
        run_details["configs"] = None
        self.console.print(
            f"[dim] Could not load configs for perf entry: {exc}[/dim]"
        )
    return run_details
def _build_common_info_dict(
    self,
    model_info: Dict,
    build_info: Dict,
    deployment_id: str,
    gpu_architecture: str = "",
) -> Dict:
    """
    Assemble the shared run fields passed to update_perf_csv / update_perf_super.

    The ``model``, ``performance`` and ``metric`` keys are intentionally left
    out; they are filled per row from the multiple_results CSV.

    Args:
        model_info: Model information from the manifest.
        build_info: Build information from the manifest.
        deployment_id: Identifier stored in the ``machine_name`` field.
        gpu_architecture: GPU architecture string, empty when unknown.

    Returns:
        Dict of the common perf.csv fields, with tags flattened in place.
    """
    distributed = self.manifest.get("deployment_config", {}).get("distributed", {})
    node_count = distributed.get("nnodes", 1)
    procs_per_node = distributed.get("nproc_per_node")
    if procs_per_node is None:
        # The manifest did not pin a per-node count; use the model's n_gpus.
        procs_per_node = int(model_info.get("n_gpus", 1))
    gpu_total = str(node_count * procs_per_node)
    # Launcher: use distributed.launcher when set, otherwise the k8s default.
    launcher_name = normalize_launcher(distributed.get("launcher"), "kubernetes")

    common_info = {
        "n_gpus": gpu_total,
        "nnodes": str(node_count),
        "gpus_per_node": str(procs_per_node),
        "training_precision": model_info.get("training_precision", ""),
        "pipeline": get_pipeline(),
        "args": model_info.get("args", ""),
        "tags": model_info.get("tags", ""),
        "docker_file": build_info.get("dockerfile", ""),
        "base_docker": build_info.get("base_docker", ""),
        "docker_sha": build_info.get("docker_sha", ""),
        "docker_image": build_info.get("docker_image", ""),
        "git_commit": "",
        "machine_name": deployment_id,
        "deployment_type": "kubernetes",
        "launcher": launcher_name,
        "gpu_architecture": gpu_architecture,
        "relative_change": "",
        "build_duration": build_info.get("build_duration", ""),
        "test_duration": "",
        "dataname": model_info.get("data", ""),
        "data_provider_type": "",
        "data_size": "",
        "data_download_duration": "",
        "build_number": get_build_number(),
        "additional_docker_run_options": model_info.get("additional_docker_run_options", ""),
    }
    flatten_tags_in_place(common_info)
    return common_info
def _create_multiple_result_row_record(
    self,
    model_info: Dict,
    build_info: Dict,
    deployment_id: str,
    item: Dict,
) -> Dict:
    """
    Build one perf.csv row for a single row of a multiple_results CSV.

    The row always carries status "SUCCESS". ``model``, ``performance``,
    ``metric``, ``gpu_architecture`` and ``duration`` are read from ``item``;
    topology comes from the manifest's ``deployment_config.distributed``
    section, with ``nproc_per_node`` falling back to the model's ``n_gpus``.

    Args:
        model_info: Model information from the manifest.
        build_info: Build information from the manifest.
        deployment_id: Identifier stored in the ``machine_name`` field.
        item: One parsed result row.

    Returns:
        Dict with the perf.csv fields, with tags flattened in place.
    """
    deployment_config = self.manifest.get("deployment_config", {})
    distributed_config = deployment_config.get("distributed", {})
    nnodes = distributed_config.get("nnodes", 1)
    nproc_per_node = distributed_config.get("nproc_per_node")
    if nproc_per_node is None:
        nproc_per_node = int(model_info.get("n_gpus", 1))

    # Launcher: use distributed.launcher when set, otherwise the k8s default.
    launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")
    result = {
        "model": item.get("model", model_info.get("name", "")),
        "n_gpus": str(nnodes * nproc_per_node),
        "nnodes": str(nnodes),
        "gpus_per_node": str(nproc_per_node),
        "training_precision": model_info.get("training_precision", ""),
        "pipeline": get_pipeline(),
        "args": model_info.get("args", ""),
        "tags": model_info.get("tags", ""),
        "docker_file": build_info.get("dockerfile", ""),
        "base_docker": build_info.get("base_docker", ""),
        "docker_sha": build_info.get("docker_sha", ""),
        "docker_image": build_info.get("docker_image", ""),
        "git_commit": "",
        "machine_name": deployment_id,
        "deployment_type": "kubernetes",
        "launcher": launcher,
        "gpu_architecture": item.get("gpu_architecture", ""),
        "performance": str(item.get("performance", "")),
        "metric": item.get("metric", ""),
        "relative_change": "",
        "status": "SUCCESS",
        "build_duration": build_info.get("build_duration", ""),
        "test_duration": item.get("duration", ""),
        "dataname": model_info.get("data", ""),
        "data_provider_type": "",
        "data_size": "",
        "data_download_duration": "",
        "build_number": get_build_number(),
        "additional_docker_run_options": model_info.get("additional_docker_run_options", ""),
    }
    flatten_tags_in_place(result)
    return result
def _parse_multiple_results_from_artifacts( - self, - results_dir: Path, - results: Dict, - model_info: Dict, - build_info: Dict, - ) -> List[Dict]: - """ - Parse performance from a multiple_results CSV (e.g. perf_dummy.csv) collected from PVC. - Used when the model only writes CSV and does not print 'performance: X Y' to the log - (same contract as local container_runner multiple_results handling). - - Returns: - List of perf_data dicts (same shape as _parse_node_performance), or empty list. - """ - import csv as csv_module - multiple_results_file = model_info.get("multiple_results") - filename = Path(multiple_results_file).name if multiple_results_file else None - # Try to get gpu_architecture from first pod log - gpu_arch = "N/A" - if results.get("logs"): - import re - log_content = results["logs"][0].get("log", "") - gpu_arch_match = re.search(r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)", log_content) - if gpu_arch_match: - gpu_arch = gpu_arch_match.group(1) - parsed_list = [] - for art in results.get("artifacts", []): - if art.get("type") != "pvc_collection": - continue - local_path = Path(art.get("local_path", "")) - if not local_path.is_dir(): - continue - # Prefer exact filename (same as Docker multiple_results); fallback to any perf_*.csv - csv_path = (local_path / filename) if filename else None - if not csv_path or not csv_path.is_file(): - perf_csvs = sorted(local_path.glob("perf_*.csv")) - csv_path = perf_csvs[0] if perf_csvs else None - if not csv_path or not csv_path.is_file(): - continue - try: - with open(csv_path, "r", encoding="utf-8", errors="ignore") as f: - reader = csv_module.DictReader(f) - reader.fieldnames = [f.strip() for f in (reader.fieldnames or [])] - if not reader.fieldnames or "performance" not in reader.fieldnames or "metric" not in reader.fieldnames: - continue - for row_idx, row in enumerate(reader): - perf_val = row.get("performance", "").strip() - metric_val = row.get("metric", "").strip() - if not perf_val or not metric_val: - continue - 
try: - perf_float = float(perf_val) - except (ValueError, TypeError): - continue - # Same model naming as local handle_multiple_results: model_name + "_" + str(model) - row_model = row.get("model", row_idx) - display_model = f"{model_info.get('name')}_{row_model}" - parsed_list.append({ - "model": display_model, - "performance": perf_float, - "metric": metric_val, - "node_id": row_idx, - "local_gpus": 1, - "duration": "N/A", - "gpu_architecture": gpu_arch, - "data_name": "N/A", - "data_provider": "N/A", - }) - if parsed_list: - self.console.print( - f"[green] ✓ Parsed performance from {csv_path.name} ({len(parsed_list)} row(s))[/green]" - ) - return parsed_list - except Exception as e: - self.console.print( - f"[dim] Could not parse {csv_path.name} from PVC: {e}[/dim]" - ) - return [] - - def _aggregation_for_extra_column(self, column_name: str) -> str: - """ - Return how to aggregate an extra CSV column when merging multi-node results. - Best practice: throughput/counts -> sum; latencies/utilization -> average; - duration/capacity -> max; identifiers -> first. - """ - col = column_name.lower().strip() - # Sum: counts, totals, throughput-like - if any(k in col for k in [ - "count", "total", "samples", "tokens", "throughput", - "requests", "images", "bandwidth", "ops" - ]): - return "sum" - # Average: rates per unit, utilization, ratios - if any(k in col for k in [ - "utilization", "usage", "percent", "ratio", "latency", - "time_ms", "ttft", "tpot", "accuracy", "loss" - ]): - return "average" - # Max: duration (slowest node), memory, capacity - if any(k in col for k in [ - "duration", "time", "seconds", "memory", "bytes", "mb", "gb" - ]): - return "max" - return "first" - - def _merge_multi_node_multiple_results_csv( - self, csv_paths: List[Path], output_path: Path - ) -> bool: - """ - Merge multiple pod multiple_results CSVs into one with sum/average rules. - Rows are aligned by index (row 0 from each pod -> one merged row 0). 
- - performance: aggregated by _determine_aggregation_method(metric) (sum or average). - - Other numeric columns: by _aggregation_for_extra_column (sum/average/max). - - model, metric: taken from first CSV. - """ - import csv as csv_module - import statistics - - required = ["model", "performance", "metric"] - rows_by_index: Dict[int, List[Dict]] = {} - - for path in csv_paths: - try: - with open(path, "r", encoding="utf-8", errors="ignore") as f: - reader = csv_module.DictReader(f) - fieldnames = [c.strip() for c in (reader.fieldnames or [])] - if not all(h in fieldnames for h in required): - continue - for idx, row in enumerate(reader): - row = {k.strip(): v for k, v in row.items() if k} - if not row.get("performance") or not row.get("metric"): - continue - try: - float(str(row["performance"]).strip()) - except (ValueError, TypeError): - continue - if idx not in rows_by_index: - rows_by_index[idx] = [] - rows_by_index[idx].append(row) - except Exception as e: - self.console.print(f"[dim] Could not read {path.name}: {e}[/dim]") - continue - - if not rows_by_index: - return False - - # Build union of columns (required first, then rest) - extra_cols = set() - for group in rows_by_index.values(): - for row in group: - extra_cols.update(k for k in row if k not in required) - all_columns = list(required) + sorted(extra_cols) - merged_rows = [] - for idx in sorted(rows_by_index.keys()): - group = rows_by_index[idx] - first = group[0] - metric_name = (first.get("metric") or "").strip() - perf_agg = self._determine_aggregation_method(metric_name) - perf_values = [] - for r in group: - try: - perf_values.append(float(str(r.get("performance", "")).strip())) - except (ValueError, TypeError): - pass - if not perf_values: - continue - if perf_agg == "sum": - performance = sum(perf_values) - elif perf_agg == "average": - performance = statistics.mean(perf_values) - elif perf_agg == "max": - performance = max(perf_values) - else: - performance = sum(perf_values) - merged = { - 
"model": first.get("model", ""), - "performance": performance, - "metric": first.get("metric", ""), - } - for col in all_columns: - if col in merged: - continue - values = [r.get(col) for r in group] - try: - nums = [float(str(v).strip()) for v in values if v is not None and str(v).strip()] - except (ValueError, TypeError): - nums = [] - if nums: - extra_agg = self._aggregation_for_extra_column(col) - if extra_agg == "sum": - merged[col] = sum(nums) - elif extra_agg == "average": - merged[col] = statistics.mean(nums) - elif extra_agg == "max": - merged[col] = max(nums) - else: - merged[col] = first.get(col, "") - else: - merged[col] = first.get(col, "") - merged_rows.append(merged) - - if not merged_rows: - return False - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w", newline="", encoding="utf-8") as f: - writer = csv_module.DictWriter(f, fieldnames=all_columns, extrasaction="ignore") - writer.writeheader() - writer.writerows(merged_rows) - self.console.print( - f"[green] ✓ Merged {len(csv_paths)} pod CSV(s) into {len(merged_rows)} row(s) → {output_path.name}[/green]" - ) - return True - - def _resolve_multiple_results_csv( - self, results_dir: Path, results: Dict, model_info: Dict - ) -> Optional[Path]: - """ - Resolve path to a single multiple_results CSV for update_perf_csv. - Single pod: return that CSV path. Multi-pod: merge all pod CSVs with - sum/average rules and return path to merged file. 
- """ - multiple_results_file = model_info.get("multiple_results") - filename = Path(multiple_results_file).name if multiple_results_file else None - csv_paths: List[Path] = [] - for art in results.get("artifacts", []): - if art.get("type") != "pvc_collection": - continue - local_path = Path(art.get("local_path", "")) - if not local_path.is_dir(): - continue - csv_path = (local_path / filename) if filename else None - if not csv_path or not csv_path.is_file(): - perf_csvs = sorted(local_path.glob("perf_*.csv")) - csv_path = perf_csvs[0] if perf_csvs else None - if csv_path and csv_path.is_file(): - csv_paths.append(csv_path) - if not csv_paths: - return None - if len(csv_paths) == 1: - return csv_paths[0] - merged_path = results_dir / "multiple_results_merged.csv" - if self._merge_multi_node_multiple_results_csv(csv_paths, merged_path): - return merged_path - return csv_paths[0] - def cleanup(self, deployment_id: str) -> bool: """Delete Job, ConfigMap, Service and associated pods.""" success = True @@ -3532,4 +776,3 @@ def cleanup(self, deployment_id: str) -> bool: pass return success -