def process_batch_manifest(batch_manifest_file: str) -> Dict[str, List[str]]:
    """Process a batch manifest file and extract model tags based on the build_new flag.

    Args:
        batch_manifest_file: Path to the input manifest.json file.

    Returns:
        Dict with three keys:
            'build_tags':    names of models whose entry has build_new=true,
            'all_tags':      names of every model in the manifest,
            'manifest_data': the raw parsed manifest list (used later when
                             synthesizing build-manifest entries).

    Raises:
        FileNotFoundError: If the manifest file doesn't exist.
        ValueError: If the manifest format is invalid.
    """
    if not os.path.exists(batch_manifest_file):
        raise FileNotFoundError(f"Batch manifest file not found: {batch_manifest_file}")

    try:
        with open(batch_manifest_file, 'r') as f:
            manifest_data = json.load(f)
    except json.JSONDecodeError as e:
        # Chain the original decode error so the root cause stays visible.
        raise ValueError(f"Invalid JSON in batch manifest file: {e}") from e

    if not isinstance(manifest_data, list):
        raise ValueError("Batch manifest must be a list of model objects")

    build_tags = []  # Models that need to be built (build_new=true)
    all_tags = []    # All models in the manifest

    for i, model in enumerate(manifest_data):
        if not isinstance(model, dict):
            raise ValueError(f"Model entry {i} must be a dictionary")

        if "model_name" not in model:
            raise ValueError(f"Model entry {i} missing required 'model_name' field")

        model_name = model["model_name"]
        all_tags.append(model_name)
        # build_new defaults to False: absent flag means "do not rebuild".
        if model.get("build_new", False):
            build_tags.append(model_name)

    return {
        "build_tags": build_tags,
        "all_tags": all_tags,
        "manifest_data": manifest_data,
    }


def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, registry: Optional[str]) -> None:
    """Process batch manifest and add entries for all models to build_manifest.json.

    For every model with build_new=false, a synthetic entry is recorded in the
    build manifest so downstream consumers see a complete model list even
    though no image was actually built.

    Args:
        batch_data: Processed batch manifest data (output of process_batch_manifest).
        manifest_output: Path to the build manifest file.
        registry: Registry used for the build.
    """
    from madengine.tools.discover_models import DiscoverModels

    # Load the existing build manifest, or start from a minimal skeleton.
    if os.path.exists(manifest_output):
        with open(manifest_output, 'r') as f:
            build_manifest = json.load(f)
    else:
        build_manifest = {
            "built_images": {},
            "built_models": {},
            "context": {},
            "credentials_required": [],
            "registry": registry or ""
        }

    # Process each model in the batch manifest.
    for model_entry in batch_data["manifest_data"]:
        model_name = model_entry["model_name"]
        build_new = model_entry.get("build_new", False)
        model_registry_image = model_entry.get("registry_image", "")
        model_registry = model_entry.get("registry", "")

        # Models that were actually built already have real manifest entries.
        if build_new:
            continue

        # Synthetic image name keys both built_images and built_models.
        synthetic_image_name = f"ci-{model_name}_{model_name}.ubuntu.amd"

        # Resolve the registry image reference: an explicit registry_image
        # wins; otherwise derive one from whichever registry is known.
        if model_registry_image:
            registry_image = model_registry_image
        elif model_registry or registry:
            registry_image = f"{model_registry or registry}/{synthetic_image_name}"
        else:
            registry_image = ""

        try:
            # Discover the model's configuration so the synthetic entry
            # carries real metadata (dockerfile, scripts, n_gpus, ...).
            temp_args = create_args_namespace(
                tags=[model_name],
                registry=registry,
                additional_context="{}",
                additional_context_file=None,
                clean_docker_cache=False,
                manifest_output=manifest_output,
                live_output=False,
                output="perf.csv",
                ignore_deprecated_flag=False,
                data_config_file_name="data.json",
                tools_json_file_name="scripts/common/tools.json",
                generate_sys_env_details=True,
                force_mirror_local=None,
                disable_skip_gpu_arch=False,
                verbose=False,
                _separate_phases=True,
            )

            discover_models = DiscoverModels(args=temp_args)
            models = discover_models.run()

            for model_info in models:
                if model_info["name"] != model_name:
                    continue

                # Add to built_images (even though it wasn't actually built).
                build_manifest["built_images"][synthetic_image_name] = {
                    "docker_image": synthetic_image_name,
                    "dockerfile": model_info.get("dockerfile", f"docker/{model_name}"),
                    "base_docker": "rocm/pytorch",  # Default base
                    "docker_sha": "",  # No SHA since not built
                    "build_duration": 0,
                    "build_command": f"# Skipped build for {model_name} (build_new=false)",
                    "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log",
                    "registry_image": registry_image,
                }

                # Add to built_models.
                build_manifest["built_models"][synthetic_image_name] = {
                    "name": model_info["name"],
                    "dockerfile": model_info.get("dockerfile", f"docker/{model_name}"),
                    "scripts": model_info.get("scripts", f"scripts/{model_name}/run.sh"),
                    "n_gpus": model_info.get("n_gpus", "1"),
                    "owner": model_info.get("owner", ""),
                    "training_precision": model_info.get("training_precision", ""),
                    "tags": model_info.get("tags", []),
                    "args": model_info.get("args", ""),
                    "cred": model_info.get("cred", "")
                }
                break

        except Exception as e:
            # Best-effort: fall back to a minimal entry so the manifest still
            # lists the model even when discovery fails.
            console.print(f"Warning: Could not process model {model_name}: {e}")
            build_manifest["built_images"][synthetic_image_name] = {
                "docker_image": synthetic_image_name,
                "dockerfile": f"docker/{model_name}",
                "base_docker": "rocm/pytorch",
                "docker_sha": "",
                "build_duration": 0,
                "build_command": f"# Skipped build for {model_name} (build_new=false)",
                "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log",
                "registry_image": model_registry_image or ""
            }
            build_manifest["built_models"][synthetic_image_name] = {
                "name": model_name,
                "dockerfile": f"docker/{model_name}",
                "scripts": f"scripts/{model_name}/run.sh",
                "n_gpus": "1",
                "owner": "",
                "training_precision": "",
                "tags": [],
                "args": "",
                # Include "cred" for consistency with the discovery path above.
                "cred": ""
            }

    # Save the updated manifest.
    with open(manifest_output, 'w') as f:
        json.dump(build_manifest, f, indent=2)

    console.print(f"āœ… Added entries for all models from batch manifest to {manifest_output}")
+ + Batch Build Mode: + Use --batch-manifest to specify a manifest.json file containing a list of models. + For each model with build_new=true, the image will be built. For all models + (regardless of build_new), entries will be created in the build_manifest.json. + + Example batch manifest.json: + [ + { + "model_name": "dummy", + "build_new": false, + "registry_image": "rocm/mad-private:ci-dummy_dummy.ubuntu.amd", + "registry": "dockerhub" + }, + { + "model_name": "dummy2", + "build_new": true, + "registry_image": "", + "registry": "" + } + ] """ setup_logging(verbose) - console.print(Panel( - f"šŸ”Ø [bold cyan]Building Models[/bold cyan]\n" - f"Tags: [yellow]{', '.join(tags) if tags else 'All models'}[/yellow]\n" - f"Registry: [yellow]{registry or 'Local only'}[/yellow]", - title="Build Configuration", - border_style="blue" - )) + # Validate mutually exclusive options + if batch_manifest and tags: + console.print("āŒ [bold red]Error: Cannot specify both --batch-manifest and --tags options[/bold red]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Process batch manifest if provided + batch_data = None + effective_tags = tags + if batch_manifest: + try: + batch_data = process_batch_manifest(batch_manifest) + effective_tags = batch_data["build_tags"] + console.print(Panel( + f"ļæ½ [bold cyan]Batch Build Mode[/bold cyan]\n" + f"Input manifest: [yellow]{batch_manifest}[/yellow]\n" + f"Total models: [yellow]{len(batch_data['all_tags'])}[/yellow]\n" + f"Models to build: [yellow]{len(batch_data['build_tags'])}[/yellow] ({', '.join(batch_data['build_tags']) if batch_data['build_tags'] else 'none'})\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Batch Build Configuration", + border_style="blue" + )) + except (FileNotFoundError, ValueError) as e: + console.print(f"āŒ [bold red]Error processing batch manifest: {e}[/bold red]") + raise typer.Exit(ExitCode.INVALID_ARGS) + else: + console.print(Panel( + f"ļæ½šŸ”Ø [bold cyan]Building Models[/bold 
cyan]\n" + f"Tags: [yellow]{', '.join(tags) if tags else 'All models'}[/yellow]\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Build Configuration", + border_style="blue" + )) try: # Validate additional context @@ -303,7 +521,7 @@ def build( # Create arguments object args = create_args_namespace( - tags=tags, + tags=effective_tags, registry=registry, additional_context=additional_context, additional_context_file=additional_context_file, @@ -338,6 +556,12 @@ def build( ) progress.update(task, description="Build completed!") + # Handle batch manifest post-processing + if batch_data: + with console.status("Processing batch manifest..."): + _process_batch_manifest_entries(batch_data, manifest_output, registry) + + # Display results display_results_table(build_summary, "Build Results") From 768dcf92eb06a86d584508b6ab4a28240faaa038 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 17:05:26 -0400 Subject: [PATCH 2/9] enhanced logging system is now active and will automatically highlight all Docker operations --- src/madengine/core/console.py | 77 +++++++- src/madengine/mad_cli.py | 8 +- .../pre_scripts/rocEnvTool/csv_parser.py | 18 +- src/madengine/tools/container_runner.py | 23 ++- src/madengine/tools/csv_to_html.py | 24 ++- src/madengine/tools/docker_builder.py | 27 +-- src/madengine/tools/run_models.py | 12 +- src/madengine/tools/update_perf_csv.py | 28 ++- src/madengine/utils/log_formatting.py | 172 ++++++++++++++++++ 9 files changed, 359 insertions(+), 30 deletions(-) create mode 100644 src/madengine/utils/log_formatting.py diff --git a/src/madengine/core/console.py b/src/madengine/core/console.py index 9340924a..e25a1eba 100644 --- a/src/madengine/core/console.py +++ b/src/madengine/core/console.py @@ -8,6 +8,7 @@ # built-in modules import subprocess import typing +import re # third-party modules import typing_extensions @@ -33,6 +34,73 @@ def __init__( self.shellVerbose = shellVerbose self.live_output = live_output + def 
_highlight_docker_operations(self, command: str) -> str: + """Highlight docker push/pull/build/run operations for better visibility. + + Args: + command (str): The command to potentially highlight. + + Returns: + str: The highlighted command if it's a docker operation. + """ + # Check if this is a docker operation + docker_push_pattern = r'^docker\s+push\s+' + docker_pull_pattern = r'^docker\s+pull\s+' + docker_build_pattern = r'^docker\s+build\s+' + docker_run_pattern = r'^docker\s+run\s+' + + if re.match(docker_push_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸš€ DOCKER PUSH OPERATION: {command}\n{'='*80}" + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸ“„ DOCKER PULL OPERATION: {command}\n{'='*80}" + elif re.match(docker_build_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸ”Ø DOCKER BUILD OPERATION: {command}\n{'='*80}" + elif re.match(docker_run_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸƒ DOCKER RUN OPERATION: {command}\n{'='*80}" + + return command + + def _show_docker_completion(self, command: str, success: bool = True) -> None: + """Show completion message for docker operations. + + Args: + command (str): The command that was executed. + success (bool): Whether the operation was successful. 
+ """ + docker_push_pattern = r'^docker\s+push\s+' + docker_pull_pattern = r'^docker\s+pull\s+' + docker_build_pattern = r'^docker\s+build\s+' + docker_run_pattern = r'^docker\s+run\s+' + + if re.match(docker_push_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER PUSH COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER PUSH FAILED") + print(f"{'='*80}\n") + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER PULL COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER PULL FAILED") + print(f"{'='*80}\n") + elif re.match(docker_build_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER BUILD COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER BUILD FAILED") + print(f"{'='*80}\n") + elif re.match(docker_run_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER RUN COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER RUN FAILED") + print(f"{'='*80}\n") + def sh( self, command: str, @@ -60,7 +128,8 @@ def sh( """ # Print the command if shellVerbose is True if self.shellVerbose and not secret: - print("> " + command, flush=True) + highlighted_command = self._highlight_docker_operations(command) + print("> " + highlighted_command, flush=True) # Run the shell command proc = subprocess.Popen( @@ -91,6 +160,12 @@ def sh( raise RuntimeError("Console script timeout") from exc # Check for failure + success = proc.returncode == 0 + + # Show docker operation completion status + if not secret: + self._show_docker_completion(command, success) + if proc.returncode != 0: if not canFail: if not secret: diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index fbd68305..b08c7a36 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -123,7 +123,7 @@ def process_batch_manifest(batch_manifest_file: str) -> Dict[str, List[str]]: """Process batch manifest file and extract 
model tags based on build_new flag. Args: - batch_manifest_file: Path to the input manifest.json file + batch_manifest_file: Path to the input batch.json file Returns: Dict containing 'build_tags' and 'all_tags' lists @@ -436,7 +436,7 @@ def get_display_names(items, limit=5): def build( tags: Annotated[List[str], typer.Option("--tags", "-t", help="Model tags to build (can specify multiple)")] = [], registry: Annotated[Optional[str], typer.Option("--registry", "-r", help="Docker registry to push images to")] = None, - batch_manifest: Annotated[Optional[str], typer.Option("--batch-manifest", help="Input manifest.json file for batch build mode")] = None, + batch_manifest: Annotated[Optional[str], typer.Option("--batch-manifest", help="Input batch.json file for batch build mode")] = None, additional_context: Annotated[str, typer.Option("--additional-context", "-c", help="Additional context as JSON string")] = "{}", additional_context_file: Annotated[Optional[str], typer.Option("--additional-context-file", "-f", help="File containing additional context JSON")] = None, clean_docker_cache: Annotated[bool, typer.Option("--clean-docker-cache", help="Rebuild images without using cache")] = False, @@ -460,11 +460,11 @@ def build( is required for build-only operations. Batch Build Mode: - Use --batch-manifest to specify a manifest.json file containing a list of models. + Use --batch-manifest to specify a batch.json file containing a list of models. For each model with build_new=true, the image will be built. For all models (regardless of build_new), entries will be created in the build_manifest.json. 
def print_csv_output(self):
    """Pretty-print the collected system-config environment info to stdout.

    Lines containing a '|' (other than the 'Tag' header row) are rendered as
    aligned key/value pairs; all other lines are printed verbatim. Prints a
    placeholder message when no system-config info was collected.
    """
    print("\n" + "="*80)
    print("šŸ“‹ SYSTEM CONFIG INFO - ENVIRONMENT VARIABLES")
    print("="*80)
    if self.sys_config_info_list:
        # Iterate the lines directly instead of indexing by range(len(...)).
        for line in self.sys_config_info_list:
            # Add some formatting for key-value pairs (split on first '|' only).
            if "|" in line and not line.startswith("Tag"):
                key, value = line.split("|", 1)
                print(f"šŸ”¹ {key:<30}: {value}")
            else:
                print(f"šŸ“Œ {line}")
    else:
        print("āŒ No system config information available")
    print("="*80 + "\n")
print(f"Tagged as: {local_name}") + print(f"šŸ·ļø Tagged as: {local_name}") + print(f"āœ… Successfully pulled and tagged image") + print(f"{'='*80}") return local_name + print(f"āœ… Successfully pulled image: {registry_image}") + print(f"{'='*80}") return registry_image except Exception as e: @@ -542,7 +548,14 @@ def run_container(self, model_info: typing.Dict, docker_image: str, print(f"Docker options: {docker_options}") # set timeout - print(f"Setting timeout to {str(timeout)} seconds.") + print(f"ā° Setting timeout to {str(timeout)} seconds.") + + print(f"\nšŸƒ Starting Docker container execution...") + print(f"šŸ·ļø Image: {docker_image}") + print(f"šŸ“¦ Container: {container_name}") + print(f"šŸ“ Log file: {log_file_path}") + print(f"šŸŽ® GPU Vendor: {gpu_vendor}") + print(f"{'='*80}") # Run the container with logging try: @@ -554,13 +567,15 @@ def run_container(self, model_info: typing.Dict, docker_image: str, # Check user whoami = model_docker.sh("whoami") - print(f"USER is {whoami}") + print(f"šŸ‘¤ Running as user: {whoami}") # Show GPU info if gpu_vendor.find("AMD") != -1: + print(f"šŸŽ® Checking AMD GPU status...") smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true") print(smi) elif gpu_vendor.find("NVIDIA") != -1: + print(f"šŸŽ® Checking NVIDIA GPU status...") smi = model_docker.sh("/usr/bin/nvidia-smi || true") print(smi) diff --git a/src/madengine/tools/csv_to_html.py b/src/madengine/tools/csv_to_html.py index 5a27952a..2bbcc38d 100644 --- a/src/madengine/tools/csv_to_html.py +++ b/src/madengine/tools/csv_to_html.py @@ -30,7 +30,17 @@ def convert_csv_to_html(file_path: str): output_name += file_name + ".html" # read csv df = pd.read_csv(file_path) - print(df) + + # Use beautiful formatting for dataframe display + try: + from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"Converting CSV: {file_name}") + except ImportError: + # Fallback to basic formatting if utils not available + 
print(f"\nšŸ“Š Converting CSV: {file_name}") + print("="*80) + print(df.to_string(max_rows=20, max_cols=10)) + print("="*80) # Use the .to_html() to get your table in html df_html = df.to_html(index=False) @@ -67,7 +77,17 @@ def run(self): # read csv df = pd.read_csv(file_path) - print(df) + + # Use beautiful formatting for dataframe display + try: + from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"CSV Data from {file_name}") + except ImportError: + # Fallback to basic formatting if utils not available + print(f"\nšŸ“Š CSV Data from {file_name}") + print("="*80) + print(df.to_string(max_rows=20, max_cols=10)) + print("="*80) # Use the .to_html() to get your table in html df_html = df.to_html(index=False) diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 23190e5b..90eed423 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -91,8 +91,11 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, Returns: dict: Build information including image name, build duration, etc. 
""" - print(f"Building Docker image for model {model_info['name']} from {dockerfile}") - print(f"Building Docker image...") + print(f"\nšŸ”Ø Starting Docker build for model: {model_info['name']}") + print(f"šŸ“ Dockerfile: {dockerfile}") + print(f"šŸ·ļø Target image: {docker_image}") + print(f"šŸ“ Build log: {log_file_path}") + print(f"{'='*80}") # Generate image name image_docker_name = ( @@ -115,9 +118,6 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, # Replace / with _ in log file path (already done above, but keeping for safety) log_file_path = log_file_path.replace("/", "_") - print(f"Processing Dockerfile: {dockerfile}") - print(f"Build log will be written to: {log_file_path}") - # Get docker context docker_context = self.get_context_path(model_info) @@ -148,13 +148,15 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, # Execute build with log redirection with open(log_file_path, mode="w", buffering=1) as outlog: with redirect_stdout(PythonicTee(outlog, self.live_output)), redirect_stderr(PythonicTee(outlog, self.live_output)): - print(f"Executing: {build_command}") + print(f"šŸ”Ø Executing build command...") self.console.sh(build_command, timeout=None) build_duration = time.time() - build_start_time - print(f"Build Duration: {build_duration} seconds") - print(f"MAD_CONTAINER_IMAGE is {docker_image}") + print(f"ā±ļø Build Duration: {build_duration:.2f} seconds") + print(f"šŸ·ļø MAD_CONTAINER_IMAGE is {docker_image}") + print(f"āœ… Docker build completed successfully") + print(f"{'='*80}") # Get base docker info base_docker = "" @@ -294,15 +296,18 @@ def push_image(self, docker_image: str, registry: str = None, credentials: typin # Tag the image if different from local name if registry_image != docker_image: tag_command = f"docker tag {docker_image} {registry_image}" - print(f"Tagging image: {tag_command}") + print(f"šŸ·ļø Tagging image: {tag_command}") self.console.sh(tag_command) # Push the image push_command = 
def print_perf(self):
    """Print a framed summary of this model's performance results.

    Emits the model name, measured performance with its metric, and status;
    machine name and GPU architecture are included only when set.
    """
    rule = "=" * 60
    report = [
        "",
        rule,
        "šŸ“Š PERFORMANCE RESULTS",
        rule,
        f"šŸ·ļø Model: {self.model}",
        f"⚔ Performance: {self.performance} {self.metric}",
        f"šŸ“ˆ Status: {self.status}",
    ]
    if self.machine_name:
        report.append(f"šŸ–„ļø Machine: {self.machine_name}")
    if self.gpu_architecture:
        report.append(f"šŸŽ® GPU Architecture: {self.gpu_architecture}")
    report.append(rule + "\n")
    # One joined write produces byte-identical output to line-by-line prints.
    print("\n".join(report))
and exception_result if multiple_results: + print("šŸ”„ Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, multiple_results, @@ -208,17 +213,22 @@ def update_perf_csv( model_name, ) elif single_result: + print("šŸ”„ Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, single_result) elif exception_result: + print("āš ļø Processing exception result...") perf_csv_df = handle_exception_result( perf_csv_df, exception_result ) else: - print("No results to update in perf.csv") + print("ā„¹ļø No results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. perf_csv_df.to_csv(perf_csv, index=False) + print(f"āœ… Successfully updated: {perf_csv}") + print("="*80 + "\n") + perf_csv_df.to_csv(perf_csv, index=False) class UpdatePerfCsv: @@ -238,12 +248,17 @@ def __init__(self, args: argparse.Namespace): def run(self): """Update the performance csv file with the latest performance data.""" - print(f"Updating performance metrics of models perf.csv to database") + print("\n" + "="*80) + print("šŸ“Š UPDATING PERFORMANCE METRICS DATABASE") + print("="*80) + print(f"šŸ“‚ Processing: {self.args.perf_csv}") + # read perf.csv perf_csv_df = df_strip_columns(pd.read_csv(self.args.perf_csv)) # handle multiple_results, single_result, and exception_result if self.args.multiple_results: + print("šŸ”„ Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, self.args.multiple_results, @@ -251,17 +266,22 @@ def run(self): self.args.model_name, ) elif self.args.single_result: + print("šŸ”„ Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, self.args.single_result) elif self.args.exception_result: + print("āš ļø Processing exception result...") perf_csv_df = handle_exception_result( perf_csv_df, self.args.exception_result ) else: - print("No results to update in perf.csv") + print("ā„¹ļø No 
results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. perf_csv_df.to_csv(self.args.perf_csv, index=False) + + print(f"āœ… Successfully updated: {self.args.perf_csv}") + print("="*80 + "\n") self.return_status = True return self.return_status diff --git a/src/madengine/utils/log_formatting.py b/src/madengine/utils/log_formatting.py new file mode 100644 index 00000000..99803a3b --- /dev/null +++ b/src/madengine/utils/log_formatting.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +Utility functions for formatting and displaying data in logs. + +This module provides enhanced formatting utilities for better log readability, +including dataframe formatting and other display utilities. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pandas as pd +import typing +from rich.table import Table +from rich.console import Console as RichConsole +from rich.text import Text + + +def format_dataframe_for_log(df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20, max_cols: int = 10) -> str: + """ + Format a pandas DataFrame for beautiful log output. 
+ + Args: + df: The pandas DataFrame to format + title: Title for the dataframe display + max_rows: Maximum number of rows to display + max_cols: Maximum number of columns to display + + Returns: + str: Beautifully formatted string representation of the DataFrame + """ + if df.empty: + return f"\nšŸ“Š {title}\n{'='*60}\nāŒ DataFrame is empty\n{'='*60}\n" + + # Truncate if necessary + display_df = df.copy() + truncated_rows = False + truncated_cols = False + + if len(df) > max_rows: + display_df = display_df.head(max_rows) + truncated_rows = True + + if len(df.columns) > max_cols: + display_df = display_df.iloc[:, :max_cols] + truncated_cols = True + + # Create header + header = f"\nšŸ“Š {title}\n" + header += f"{'='*80}\n" + header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns\n" + + if truncated_rows or truncated_cols: + header += "āš ļø Display truncated: " + if truncated_rows: + header += f"showing first {max_rows} rows " + if truncated_cols: + header += f"showing first {max_cols} columns" + header += "\n" + + header += f"{'='*80}\n" + + # Format the DataFrame with nice styling + formatted_df = display_df.to_string( + index=True, + max_rows=max_rows, + max_cols=max_cols, + width=None, + float_format='{:.4f}'.format + ) + + # Add some visual separators + footer = f"\n{'='*80}\n" + + return header + formatted_df + footer + + +def format_dataframe_rich(df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20) -> None: + """ + Display a pandas DataFrame using Rich formatting for enhanced readability. 
+ + Args: + df: The pandas DataFrame to display + title: Title for the table + max_rows: Maximum number of rows to display + """ + console = RichConsole() + + if df.empty: + console.print(f"šŸ“Š [bold cyan]{title}[/bold cyan]: [red]DataFrame is empty[/red]") + return + + # Create Rich table + table = Table(title=f"šŸ“Š {title}", show_header=True, header_style="bold magenta") + + # Add index column + table.add_column("Index", style="dim", width=8) + + # Add data columns + for col in df.columns: + table.add_column(str(col), style="cyan") + + # Add rows (truncate if necessary) + display_rows = min(len(df), max_rows) + for i in range(display_rows): + row_data = [str(df.index[i])] + for col in df.columns: + value = df.iloc[i][col] + if pd.isna(value): + row_data.append("[dim]NaN[/dim]") + elif isinstance(value, float): + row_data.append(f"{value:.4f}") + else: + row_data.append(str(value)) + table.add_row(*row_data) + + # Show truncation info + if len(df) > max_rows: + table.add_row(*["..." for _ in range(len(df.columns) + 1)]) + console.print(f"[yellow]āš ļø Showing first {max_rows} of {len(df)} rows[/yellow]") + + console.print(table) + console.print(f"[green]✨ DataFrame shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns[/green]") + + +def print_dataframe_beautiful(df: pd.DataFrame, title: str = "Data", use_rich: bool = True) -> None: + """ + Print a pandas DataFrame with beautiful formatting. 
+ + Args: + df: The pandas DataFrame to print + title: Title for the display + use_rich: Whether to use Rich formatting (if available) or fall back to simple formatting + """ + try: + if use_rich: + format_dataframe_rich(df, title) + else: + raise ImportError("Fallback to simple formatting") + except (ImportError, Exception): + # Fallback to simple but nice formatting + formatted_output = format_dataframe_for_log(df, title) + print(formatted_output) + + +def highlight_log_section(title: str, content: str, style: str = "info") -> str: + """ + Create a highlighted log section with borders and styling. + + Args: + title: Section title + content: Section content + style: Style type ('info', 'success', 'warning', 'error') + + Returns: + str: Formatted log section + """ + styles = { + 'info': {'emoji': 'ā„¹ļø', 'border': '-'}, + 'success': {'emoji': 'āœ…', 'border': '='}, + 'warning': {'emoji': 'āš ļø', 'border': '!'}, + 'error': {'emoji': 'āŒ', 'border': '#'} + } + + style_config = styles.get(style, styles['info']) + emoji = style_config['emoji'] + border_char = style_config['border'] + + border = border_char * 80 + header = f"\n{border}\n{emoji} {title.upper()}\n{border}" + footer = f"{border}\n" + + return f"{header}\n{content}\n{footer}" From a4b324ff7fcb8c2815a4c9638a468a4b283ba14d Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 17:13:43 -0400 Subject: [PATCH 3/9] Fix the error local variable docker_image referenced before assignment --- src/madengine/tools/docker_builder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 90eed423..26183433 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -91,13 +91,7 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, Returns: dict: Build information including image name, build duration, etc. 
""" - print(f"\nšŸ”Ø Starting Docker build for model: {model_info['name']}") - print(f"šŸ“ Dockerfile: {dockerfile}") - print(f"šŸ·ļø Target image: {docker_image}") - print(f"šŸ“ Build log: {log_file_path}") - print(f"{'='*80}") - - # Generate image name + # Generate image name first image_docker_name = ( model_info["name"].replace("/", "_").lower() + "_" @@ -118,6 +112,12 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, # Replace / with _ in log file path (already done above, but keeping for safety) log_file_path = log_file_path.replace("/", "_") + print(f"\nšŸ”Ø Starting Docker build for model: {model_info['name']}") + print(f"šŸ“ Dockerfile: {dockerfile}") + print(f"šŸ·ļø Target image: {docker_image}") + print(f"šŸ“ Build log: {log_file_path}") + print(f"{'='*80}") + # Get docker context docker_context = self.get_context_path(model_info) From ebfb472d6afccfa241775a447a0937f008a5c750 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 17:38:49 -0400 Subject: [PATCH 4/9] Updated the perf dataframe output --- src/madengine/utils/log_formatting.py | 83 +++++++++++++++++---------- 1 file changed, 54 insertions(+), 29 deletions(-) diff --git a/src/madengine/utils/log_formatting.py b/src/madengine/utils/log_formatting.py index 99803a3b..26daae7b 100644 --- a/src/madengine/utils/log_formatting.py +++ b/src/madengine/utils/log_formatting.py @@ -31,31 +31,41 @@ def format_dataframe_for_log(df: pd.DataFrame, title: str = "DataFrame", max_row if df.empty: return f"\nšŸ“Š {title}\n{'='*60}\nāŒ DataFrame is empty\n{'='*60}\n" - # Truncate if necessary - display_df = df.copy() + # Define key columns to display for performance results + key_columns = [ + "model", "n_gpus", "docker_file", "machine_name", "gpu_architecture", + "performance", "metric", "status", "dataname" + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + 
display_df = df[available_columns].copy() + total_columns_note = f"(showing {len(available_columns)} of {len(df.columns)} columns)" + else: + # If no key columns found, show all columns as fallback with truncation + display_df = df.copy() + total_columns_note = f"(showing all {len(df.columns)} columns)" + if len(df.columns) > max_cols: + display_df = display_df.iloc[:, :max_cols] + total_columns_note = f"(showing first {max_cols} of {len(df.columns)} columns)" + + # Truncate rows if necessary truncated_rows = False - truncated_cols = False - - if len(df) > max_rows: + if len(display_df) > max_rows: display_df = display_df.head(max_rows) truncated_rows = True - if len(df.columns) > max_cols: - display_df = display_df.iloc[:, :max_cols] - truncated_cols = True - # Create header - header = f"\nšŸ“Š {title}\n" + header = f"\nšŸ“Š {title} {total_columns_note}\n" header += f"{'='*80}\n" - header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns\n" + if available_columns: + header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {len(available_columns)} key columns (total: {df.shape[1]} columns)\n" + else: + header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns\n" - if truncated_rows or truncated_cols: - header += "āš ļø Display truncated: " - if truncated_rows: - header += f"showing first {max_rows} rows " - if truncated_cols: - header += f"showing first {max_cols} columns" - header += "\n" + if truncated_rows: + header += f"āš ļø Display truncated: showing first {max_rows} rows\n" header += f"{'='*80}\n" @@ -63,7 +73,6 @@ def format_dataframe_for_log(df: pd.DataFrame, title: str = "DataFrame", max_row formatted_df = display_df.to_string( index=True, max_rows=max_rows, - max_cols=max_cols, width=None, float_format='{:.4f}'.format ) @@ -89,22 +98,38 @@ def format_dataframe_rich(df: pd.DataFrame, title: str = "DataFrame", max_rows: console.print(f"šŸ“Š [bold cyan]{title}[/bold cyan]: [red]DataFrame is empty[/red]") return + # Define key columns to display for 
performance results + key_columns = [ + "model", "n_gpus", "machine_name", "gpu_architecture", + "performance", "metric", "status", "dataname" + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + display_df = df[available_columns] + total_columns_note = f"(showing {len(available_columns)} of {len(df.columns)} columns)" + else: + # If no key columns found, show all columns as fallback + display_df = df + total_columns_note = f"(showing all {len(df.columns)} columns)" + # Create Rich table - table = Table(title=f"šŸ“Š {title}", show_header=True, header_style="bold magenta") + table = Table(title=f"šŸ“Š {title} {total_columns_note}", show_header=True, header_style="bold magenta") # Add index column table.add_column("Index", style="dim", width=8) # Add data columns - for col in df.columns: + for col in display_df.columns: table.add_column(str(col), style="cyan") # Add rows (truncate if necessary) - display_rows = min(len(df), max_rows) + display_rows = min(len(display_df), max_rows) for i in range(display_rows): - row_data = [str(df.index[i])] - for col in df.columns: - value = df.iloc[i][col] + row_data = [str(display_df.index[i])] + for col in display_df.columns: + value = display_df.iloc[i][col] if pd.isna(value): row_data.append("[dim]NaN[/dim]") elif isinstance(value, float): @@ -114,12 +139,12 @@ def format_dataframe_rich(df: pd.DataFrame, title: str = "DataFrame", max_rows: table.add_row(*row_data) # Show truncation info - if len(df) > max_rows: - table.add_row(*["..." for _ in range(len(df.columns) + 1)]) - console.print(f"[yellow]āš ļø Showing first {max_rows} of {len(df)} rows[/yellow]") + if len(display_df) > max_rows: + table.add_row(*["..." 
for _ in range(len(display_df.columns) + 1)]) + console.print(f"[yellow]āš ļø Showing first {max_rows} of {len(display_df)} rows[/yellow]") console.print(table) - console.print(f"[green]✨ DataFrame shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns[/green]") + console.print(f"[green]✨ DataFrame shape: {df.shape[0]} rows Ɨ {len(available_columns)} key columns (total: {df.shape[1]} columns)[/green]") def print_dataframe_beautiful(df: pd.DataFrame, title: str = "Data", use_rich: bool = True) -> None: From e47572eb4feb864a50c873c88cc4d899e4b5d01f Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 18:16:29 -0400 Subject: [PATCH 5/9] The fixes are backward compatible and maintain existing functionality for truly successful runs while correctly identifying and handling various failure scenarios. --- src/madengine/tools/container_runner.py | 46 ++++++++++++++++++- .../tools/distributed_orchestrator.py | 22 ++++++--- 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/madengine/tools/container_runner.py b/src/madengine/tools/container_runner.py index 0f56b373..f3ab0da5 100644 --- a/src/madengine/tools/container_runner.py +++ b/src/madengine/tools/container_runner.py @@ -706,8 +706,50 @@ def run_container(self, model_info: typing.Dict, docker_image: str, except Exception as e: print(f"Warning: Could not extract performance metrics: {e}") - # Set status based on performance - run_results["status"] = 'SUCCESS' if run_results.get("performance") else 'FAILURE' + # Set status based on performance and error patterns + # First check for obvious failure patterns in the logs + try: + # Check for common failure patterns in the log file + error_patterns = [ + "OutOfMemoryError", "HIP out of memory", "CUDA out of memory", + "RuntimeError", "AssertionError", "ValueError", "SystemExit", + "failed (exitcode:", "Traceback (most recent call last):", + "Error:", "FAILED", "Exception:" + ] + + has_errors = False + if log_file_path and 
os.path.exists(log_file_path): + try: + # Check for error patterns in the log + for pattern in error_patterns: + error_check_cmd = f"grep -q '{pattern}' {log_file_path} && echo 'FOUND' || echo 'NOT_FOUND'" + result = self.console.sh(error_check_cmd, canFail=True) + if result.strip() == "FOUND": + has_errors = True + print(f"Found error pattern '{pattern}' in logs") + break + except Exception: + pass # Error checking is optional + + # Status logic: Must have performance AND no errors to be considered success + performance_value = run_results.get("performance") + has_performance = performance_value and performance_value.strip() and performance_value.strip() != "N/A" + + if has_errors: + run_results["status"] = 'FAILURE' + print(f"Status: FAILURE (error patterns detected in logs)") + elif has_performance: + run_results["status"] = 'SUCCESS' + print(f"Status: SUCCESS (performance metrics found, no errors)") + else: + run_results["status"] = 'FAILURE' + print(f"Status: FAILURE (no performance metrics)") + + except Exception as e: + print(f"Warning: Error in status determination: {e}") + # Fallback to simple performance check + run_results["status"] = 'SUCCESS' if run_results.get("performance") else 'FAILURE' + print(f"{model_info['name']} performance is {run_results.get('performance', 'N/A')} {run_results.get('metric', '')}") # Generate performance results and update perf.csv diff --git a/src/madengine/tools/distributed_orchestrator.py b/src/madengine/tools/distributed_orchestrator.py index c6246c4c..d21a9a0d 100644 --- a/src/madengine/tools/distributed_orchestrator.py +++ b/src/madengine/tools/distributed_orchestrator.py @@ -311,10 +311,15 @@ def run_phase(self, manifest_file: str = "build_manifest.json", generate_sys_env_details=getattr(self.args, 'generate_sys_env_details', True) ) - execution_summary["successful_runs"].append(run_results) - execution_summary["total_execution_time"] += run_results.get("test_duration", 0) + # Add to appropriate list based on actual 
status + if run_results.get("status") == "SUCCESS": + execution_summary["successful_runs"].append(run_results) + print(f"Successfully completed: {model_info['name']} -> {run_results['status']}") + else: + execution_summary["failed_runs"].append(run_results) + print(f"Failed to complete: {model_info['name']} -> {run_results['status']}") - print(f"Successfully completed: {model_info['name']} -> {run_results['status']}") + execution_summary["total_execution_time"] += run_results.get("test_duration", 0) except Exception as e: print(f"Failed to run {model_info['name']} with image {image_name}: {e}") @@ -404,10 +409,15 @@ def run_phase(self, manifest_file: str = "build_manifest.json", generate_sys_env_details=getattr(self.args, 'generate_sys_env_details', True) ) - execution_summary["successful_runs"].append(run_results) - execution_summary["total_execution_time"] += run_results.get("test_duration", 0) + # Add to appropriate list based on actual status + if run_results.get("status") == "SUCCESS": + execution_summary["successful_runs"].append(run_results) + print(f"Successfully completed: {model_name} -> {run_results['status']}") + else: + execution_summary["failed_runs"].append(run_results) + print(f"Failed to complete: {model_name} -> {run_results['status']}") - print(f"Successfully completed: {model_name} -> {run_results['status']}") + execution_summary["total_execution_time"] += run_results.get("test_duration", 0) except Exception as e: print(f"Failed to run {model_name} with image {image_name}: {e}") From 3a73edca0bb30e98bd85f29bf6cc908d88541dd8 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 18:33:28 -0400 Subject: [PATCH 6/9] Fixed the problematic log --- src/madengine/tools/container_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/madengine/tools/container_runner.py b/src/madengine/tools/container_runner.py index f3ab0da5..7a41be53 100644 --- a/src/madengine/tools/container_runner.py +++ 
b/src/madengine/tools/container_runner.py @@ -720,9 +720,10 @@ def run_container(self, model_info: typing.Dict, docker_image: str, has_errors = False if log_file_path and os.path.exists(log_file_path): try: - # Check for error patterns in the log + # Check for error patterns in the log (exclude our own grep commands and output messages) for pattern in error_patterns: - error_check_cmd = f"grep -q '{pattern}' {log_file_path} && echo 'FOUND' || echo 'NOT_FOUND'" + # Use grep with -v to exclude our own commands and output to avoid false positives + error_check_cmd = f"grep -v -E '(grep -q.*{pattern}|Found error pattern.*{pattern})' {log_file_path} | grep -q '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'" result = self.console.sh(error_check_cmd, canFail=True) if result.strip() == "FOUND": has_errors = True From e1000a41e907c4ae11ce1617b1b417e14c98de19 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 19:07:21 -0400 Subject: [PATCH 7/9] Fixed the error pattern, removed the wrong string --- src/madengine/tools/container_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/madengine/tools/container_runner.py b/src/madengine/tools/container_runner.py index 7a41be53..4057ba93 100644 --- a/src/madengine/tools/container_runner.py +++ b/src/madengine/tools/container_runner.py @@ -713,8 +713,7 @@ def run_container(self, model_info: typing.Dict, docker_image: str, error_patterns = [ "OutOfMemoryError", "HIP out of memory", "CUDA out of memory", "RuntimeError", "AssertionError", "ValueError", "SystemExit", - "failed (exitcode:", "Traceback (most recent call last):", - "Error:", "FAILED", "Exception:" + "failed (exitcode:", "Error:", "FAILED", "Exception:" ] has_errors = False From 06934d3263c110adce6739f2d2f16b3e0658b394 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 11 Jul 2025 22:41:14 -0400 Subject: [PATCH 8/9] Fixed the error of test prof --- tests/test_distributed_integration.py | 40 +++++++++++++++++++-------- 1 file 
changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/test_distributed_integration.py b/tests/test_distributed_integration.py index daae5f67..4feaaf6d 100644 --- a/tests/test_distributed_integration.py +++ b/tests/test_distributed_integration.py @@ -774,8 +774,8 @@ def test_distributed_profiling_tools_integration(self, mock_exists, mock_data, m # Mock successful container run mock_run_container.return_value = { - "model": "dummy", - "status": "success", + "model": "dummy_prof", + "status": "SUCCESS", "test_duration": 30.5, "profiling_data": { "rocprof_output": "/tmp/rocprof/output.csv" @@ -785,22 +785,38 @@ def test_distributed_profiling_tools_integration(self, mock_exists, mock_data, m # Mock manifest with profiling tools manifest_with_profiling = { "built_images": { - "ci-dummy_profiling.ubuntu.amd": { - "docker_image": "ci-dummy_profiling.ubuntu.amd", + "ci-dummy_prof_dummy.ubuntu.amd": { + "docker_image": "ci-dummy_prof_dummy.ubuntu.amd", "dockerfile": "docker/dummy.ubuntu.amd.Dockerfile", - "build_duration": 45.2 + "base_docker": "rocm/pytorch", + "docker_sha": "sha256:47efe367d76c620ee828750fb294303f3f9f5fb6c184362a4741ce5e55ed3769", + "build_duration": 0.559730052947998, + "build_command": "docker build --network=host -t ci-dummy_prof_dummy.ubuntu.amd --pull -f docker/dummy.ubuntu.amd.Dockerfile ./docker", + "log_file": "dummy_prof_dummy.ubuntu.amd.build.live.log" } }, "built_models": { - "ci-dummy_profiling.ubuntu.amd": { - "name": "dummy_profiling", + "ci-dummy_prof_dummy.ubuntu.amd": { + "name": "dummy_prof", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_prof.sh", "n_gpus": "1", - "scripts": "scripts/dummy/run.sh", - "dockerfile": "docker/dummy.ubuntu.amd.Dockerfile", - "tags": ["dummy", "profiling"], - "tools": ["rocprof", "roctracer"] + "owner": "mmelesse@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" } - } + }, + "context": { + "docker_env_vars": {}, + "docker_mounts": {}, + "docker_build_arg": 
{}, + "gpu_vendor": "AMD", + "docker_gpus": "" + }, + "credentials_required": [] } with patch('builtins.open', mock_open(read_data=json.dumps(manifest_with_profiling))): From 59dd584cd9214c4e4b2aafb7184d5981d68d0ae5 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Sat, 12 Jul 2025 11:39:25 -0400 Subject: [PATCH 9/9] Updated the interface of mad_cli --- src/madengine/mad_cli.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index b08c7a36..7db910b4 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -458,27 +458,6 @@ def build( This command builds Docker images for the specified model tags and optionally pushes them to a registry. Additional context with gpu_vendor and guest_os is required for build-only operations. - - Batch Build Mode: - Use --batch-manifest to specify a batch.json file containing a list of models. - For each model with build_new=true, the image will be built. For all models - (regardless of build_new), entries will be created in the build_manifest.json. - - Example batch batch.json: - [ - { - "model_name": "dummy", - "build_new": false, - "registry_image": "rocm/mad-private:ci-dummy_dummy.ubuntu.amd", - "registry": "dockerhub" - }, - { - "model_name": "dummy2", - "build_new": true, - "registry_image": "", - "registry": "" - } - ] """ setup_logging(verbose)