diff --git a/src/madengine/core/console.py b/src/madengine/core/console.py index 9340924a..e25a1eba 100644 --- a/src/madengine/core/console.py +++ b/src/madengine/core/console.py @@ -8,6 +8,7 @@ # built-in modules import subprocess import typing +import re # third-party modules import typing_extensions @@ -33,6 +34,73 @@ def __init__( self.shellVerbose = shellVerbose self.live_output = live_output + def _highlight_docker_operations(self, command: str) -> str: + """Highlight docker push/pull/build/run operations for better visibility. + + Args: + command (str): The command to potentially highlight. + + Returns: + str: The highlighted command if it's a docker operation. + """ + # Check if this is a docker operation + docker_push_pattern = r'^docker\s+push\s+' + docker_pull_pattern = r'^docker\s+pull\s+' + docker_build_pattern = r'^docker\s+build\s+' + docker_run_pattern = r'^docker\s+run\s+' + + if re.match(docker_push_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸš€ DOCKER PUSH OPERATION: {command}\n{'='*80}" + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸ“„ DOCKER PULL OPERATION: {command}\n{'='*80}" + elif re.match(docker_build_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸ”Ø DOCKER BUILD OPERATION: {command}\n{'='*80}" + elif re.match(docker_run_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\nšŸƒ DOCKER RUN OPERATION: {command}\n{'='*80}" + + return command + + def _show_docker_completion(self, command: str, success: bool = True) -> None: + """Show completion message for docker operations. + + Args: + command (str): The command that was executed. + success (bool): Whether the operation was successful. 
+ """ + docker_push_pattern = r'^docker\s+push\s+' + docker_pull_pattern = r'^docker\s+pull\s+' + docker_build_pattern = r'^docker\s+build\s+' + docker_run_pattern = r'^docker\s+run\s+' + + if re.match(docker_push_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER PUSH COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER PUSH FAILED") + print(f"{'='*80}\n") + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER PULL COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER PULL FAILED") + print(f"{'='*80}\n") + elif re.match(docker_build_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER BUILD COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER BUILD FAILED") + print(f"{'='*80}\n") + elif re.match(docker_run_pattern, command, re.IGNORECASE): + if success: + print(f"āœ… DOCKER RUN COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"āŒ DOCKER RUN FAILED") + print(f"{'='*80}\n") + def sh( self, command: str, @@ -60,7 +128,8 @@ def sh( """ # Print the command if shellVerbose is True if self.shellVerbose and not secret: - print("> " + command, flush=True) + highlighted_command = self._highlight_docker_operations(command) + print("> " + highlighted_command, flush=True) # Run the shell command proc = subprocess.Popen( @@ -91,6 +160,12 @@ def sh( raise RuntimeError("Console script timeout") from exc # Check for failure + success = proc.returncode == 0 + + # Show docker operation completion status + if not secret: + self._show_docker_completion(command, success) + if proc.returncode != 0: if not canFail: if not secret: diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index ac4527ed..7db910b4 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -119,6 +119,56 @@ def __init__(self, **kwargs): return Args(**kwargs) +def process_batch_manifest(batch_manifest_file: str) -> Dict[str, 
List[str]]: + """Process batch manifest file and extract model tags based on build_new flag. + + Args: + batch_manifest_file: Path to the input batch.json file + + Returns: + Dict containing 'build_tags' and 'all_tags' lists + + Raises: + FileNotFoundError: If the manifest file doesn't exist + ValueError: If the manifest format is invalid + """ + if not os.path.exists(batch_manifest_file): + raise FileNotFoundError(f"Batch manifest file not found: {batch_manifest_file}") + + try: + with open(batch_manifest_file, 'r') as f: + manifest_data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in batch manifest file: {e}") + + if not isinstance(manifest_data, list): + raise ValueError("Batch manifest must be a list of model objects") + + build_tags = [] # Models that need to be built (build_new=true) + all_tags = [] # All models in the manifest + + for i, model in enumerate(manifest_data): + if not isinstance(model, dict): + raise ValueError(f"Model entry {i} must be a dictionary") + + if "model_name" not in model: + raise ValueError(f"Model entry {i} missing required 'model_name' field") + + model_name = model["model_name"] + build_new = model.get("build_new", False) + + all_tags.append(model_name) + if build_new: + build_tags.append(model_name) + + return { + "build_tags": build_tags, + "all_tags": all_tags, + "manifest_data": manifest_data + } + + + def validate_additional_context( additional_context: str, additional_context_file: Optional[str] = None, @@ -219,6 +269,127 @@ def save_summary_with_feedback(summary: Dict, output_path: Optional[str], summar raise typer.Exit(ExitCode.FAILURE) +def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, registry: Optional[str]) -> None: + """Process batch manifest and add entries for all models to build_manifest.json. 
+ + Args: + batch_data: Processed batch manifest data + manifest_output: Path to the build manifest file + registry: Registry used for the build + """ + from madengine.tools.discover_models import DiscoverModels + + # Load the existing build manifest + if os.path.exists(manifest_output): + with open(manifest_output, 'r') as f: + build_manifest = json.load(f) + else: + # Create a minimal manifest structure + build_manifest = { + "built_images": {}, + "built_models": {}, + "context": {}, + "credentials_required": [], + "registry": registry or "" + } + + # Process each model in the batch manifest + for model_entry in batch_data["manifest_data"]: + model_name = model_entry["model_name"] + build_new = model_entry.get("build_new", False) + model_registry_image = model_entry.get("registry_image", "") + model_registry = model_entry.get("registry", "") + + # If the model was not built (build_new=false), create an entry for it + if not build_new: + # Find the model configuration by discovering models with this tag + try: + # Create a temporary args object to discover the model + temp_args = create_args_namespace( + tags=[model_name], + registry=registry, + additional_context="{}", + additional_context_file=None, + clean_docker_cache=False, + manifest_output=manifest_output, + live_output=False, + output="perf.csv", + ignore_deprecated_flag=False, + data_config_file_name="data.json", + tools_json_file_name="scripts/common/tools.json", + generate_sys_env_details=True, + force_mirror_local=None, + disable_skip_gpu_arch=False, + verbose=False, + _separate_phases=True, + ) + + discover_models = DiscoverModels(args=temp_args) + models = discover_models.run() + + for model_info in models: + if model_info["name"] == model_name: + # Create a synthetic image name for this model + synthetic_image_name = f"ci-{model_name}_{model_name}.ubuntu.amd" + + # Add to built_images (even though it wasn't actually built) + build_manifest["built_images"][synthetic_image_name] = { + "docker_image": 
synthetic_image_name, + "dockerfile": model_info.get("dockerfile", f"docker/{model_name}"), + "base_docker": "rocm/pytorch", # Default base + "docker_sha": "", # No SHA since not built + "build_duration": 0, + "build_command": f"# Skipped build for {model_name} (build_new=false)", + "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log", + "registry_image": model_registry_image or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" if model_registry_image or model_registry or registry else "" + } + + # Add to built_models + build_manifest["built_models"][synthetic_image_name] = { + "name": model_info["name"], + "dockerfile": model_info.get("dockerfile", f"docker/{model_name}"), + "scripts": model_info.get("scripts", f"scripts/{model_name}/run.sh"), + "n_gpus": model_info.get("n_gpus", "1"), + "owner": model_info.get("owner", ""), + "training_precision": model_info.get("training_precision", ""), + "tags": model_info.get("tags", []), + "args": model_info.get("args", ""), + "cred": model_info.get("cred", "") + } + break + + except Exception as e: + console.print(f"Warning: Could not process model {model_name}: {e}") + # Create a minimal entry anyway + synthetic_image_name = f"ci-{model_name}_{model_name}.ubuntu.amd" + build_manifest["built_images"][synthetic_image_name] = { + "docker_image": synthetic_image_name, + "dockerfile": f"docker/{model_name}", + "base_docker": "rocm/pytorch", + "docker_sha": "", + "build_duration": 0, + "build_command": f"# Skipped build for {model_name} (build_new=false)", + "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log", + "registry_image": model_registry_image or "" + } + build_manifest["built_models"][synthetic_image_name] = { + "name": model_name, + "dockerfile": f"docker/{model_name}", + "scripts": f"scripts/{model_name}/run.sh", + "n_gpus": "1", + "owner": "", + "training_precision": "", + "tags": [], + "args": "" + } + + # Save the updated manifest + with open(manifest_output, 
'w') as f: + json.dump(build_manifest, f, indent=2) + + console.print(f"āœ… Added entries for all models from batch manifest to {manifest_output}") + + def display_results_table(summary: Dict, title: str) -> None: """Display results in a formatted table.""" table = Table(title=title, show_header=True, header_style="bold magenta") @@ -265,6 +436,7 @@ def get_display_names(items, limit=5): def build( tags: Annotated[List[str], typer.Option("--tags", "-t", help="Model tags to build (can specify multiple)")] = [], registry: Annotated[Optional[str], typer.Option("--registry", "-r", help="Docker registry to push images to")] = None, + batch_manifest: Annotated[Optional[str], typer.Option("--batch-manifest", help="Input batch.json file for batch build mode")] = None, additional_context: Annotated[str, typer.Option("--additional-context", "-c", help="Additional context as JSON string")] = "{}", additional_context_file: Annotated[Optional[str], typer.Option("--additional-context-file", "-f", help="File containing additional context JSON")] = None, clean_docker_cache: Annotated[bool, typer.Option("--clean-docker-cache", help="Rebuild images without using cache")] = False, @@ -289,13 +461,38 @@ def build( """ setup_logging(verbose) - console.print(Panel( - f"šŸ”Ø [bold cyan]Building Models[/bold cyan]\n" - f"Tags: [yellow]{', '.join(tags) if tags else 'All models'}[/yellow]\n" - f"Registry: [yellow]{registry or 'Local only'}[/yellow]", - title="Build Configuration", - border_style="blue" - )) + # Validate mutually exclusive options + if batch_manifest and tags: + console.print("āŒ [bold red]Error: Cannot specify both --batch-manifest and --tags options[/bold red]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Process batch manifest if provided + batch_data = None + effective_tags = tags + if batch_manifest: + try: + batch_data = process_batch_manifest(batch_manifest) + effective_tags = batch_data["build_tags"] + console.print(Panel( + f"šŸ“¦ [bold cyan]Batch Build 
Mode[/bold cyan]\n" + f"Input manifest: [yellow]{batch_manifest}[/yellow]\n" + f"Total models: [yellow]{len(batch_data['all_tags'])}[/yellow]\n" + f"Models to build: [yellow]{len(batch_data['build_tags'])}[/yellow] ({', '.join(batch_data['build_tags']) if batch_data['build_tags'] else 'none'})\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Batch Build Configuration", + border_style="blue" + )) + except (FileNotFoundError, ValueError) as e: + console.print(f"āŒ [bold red]Error processing batch manifest: {e}[/bold red]") + raise typer.Exit(ExitCode.INVALID_ARGS) + else: + console.print(Panel( + f"šŸ”Ø [bold cyan]Building Models[/bold cyan]\n" + f"Tags: [yellow]{', '.join(tags) if tags else 'All models'}[/yellow]\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Build Configuration", + border_style="blue" + )) try: # Validate additional context @@ -303,7 +500,7 @@ def build( # Create arguments object args = create_args_namespace( - tags=tags, + tags=effective_tags, registry=registry, additional_context=additional_context, additional_context_file=additional_context_file, @@ -338,6 +535,12 @@ def build( ) progress.update(task, description="Build completed!") + # Handle batch manifest post-processing + if batch_data: + with console.status("Processing batch manifest..."): + _process_batch_manifest_entries(batch_data, manifest_output, registry) + + # Display results display_results_table(build_summary, "Build Results") diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index 66fb84ac..db504803 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -284,11 +284,23 @@ def dump_csv_output(self): fs.write(sys_config_info[j]) fs.write("\n") fs.close() - print ("OK: Dumped into {} file.".format(self.filename)) + print("\n" + "="*60) + 
print(f"āœ… SUCCESS: System config data dumped to {self.filename}") + print("="*60 + "\n") def print_csv_output(self): - print ("Printing the sys config info env variables...") + print("\n" + "="*80) + print("šŸ“‹ SYSTEM CONFIG INFO - ENVIRONMENT VARIABLES") + print("="*80) if self.sys_config_info_list: for j in range(len(self.sys_config_info_list)): line = self.sys_config_info_list[j] - print (line) + # Add some formatting for key-value pairs + if "|" in line and not line.startswith("Tag"): + key, value = line.split("|", 1) + print(f"šŸ”¹ {key:<30}: {value}") + else: + print(f"šŸ“Œ {line}") + else: + print("āŒ No system config information available") + print("="*80 + "\n") diff --git a/src/madengine/tools/container_runner.py b/src/madengine/tools/container_runner.py index f29ef9ea..4057ba93 100644 --- a/src/madengine/tools/container_runner.py +++ b/src/madengine/tools/container_runner.py @@ -211,15 +211,21 @@ def pull_image(self, registry_image: str, local_name: str = None, if registry and credentials: self.login_to_registry(registry, credentials) - print(f"Pulling image: {registry_image}") + print(f"\nšŸ“„ Starting docker pull from registry...") + print(f"šŸ“ Registry: {registry or 'Default'}") + print(f"šŸ·ļø Image: {registry_image}") try: self.console.sh(f"docker pull {registry_image}") if local_name: self.console.sh(f"docker tag {registry_image} {local_name}") - print(f"Tagged as: {local_name}") + print(f"šŸ·ļø Tagged as: {local_name}") + print(f"āœ… Successfully pulled and tagged image") + print(f"{'='*80}") return local_name + print(f"āœ… Successfully pulled image: {registry_image}") + print(f"{'='*80}") return registry_image except Exception as e: @@ -542,7 +548,14 @@ def run_container(self, model_info: typing.Dict, docker_image: str, print(f"Docker options: {docker_options}") # set timeout - print(f"Setting timeout to {str(timeout)} seconds.") + print(f"ā° Setting timeout to {str(timeout)} seconds.") + + print(f"\nšŸƒ Starting Docker container 
execution...") + print(f"šŸ·ļø Image: {docker_image}") + print(f"šŸ“¦ Container: {container_name}") + print(f"šŸ“ Log file: {log_file_path}") + print(f"šŸŽ® GPU Vendor: {gpu_vendor}") + print(f"{'='*80}") # Run the container with logging try: @@ -554,13 +567,15 @@ def run_container(self, model_info: typing.Dict, docker_image: str, # Check user whoami = model_docker.sh("whoami") - print(f"USER is {whoami}") + print(f"šŸ‘¤ Running as user: {whoami}") # Show GPU info if gpu_vendor.find("AMD") != -1: + print(f"šŸŽ® Checking AMD GPU status...") smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true") print(smi) elif gpu_vendor.find("NVIDIA") != -1: + print(f"šŸŽ® Checking NVIDIA GPU status...") smi = model_docker.sh("/usr/bin/nvidia-smi || true") print(smi) @@ -691,8 +706,50 @@ def run_container(self, model_info: typing.Dict, docker_image: str, except Exception as e: print(f"Warning: Could not extract performance metrics: {e}") - # Set status based on performance - run_results["status"] = 'SUCCESS' if run_results.get("performance") else 'FAILURE' + # Set status based on performance and error patterns + # First check for obvious failure patterns in the logs + try: + # Check for common failure patterns in the log file + error_patterns = [ + "OutOfMemoryError", "HIP out of memory", "CUDA out of memory", + "RuntimeError", "AssertionError", "ValueError", "SystemExit", + "failed (exitcode:", "Error:", "FAILED", "Exception:" + ] + + has_errors = False + if log_file_path and os.path.exists(log_file_path): + try: + # Check for error patterns in the log (exclude our own grep commands and output messages) + for pattern in error_patterns: + # Use grep with -v to exclude our own commands and output to avoid false positives + error_check_cmd = f"grep -v -E '(grep -q.*{pattern}|Found error pattern.*{pattern})' {log_file_path} | grep -q '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'" + result = self.console.sh(error_check_cmd, canFail=True) + if result.strip() == "FOUND": + 
has_errors = True + print(f"Found error pattern '{pattern}' in logs") + break + except Exception: + pass # Error checking is optional + + # Status logic: Must have performance AND no errors to be considered success + performance_value = run_results.get("performance") + has_performance = performance_value and performance_value.strip() and performance_value.strip() != "N/A" + + if has_errors: + run_results["status"] = 'FAILURE' + print(f"Status: FAILURE (error patterns detected in logs)") + elif has_performance: + run_results["status"] = 'SUCCESS' + print(f"Status: SUCCESS (performance metrics found, no errors)") + else: + run_results["status"] = 'FAILURE' + print(f"Status: FAILURE (no performance metrics)") + + except Exception as e: + print(f"Warning: Error in status determination: {e}") + # Fallback to simple performance check + run_results["status"] = 'SUCCESS' if run_results.get("performance") else 'FAILURE' + print(f"{model_info['name']} performance is {run_results.get('performance', 'N/A')} {run_results.get('metric', '')}") # Generate performance results and update perf.csv diff --git a/src/madengine/tools/csv_to_html.py b/src/madengine/tools/csv_to_html.py index 5a27952a..2bbcc38d 100644 --- a/src/madengine/tools/csv_to_html.py +++ b/src/madengine/tools/csv_to_html.py @@ -30,7 +30,17 @@ def convert_csv_to_html(file_path: str): output_name += file_name + ".html" # read csv df = pd.read_csv(file_path) - print(df) + + # Use beautiful formatting for dataframe display + try: + from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"Converting CSV: {file_name}") + except ImportError: + # Fallback to basic formatting if utils not available + print(f"\nšŸ“Š Converting CSV: {file_name}") + print("="*80) + print(df.to_string(max_rows=20, max_cols=10)) + print("="*80) # Use the .to_html() to get your table in html df_html = df.to_html(index=False) @@ -67,7 +77,17 @@ def run(self): # read csv df = pd.read_csv(file_path) - 
print(df) + + # Use beautiful formatting for dataframe display + try: + from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"CSV Data from {file_name}") + except ImportError: + # Fallback to basic formatting if utils not available + print(f"\nšŸ“Š CSV Data from {file_name}") + print("="*80) + print(df.to_string(max_rows=20, max_cols=10)) + print("="*80) # Use the .to_html() to get your table in html df_html = df.to_html(index=False) diff --git a/src/madengine/tools/distributed_orchestrator.py b/src/madengine/tools/distributed_orchestrator.py index c6246c4c..d21a9a0d 100644 --- a/src/madengine/tools/distributed_orchestrator.py +++ b/src/madengine/tools/distributed_orchestrator.py @@ -311,10 +311,15 @@ def run_phase(self, manifest_file: str = "build_manifest.json", generate_sys_env_details=getattr(self.args, 'generate_sys_env_details', True) ) - execution_summary["successful_runs"].append(run_results) - execution_summary["total_execution_time"] += run_results.get("test_duration", 0) + # Add to appropriate list based on actual status + if run_results.get("status") == "SUCCESS": + execution_summary["successful_runs"].append(run_results) + print(f"Successfully completed: {model_info['name']} -> {run_results['status']}") + else: + execution_summary["failed_runs"].append(run_results) + print(f"Failed to complete: {model_info['name']} -> {run_results['status']}") - print(f"Successfully completed: {model_info['name']} -> {run_results['status']}") + execution_summary["total_execution_time"] += run_results.get("test_duration", 0) except Exception as e: print(f"Failed to run {model_info['name']} with image {image_name}: {e}") @@ -404,10 +409,15 @@ def run_phase(self, manifest_file: str = "build_manifest.json", generate_sys_env_details=getattr(self.args, 'generate_sys_env_details', True) ) - execution_summary["successful_runs"].append(run_results) - execution_summary["total_execution_time"] += run_results.get("test_duration", 0) 
+ # Add to appropriate list based on actual status + if run_results.get("status") == "SUCCESS": + execution_summary["successful_runs"].append(run_results) + print(f"Successfully completed: {model_name} -> {run_results['status']}") + else: + execution_summary["failed_runs"].append(run_results) + print(f"Failed to complete: {model_name} -> {run_results['status']}") - print(f"Successfully completed: {model_name} -> {run_results['status']}") + execution_summary["total_execution_time"] += run_results.get("test_duration", 0) except Exception as e: print(f"Failed to run {model_name} with image {image_name}: {e}") diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 23190e5b..26183433 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -91,10 +91,7 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, Returns: dict: Build information including image name, build duration, etc. """ - print(f"Building Docker image for model {model_info['name']} from {dockerfile}") - print(f"Building Docker image...") - - # Generate image name + # Generate image name first image_docker_name = ( model_info["name"].replace("/", "_").lower() + "_" @@ -115,8 +112,11 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, # Replace / with _ in log file path (already done above, but keeping for safety) log_file_path = log_file_path.replace("/", "_") - print(f"Processing Dockerfile: {dockerfile}") - print(f"Build log will be written to: {log_file_path}") + print(f"\nšŸ”Ø Starting Docker build for model: {model_info['name']}") + print(f"šŸ“ Dockerfile: {dockerfile}") + print(f"šŸ·ļø Target image: {docker_image}") + print(f"šŸ“ Build log: {log_file_path}") + print(f"{'='*80}") # Get docker context docker_context = self.get_context_path(model_info) @@ -148,13 +148,15 @@ def build_image(self, model_info: typing.Dict, dockerfile: str, # Execute build with log redirection with 
open(log_file_path, mode="w", buffering=1) as outlog: with redirect_stdout(PythonicTee(outlog, self.live_output)), redirect_stderr(PythonicTee(outlog, self.live_output)): - print(f"Executing: {build_command}") + print(f"šŸ”Ø Executing build command...") self.console.sh(build_command, timeout=None) build_duration = time.time() - build_start_time - print(f"Build Duration: {build_duration} seconds") - print(f"MAD_CONTAINER_IMAGE is {docker_image}") + print(f"ā±ļø Build Duration: {build_duration:.2f} seconds") + print(f"šŸ·ļø MAD_CONTAINER_IMAGE is {docker_image}") + print(f"āœ… Docker build completed successfully") + print(f"{'='*80}") # Get base docker info base_docker = "" @@ -294,15 +296,18 @@ def push_image(self, docker_image: str, registry: str = None, credentials: typin # Tag the image if different from local name if registry_image != docker_image: tag_command = f"docker tag {docker_image} {registry_image}" - print(f"Tagging image: {tag_command}") + print(f"šŸ·ļø Tagging image: {tag_command}") self.console.sh(tag_command) # Push the image push_command = f"docker push {registry_image}" - print(f"Pushing image: {push_command}") + print(f"\nšŸš€ Starting docker push to registry...") + print(f"šŸ“¤ Registry: {registry}") + print(f"šŸ·ļø Image: {registry_image}") self.console.sh(push_command) - print(f"Successfully pushed image to registry: {registry_image}") + print(f"āœ… Successfully pushed image to registry: {registry_image}") + print(f"{'='*80}") return registry_image except Exception as e: diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index ddcc166d..cd2f3a46 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -118,7 +118,17 @@ def print_perf(self): Method to print stage perf results of a model. 
""" - print(f"{self.model} performance is {self.performance} {self.metric}") + print("\n" + "="*60) + print(f"šŸ“Š PERFORMANCE RESULTS") + print("="*60) + print(f"šŸ·ļø Model: {self.model}") + print(f"⚔ Performance: {self.performance} {self.metric}") + print(f"šŸ“ˆ Status: {self.status}") + if self.machine_name: + print(f"šŸ–„ļø Machine: {self.machine_name}") + if self.gpu_architecture: + print(f"šŸŽ® GPU Architecture: {self.gpu_architecture}") + print("="*60 + "\n") # Exports all info in json format to json_name # multiple_results excludes the info provided on csv diff --git a/src/madengine/tools/update_perf_csv.py b/src/madengine/tools/update_perf_csv.py index 09c267f1..f26da890 100644 --- a/src/madengine/tools/update_perf_csv.py +++ b/src/madengine/tools/update_perf_csv.py @@ -195,12 +195,17 @@ def update_perf_csv( model_name: typing.Optional[str] = None, ): """Update the performance csv file with the latest performance data.""" - print(f"Attaching performance metrics of models to perf.csv") + print("\n" + "="*80) + print("šŸ“ˆ ATTACHING PERFORMANCE METRICS TO DATABASE") + print("="*80) + print(f"šŸ“‚ Target file: {perf_csv}") + # read perf.csv perf_csv_df = df_strip_columns(pd.read_csv(perf_csv)) # handle multiple_results, single_result, and exception_result if multiple_results: + print("šŸ”„ Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, multiple_results, @@ -208,17 +213,22 @@ def update_perf_csv( model_name, ) elif single_result: + print("šŸ”„ Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, single_result) elif exception_result: + print("āš ļø Processing exception result...") perf_csv_df = handle_exception_result( perf_csv_df, exception_result ) else: - print("No results to update in perf.csv") + print("ā„¹ļø No results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. 
perf_csv_df.to_csv(perf_csv, index=False) + print(f"āœ… Successfully updated: {perf_csv}") + print("="*80 + "\n") + perf_csv_df.to_csv(perf_csv, index=False) class UpdatePerfCsv: @@ -238,12 +248,17 @@ def __init__(self, args: argparse.Namespace): def run(self): """Update the performance csv file with the latest performance data.""" - print(f"Updating performance metrics of models perf.csv to database") + print("\n" + "="*80) + print("šŸ“Š UPDATING PERFORMANCE METRICS DATABASE") + print("="*80) + print(f"šŸ“‚ Processing: {self.args.perf_csv}") + # read perf.csv perf_csv_df = df_strip_columns(pd.read_csv(self.args.perf_csv)) # handle multiple_results, single_result, and exception_result if self.args.multiple_results: + print("šŸ”„ Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, self.args.multiple_results, @@ -251,17 +266,22 @@ def run(self): self.args.model_name, ) elif self.args.single_result: + print("šŸ”„ Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, self.args.single_result) elif self.args.exception_result: + print("āš ļø Processing exception result...") perf_csv_df = handle_exception_result( perf_csv_df, self.args.exception_result ) else: - print("No results to update in perf.csv") + print("ā„¹ļø No results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. perf_csv_df.to_csv(self.args.perf_csv, index=False) + + print(f"āœ… Successfully updated: {self.args.perf_csv}") + print("="*80 + "\n") self.return_status = True return self.return_status diff --git a/src/madengine/utils/log_formatting.py b/src/madengine/utils/log_formatting.py new file mode 100644 index 00000000..26daae7b --- /dev/null +++ b/src/madengine/utils/log_formatting.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +""" +Utility functions for formatting and displaying data in logs. 
+ +This module provides enhanced formatting utilities for better log readability, +including dataframe formatting and other display utilities. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pandas as pd +import typing +from rich.table import Table +from rich.console import Console as RichConsole +from rich.text import Text + + +def format_dataframe_for_log(df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20, max_cols: int = 10) -> str: + """ + Format a pandas DataFrame for beautiful log output. + + Args: + df: The pandas DataFrame to format + title: Title for the dataframe display + max_rows: Maximum number of rows to display + max_cols: Maximum number of columns to display + + Returns: + str: Beautifully formatted string representation of the DataFrame + """ + if df.empty: + return f"\nšŸ“Š {title}\n{'='*60}\nāŒ DataFrame is empty\n{'='*60}\n" + + # Define key columns to display for performance results + key_columns = [ + "model", "n_gpus", "docker_file", "machine_name", "gpu_architecture", + "performance", "metric", "status", "dataname" + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + display_df = df[available_columns].copy() + total_columns_note = f"(showing {len(available_columns)} of {len(df.columns)} columns)" + else: + # If no key columns found, show all columns as fallback with truncation + display_df = df.copy() + total_columns_note = f"(showing all {len(df.columns)} columns)" + if len(df.columns) > max_cols: + display_df = display_df.iloc[:, :max_cols] + total_columns_note = f"(showing first {max_cols} of {len(df.columns)} columns)" + + # Truncate rows if necessary + truncated_rows = False + if len(display_df) > max_rows: + display_df = display_df.head(max_rows) + truncated_rows = True + + # Create header + header = f"\nšŸ“Š {title} {total_columns_note}\n" + header += f"{'='*80}\n" + if 
available_columns: + header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {len(available_columns)} key columns (total: {df.shape[1]} columns)\n" + else: + header += f"šŸ“ Shape: {df.shape[0]} rows Ɨ {df.shape[1]} columns\n" + + if truncated_rows: + header += f"āš ļø Display truncated: showing first {max_rows} rows\n" + + header += f"{'='*80}\n" + + # Format the DataFrame with nice styling + formatted_df = display_df.to_string( + index=True, + max_rows=max_rows, + width=None, + float_format='{:.4f}'.format + ) + + # Add some visual separators + footer = f"\n{'='*80}\n" + + return header + formatted_df + footer + + +def format_dataframe_rich(df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20) -> None: + """ + Display a pandas DataFrame using Rich formatting for enhanced readability. + + Args: + df: The pandas DataFrame to display + title: Title for the table + max_rows: Maximum number of rows to display + """ + console = RichConsole() + + if df.empty: + console.print(f"šŸ“Š [bold cyan]{title}[/bold cyan]: [red]DataFrame is empty[/red]") + return + + # Define key columns to display for performance results + key_columns = [ + "model", "n_gpus", "machine_name", "gpu_architecture", + "performance", "metric", "status", "dataname" + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + display_df = df[available_columns] + total_columns_note = f"(showing {len(available_columns)} of {len(df.columns)} columns)" + else: + # If no key columns found, show all columns as fallback + display_df = df + total_columns_note = f"(showing all {len(df.columns)} columns)" + + # Create Rich table + table = Table(title=f"šŸ“Š {title} {total_columns_note}", show_header=True, header_style="bold magenta") + + # Add index column + table.add_column("Index", style="dim", width=8) + + # Add data columns + for col in display_df.columns: + table.add_column(str(col), style="cyan") + + # Add 
rows (truncate if necessary) + display_rows = min(len(display_df), max_rows) + for i in range(display_rows): + row_data = [str(display_df.index[i])] + for col in display_df.columns: + value = display_df.iloc[i][col] + if pd.isna(value): + row_data.append("[dim]NaN[/dim]") + elif isinstance(value, float): + row_data.append(f"{value:.4f}") + else: + row_data.append(str(value)) + table.add_row(*row_data) + + # Show truncation info + if len(display_df) > max_rows: + table.add_row(*["..." for _ in range(len(display_df.columns) + 1)]) + console.print(f"[yellow]āš ļø Showing first {max_rows} of {len(display_df)} rows[/yellow]") + + console.print(table) + console.print(f"[green]✨ DataFrame shape: {df.shape[0]} rows Ɨ {len(available_columns)} key columns (total: {df.shape[1]} columns)[/green]") + + +def print_dataframe_beautiful(df: pd.DataFrame, title: str = "Data", use_rich: bool = True) -> None: + """ + Print a pandas DataFrame with beautiful formatting. + + Args: + df: The pandas DataFrame to print + title: Title for the display + use_rich: Whether to use Rich formatting (if available) or fall back to simple formatting + """ + try: + if use_rich: + format_dataframe_rich(df, title) + else: + raise ImportError("Fallback to simple formatting") + except (ImportError, Exception): + # Fallback to simple but nice formatting + formatted_output = format_dataframe_for_log(df, title) + print(formatted_output) + + +def highlight_log_section(title: str, content: str, style: str = "info") -> str: + """ + Create a highlighted log section with borders and styling. 
+ + Args: + title: Section title + content: Section content + style: Style type ('info', 'success', 'warning', 'error') + + Returns: + str: Formatted log section + """ + styles = { + 'info': {'emoji': 'ā„¹ļø', 'border': '-'}, + 'success': {'emoji': 'āœ…', 'border': '='}, + 'warning': {'emoji': 'āš ļø', 'border': '!'}, + 'error': {'emoji': 'āŒ', 'border': '#'} + } + + style_config = styles.get(style, styles['info']) + emoji = style_config['emoji'] + border_char = style_config['border'] + + border = border_char * 80 + header = f"\n{border}\n{emoji} {title.upper()}\n{border}" + footer = f"{border}\n" + + return f"{header}\n{content}\n{footer}" diff --git a/tests/test_distributed_integration.py b/tests/test_distributed_integration.py index daae5f67..4feaaf6d 100644 --- a/tests/test_distributed_integration.py +++ b/tests/test_distributed_integration.py @@ -774,8 +774,8 @@ def test_distributed_profiling_tools_integration(self, mock_exists, mock_data, m # Mock successful container run mock_run_container.return_value = { - "model": "dummy", - "status": "success", + "model": "dummy_prof", + "status": "SUCCESS", "test_duration": 30.5, "profiling_data": { "rocprof_output": "/tmp/rocprof/output.csv" @@ -785,22 +785,38 @@ def test_distributed_profiling_tools_integration(self, mock_exists, mock_data, m # Mock manifest with profiling tools manifest_with_profiling = { "built_images": { - "ci-dummy_profiling.ubuntu.amd": { - "docker_image": "ci-dummy_profiling.ubuntu.amd", + "ci-dummy_prof_dummy.ubuntu.amd": { + "docker_image": "ci-dummy_prof_dummy.ubuntu.amd", "dockerfile": "docker/dummy.ubuntu.amd.Dockerfile", - "build_duration": 45.2 + "base_docker": "rocm/pytorch", + "docker_sha": "sha256:47efe367d76c620ee828750fb294303f3f9f5fb6c184362a4741ce5e55ed3769", + "build_duration": 0.559730052947998, + "build_command": "docker build --network=host -t ci-dummy_prof_dummy.ubuntu.amd --pull -f docker/dummy.ubuntu.amd.Dockerfile ./docker", + "log_file": 
"dummy_prof_dummy.ubuntu.amd.build.live.log" } }, "built_models": { - "ci-dummy_profiling.ubuntu.amd": { - "name": "dummy_profiling", + "ci-dummy_prof_dummy.ubuntu.amd": { + "name": "dummy_prof", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_prof.sh", "n_gpus": "1", - "scripts": "scripts/dummy/run.sh", - "dockerfile": "docker/dummy.ubuntu.amd.Dockerfile", - "tags": ["dummy", "profiling"], - "tools": ["rocprof", "roctracer"] + "owner": "mmelesse@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" } - } + }, + "context": { + "docker_env_vars": {}, + "docker_mounts": {}, + "docker_build_arg": {}, + "gpu_vendor": "AMD", + "docker_gpus": "" + }, + "credentials_required": [] } with patch('builtins.open', mock_open(read_data=json.dumps(manifest_with_profiling))):