diff --git a/src/madengine/mad_cli.py b/src/madengine/mad_cli.py index 6f238276..577da662 100644 --- a/src/madengine/mad_cli.py +++ b/src/madengine/mad_cli.py @@ -283,14 +283,15 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi if os.path.exists(manifest_output): with open(manifest_output, 'r') as f: build_manifest = json.load(f) + # Remove top-level registry if present + build_manifest.pop("registry", None) else: # Create a minimal manifest structure build_manifest = { "built_images": {}, "built_models": {}, "context": {}, - "credentials_required": [], - "registry": registry or "" + "credentials_required": [] } # Process each model in the batch manifest @@ -341,7 +342,8 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi "build_duration": 0, "build_command": f"# Skipped build for {model_name} (build_new=false)", "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log", - "registry_image": model_registry_image or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" if model_registry_image or model_registry or registry else "" + "registry_image": model_registry_image or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" if model_registry_image or model_registry or registry else "", + "registry": model_registry or registry or "dockerhub" } # Add to built_models @@ -370,7 +372,8 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi "build_duration": 0, "build_command": f"# Skipped build for {model_name} (build_new=false)", "log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log", - "registry_image": model_registry_image or "" + "registry_image": model_registry_image or "", + "registry": model_registry or registry or "dockerhub" } build_manifest["built_models"][synthetic_image_name] = { "name": model_name, @@ -470,9 +473,17 @@ def build( batch_data = None effective_tags = tags batch_build_metadata = None + + # There are 2 scenarios for batch builds and single builds + # - Batch builds: Use the batch manifest to determine which models to build + # - Single builds: Use the tags directly if batch_manifest: + # Process the batch manifest + if verbose: console.print(f"[DEBUG] Processing batch manifest: {batch_manifest}") try: batch_data = process_batch_manifest(batch_manifest) + if verbose: console.print(f"[DEBUG] batch_data: {batch_data}") + effective_tags = batch_data["build_tags"] # Build a mapping of model_name -> registry_image/registry for build_new models batch_build_metadata = {} @@ -482,6 +493,8 @@ def build( "registry_image": model.get("registry_image"), "registry": model.get("registry") } + if verbose: console.print(f"[DEBUG] batch_build_metadata: {batch_build_metadata}") + console.print(Panel( f"ļæ½ [bold cyan]Batch Build Mode[/bold cyan]\n" f"Input manifest: [yellow]{batch_manifest}[/yellow]\n" @@ -538,12 +551,13 @@ def build( orchestrator = DistributedOrchestrator(args, build_only_mode=True) progress.update(task, description="Building models...") - # Pass batch_build_metadata to build_phase if present + # Prepare build phase arguments build_phase_kwargs = dict( registry=registry, clean_cache=clean_docker_cache, manifest_output=manifest_output ) + # Pass batch_build_metadata to build_phase if present if batch_build_metadata: build_phase_kwargs["batch_build_metadata"] = batch_build_metadata diff --git a/src/madengine/tools/distributed_orchestrator.py b/src/madengine/tools/distributed_orchestrator.py index 9234de9c..c7b86ed5 100644 --- a/src/madengine/tools/distributed_orchestrator.py +++ b/src/madengine/tools/distributed_orchestrator.py @@ -5,6 +5,8 @@ This module provides orchestration capabilities for distributed execution scenarios like Ansible or Kubernetes, where Docker image building and container execution are separated across different nodes. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ import os @@ -87,7 +89,8 @@ def __init__(self, args, build_only_mode: bool = False): print(f"Docker Hub credentials: {self.credentials['dockerhub']}") def build_phase(self, registry: str = None, clean_cache: bool = False, - manifest_output: str = "build_manifest.json", batch_build_metadata: typing.Optional[dict] = None) -> typing.Dict: + manifest_output: str = "build_manifest.json", + batch_build_metadata: typing.Optional[dict] = None) -> typing.Dict: """Execute the build phase - build all Docker images. This method supports both build-only mode (for dedicated build nodes) @@ -109,15 +112,20 @@ def build_phase(self, registry: str = None, clean_cache: bool = False, print("(Build-only mode - no GPU detection)") print("=" * 60) - print(f"Building models with args {self.args}") + # Print the arguments as a dictionary for better readability + print(f"Building models with args: {vars(self.args) if hasattr(self.args, '__dict__') else self.args}") # Discover models + print("=" * 60) + print("DISCOVERING MODELS") discover_models = DiscoverModels(args=self.args) models = discover_models.run() print(f"Discovered {len(models)} models to build") # Copy scripts for building + print("=" * 60) + print("COPYING SCRIPTS") self._copy_scripts() # Validate build context for build-only mode @@ -144,7 +152,7 @@ def build_phase(self, registry: str = None, clean_cache: bool = False, ) # Export build manifest with registry information - builder.export_build_manifest(manifest_output, registry) + builder.export_build_manifest(manifest_output, registry, batch_build_metadata) print("=" * 60) print("BUILD PHASE COMPLETED") @@ -209,18 +217,11 @@ def run_phase(self, manifest_file: str = "build_manifest.json", print(f"Loaded manifest with {len(manifest['built_images'])} images") - # Auto-detect registry from manifest if not provided via CLI - if not registry and "registry" in manifest: - manifest_registry = manifest["registry"] - if manifest_registry and manifest_registry.strip(): # Check for non-empty string - registry = manifest_registry - print(f"Auto-detected registry from manifest: {registry}") - else: - print("Manifest registry is empty, will use local images only") - elif registry: + # Registry is now per-image; CLI registry is fallback + if registry: print(f"Using registry from CLI: {registry}") else: - print("No registry specified, will use local images only") + print("No registry specified, will use per-image registry or local images only") # Copy scripts for running self._copy_scripts() @@ -262,31 +263,17 @@ def run_phase(self, manifest_file: str = "build_manifest.json", model_info = manifest["built_models"][image_name] try: print(f"\nRunning model {model_info['name']} with image {image_name}") - - # Handle registry image pulling and tagging according to manifest - if "registry_image" in build_info: - # Registry image exists - pull it and tag as docker_image, then run with docker_image - registry_image = build_info["registry_image"] - docker_image = build_info["docker_image"] - - # Extract registry from the registry_image format - effective_registry = registry - if not effective_registry and registry_image: - registry_parts = registry_image.split('/') - if len(registry_parts) > 1 and '.' in registry_parts[0]: - effective_registry = registry_parts[0] - elif registry_image.startswith('docker.io/') or '/' in registry_image: - effective_registry = "docker.io" - + # Use per-image registry if present, else CLI registry + effective_registry = build_info.get("registry", registry) + registry_image = build_info.get("registry_image") + docker_image = build_info.get("docker_image") + if registry_image: if effective_registry: print(f"Pulling image from registry: {registry_image}") try: - # Ensure all parameters are strings and credentials is properly formatted registry_image_str = str(registry_image) if registry_image else "" docker_image_str = str(docker_image) if docker_image else "" effective_registry_str = str(effective_registry) if effective_registry else "" - - # Pull registry image and tag it as docker_image runner.pull_image(registry_image_str, docker_image_str, effective_registry_str, self.credentials) actual_image = docker_image_str print(f"Successfully pulled and tagged as: {docker_image_str}") @@ -294,7 +281,6 @@ def run_phase(self, manifest_file: str = "build_manifest.json", print(f"Failed to pull from registry, falling back to local image: {e}") actual_image = docker_image else: - # Registry image exists but no valid registry found, try to pull as-is and tag print(f"Attempting to pull registry image as-is: {registry_image}") try: registry_image_str = str(registry_image) if registry_image else "" diff --git a/src/madengine/tools/docker_builder.py b/src/madengine/tools/docker_builder.py index 6c4f22d6..2945036c 100644 --- a/src/madengine/tools/docker_builder.py +++ b/src/madengine/tools/docker_builder.py @@ -270,7 +270,7 @@ def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> N print(f"Failed to login to registry {registry}: {e}") raise - def push_image(self, docker_image: str, registry: str = None, credentials: typing.Dict = None) -> str: + def push_image(self, docker_image: str, registry: str = None, credentials: typing.Dict = None, explicit_registry_image: str = None) -> str: """Push the built image to a registry. Args: @@ -290,46 +290,70 @@ def push_image(self, docker_image: str, registry: str = None, credentials: typin self.login_to_registry(registry, credentials) # Determine registry image name (this should match what was already determined) - registry_image = self._determine_registry_image_name(docker_image, registry, credentials) + if explicit_registry_image: + registry_image = explicit_registry_image + else: + registry_image = self._determine_registry_image_name(docker_image, registry, credentials) + print(f"[DEBUG] push_image: docker_image='{docker_image}', registry='{registry}', registry_image='{registry_image}'") try: # Tag the image if different from local name if registry_image != docker_image: + print(f"[DEBUG] Tagging image: docker tag {docker_image} {registry_image}") tag_command = f"docker tag {docker_image} {registry_image}" - print(f"šŸ·ļø Tagging image: {tag_command}") self.console.sh(tag_command) - + else: + print(f"[DEBUG] No tag needed, docker_image and registry_image are the same: {docker_image}") + # Push the image + print(f"[DEBUG] Pushing image: docker push {registry_image}") push_command = f"docker push {registry_image}" print(f"\nšŸš€ Starting docker push to registry...") print(f"šŸ“¤ Registry: {registry}") print(f"šŸ·ļø Image: {registry_image}") self.console.sh(push_command) - + print(f"āœ… Successfully pushed image to registry: {registry_image}") print(f"{'='*80}") return registry_image - + except Exception as e: print(f"Failed to push image {docker_image} to registry {registry}: {e}") raise - - def export_build_manifest(self, output_file: str = "build_manifest.json", registry: str = None) -> None: + + def export_build_manifest(self, output_file: str = "build_manifest.json", registry: str = None, batch_build_metadata: typing.Optional[dict] = None) -> None: """Export enhanced build information to a manifest file. This creates a comprehensive build manifest that includes all necessary information for deployment, reducing the need for separate execution configs. - + Args: output_file: Path to output manifest file - registry: Registry used for building (added to manifest metadata) + registry: Registry used for building (added to each image entry) + batch_build_metadata: Optional metadata for batch builds """ # Extract credentials from models credentials_required = list(set([ model.get("cred", "") for model in self.built_models.values() if model.get("cred", "") != "" ])) - + + print(f"[DEBUG] batch_build_metadata: {batch_build_metadata}") + print(f"[DEBUG] built_images: {self.built_images}") + + # Set registry for each built image + for image_name, build_info in self.built_images.items(): + # If registry is not set in build_info, set it from argument + if registry: + build_info["registry"] = registry + + docker_file = build_info.get("dockerfile", "") + truncated_docker_file = docker_file.split("/")[-1].split(".Dockerfile")[0] + model_name = image_name.split("ci-")[1].split(truncated_docker_file)[0].rstrip("_") + if batch_build_metadata and model_name in batch_build_metadata: + print(f"[DEBUG] Overriding registry for {model_name} from batch_build_metadata") + build_info["registry"] = batch_build_metadata[model_name].get("registry") + manifest = { "built_images": self.built_images, "built_models": self.built_models, @@ -342,15 +366,11 @@ def export_build_manifest(self, output_file: str = "build_manifest.json", regist }, "credentials_required": credentials_required } - + # Add multi-node args to context if present if "build_multi_node_args" in self.context.ctx: manifest["context"]["multi_node_args"] = self.context.ctx["build_multi_node_args"] - - # Add registry information to manifest metadata if provided - if registry: - manifest["registry"] = registry - + # Add push failure summary if any pushes failed push_failures = [] for image_name, build_info in self.built_images.items(): @@ -360,13 +380,13 @@ def export_build_manifest(self, output_file: str = "build_manifest.json", regist "intended_registry_image": build_info.get("registry_image"), "error": build_info.get("push_error") }) - + if push_failures: manifest["push_failures"] = push_failures - + with open(output_file, 'w') as f: json.dump(manifest, f, indent=2) - + print(f"Build manifest exported to: {output_file}") if push_failures: print(f"Warning: {len(push_failures)} image(s) failed to push to registry") @@ -438,7 +458,7 @@ def build_all_models(self, models: typing.List[typing.Dict], model_info, dockerfile, credentials, clean_cache, phase_suffix ) - # Determine registry image name and add to manifest before push operations + # Determine registry image name for push/tag registry_image = None if model_registry_image: registry_image = model_registry_image @@ -446,6 +466,11 @@ def build_all_models(self, models: typing.List[typing.Dict], registry_image = self._determine_registry_image_name( build_info["docker_image"], model_registry, credentials ) + # Always use registry_image from batch_build_metadata if present + if batch_build_metadata and model_info["name"] in batch_build_metadata: + meta = batch_build_metadata[model_info["name"]] + if meta.get("registry_image"): + registry_image = meta["registry_image"] if registry_image: build_info["registry_image"] = registry_image if build_info["docker_image"] in self.built_images: @@ -453,9 +478,11 @@ def build_all_models(self, models: typing.List[typing.Dict], # Now attempt to push to registry if registry is set if model_registry and registry_image: + explicit_registry_image = registry_image try: + # Use registry_image from batch_build_metadata for push/tag if present actual_registry_image = self.push_image( - build_info["docker_image"], model_registry, credentials + build_info["docker_image"], model_registry, credentials, explicit_registry_image ) if actual_registry_image != registry_image: print(f"Warning: Pushed image name {actual_registry_image} differs from intended {registry_image}")