Skip to content
24 changes: 19 additions & 5 deletions src/madengine/mad_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,14 +283,15 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi
if os.path.exists(manifest_output):
with open(manifest_output, 'r') as f:
build_manifest = json.load(f)
# Remove top-level registry if present
build_manifest.pop("registry", None)
else:
# Create a minimal manifest structure
build_manifest = {
"built_images": {},
"built_models": {},
"context": {},
"credentials_required": [],
"registry": registry or ""
"credentials_required": []
}

# Process each model in the batch manifest
Expand Down Expand Up @@ -341,7 +342,8 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi
"build_duration": 0,
"build_command": f"# Skipped build for {model_name} (build_new=false)",
"log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log",
"registry_image": model_registry_image or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" if model_registry_image or model_registry or registry else ""
"registry_image": model_registry_image or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" if model_registry_image or model_registry or registry else "",
"registry": model_registry or registry or "dockerhub"
}

# Add to built_models
Expand Down Expand Up @@ -370,7 +372,8 @@ def _process_batch_manifest_entries(batch_data: Dict, manifest_output: str, regi
"build_duration": 0,
"build_command": f"# Skipped build for {model_name} (build_new=false)",
"log_file": f"{model_name}_{model_name}.ubuntu.amd.build.skipped.log",
"registry_image": model_registry_image or ""
"registry_image": model_registry_image or "",
"registry": model_registry or registry or "dockerhub"
}
build_manifest["built_models"][synthetic_image_name] = {
"name": model_name,
Expand Down Expand Up @@ -470,9 +473,17 @@ def build(
batch_data = None
effective_tags = tags
batch_build_metadata = None

# There are two build scenarios: batch builds and single builds
# - Batch builds: Use the batch manifest to determine which models to build
# - Single builds: Use the tags directly
if batch_manifest:
# Process the batch manifest
if verbose: console.print(f"[DEBUG] Processing batch manifest: {batch_manifest}")
try:
batch_data = process_batch_manifest(batch_manifest)
if verbose: console.print(f"[DEBUG] batch_data: {batch_data}")

effective_tags = batch_data["build_tags"]
# Build a mapping of model_name -> registry_image/registry for build_new models
batch_build_metadata = {}
Expand All @@ -482,6 +493,8 @@ def build(
"registry_image": model.get("registry_image"),
"registry": model.get("registry")
}
if verbose: console.print(f"[DEBUG] batch_build_metadata: {batch_build_metadata}")

console.print(Panel(
f"� [bold cyan]Batch Build Mode[/bold cyan]\n"
f"Input manifest: [yellow]{batch_manifest}[/yellow]\n"
Expand Down Expand Up @@ -538,12 +551,13 @@ def build(
orchestrator = DistributedOrchestrator(args, build_only_mode=True)
progress.update(task, description="Building models...")

# Pass batch_build_metadata to build_phase if present
# Prepare build phase arguments
build_phase_kwargs = dict(
registry=registry,
clean_cache=clean_docker_cache,
manifest_output=manifest_output
)
# Pass batch_build_metadata to build_phase if present
if batch_build_metadata:
build_phase_kwargs["batch_build_metadata"] = batch_build_metadata

Expand Down
52 changes: 19 additions & 33 deletions src/madengine/tools/distributed_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
This module provides orchestration capabilities for distributed execution
scenarios like Ansible or Kubernetes, where Docker image building and
container execution are separated across different nodes.

Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
"""

import os
Expand Down Expand Up @@ -87,7 +89,8 @@ def __init__(self, args, build_only_mode: bool = False):
print(f"Docker Hub credentials: {self.credentials['dockerhub']}")

def build_phase(self, registry: str = None, clean_cache: bool = False,
manifest_output: str = "build_manifest.json", batch_build_metadata: typing.Optional[dict] = None) -> typing.Dict:
manifest_output: str = "build_manifest.json",
batch_build_metadata: typing.Optional[dict] = None) -> typing.Dict:
"""Execute the build phase - build all Docker images.

This method supports both build-only mode (for dedicated build nodes)
Expand All @@ -109,15 +112,20 @@ def build_phase(self, registry: str = None, clean_cache: bool = False,
print("(Build-only mode - no GPU detection)")
print("=" * 60)

print(f"Building models with args {self.args}")
# Print the arguments as a dictionary for better readability
print(f"Building models with args: {vars(self.args) if hasattr(self.args, '__dict__') else self.args}")

# Discover models
print("=" * 60)
print("DISCOVERING MODELS")
discover_models = DiscoverModels(args=self.args)
models = discover_models.run()

print(f"Discovered {len(models)} models to build")

# Copy scripts for building
print("=" * 60)
print("COPYING SCRIPTS")
self._copy_scripts()

# Validate build context for build-only mode
Expand All @@ -144,7 +152,7 @@ def build_phase(self, registry: str = None, clean_cache: bool = False,
)

# Export build manifest with registry information
builder.export_build_manifest(manifest_output, registry)
builder.export_build_manifest(manifest_output, registry, batch_build_metadata)

print("=" * 60)
print("BUILD PHASE COMPLETED")
Expand Down Expand Up @@ -209,18 +217,11 @@ def run_phase(self, manifest_file: str = "build_manifest.json",

print(f"Loaded manifest with {len(manifest['built_images'])} images")

# Auto-detect registry from manifest if not provided via CLI
if not registry and "registry" in manifest:
manifest_registry = manifest["registry"]
if manifest_registry and manifest_registry.strip(): # Check for non-empty string
registry = manifest_registry
print(f"Auto-detected registry from manifest: {registry}")
else:
print("Manifest registry is empty, will use local images only")
elif registry:
# Registry is now per-image; CLI registry is fallback
if registry:
print(f"Using registry from CLI: {registry}")
else:
print("No registry specified, will use local images only")
print("No registry specified, will use per-image registry or local images only")

# Copy scripts for running
self._copy_scripts()
Expand Down Expand Up @@ -262,39 +263,24 @@ def run_phase(self, manifest_file: str = "build_manifest.json",
model_info = manifest["built_models"][image_name]
try:
print(f"\nRunning model {model_info['name']} with image {image_name}")

# Handle registry image pulling and tagging according to manifest
if "registry_image" in build_info:
# Registry image exists - pull it and tag as docker_image, then run with docker_image
registry_image = build_info["registry_image"]
docker_image = build_info["docker_image"]

# Extract registry from the registry_image format
effective_registry = registry
if not effective_registry and registry_image:
registry_parts = registry_image.split('/')
if len(registry_parts) > 1 and '.' in registry_parts[0]:
effective_registry = registry_parts[0]
elif registry_image.startswith('docker.io/') or '/' in registry_image:
effective_registry = "docker.io"

# Use per-image registry if present, else CLI registry
effective_registry = build_info.get("registry", registry)
registry_image = build_info.get("registry_image")
docker_image = build_info.get("docker_image")
if registry_image:
if effective_registry:
print(f"Pulling image from registry: {registry_image}")
try:
# Ensure all parameters are strings and credentials is properly formatted
registry_image_str = str(registry_image) if registry_image else ""
docker_image_str = str(docker_image) if docker_image else ""
effective_registry_str = str(effective_registry) if effective_registry else ""

# Pull registry image and tag it as docker_image
runner.pull_image(registry_image_str, docker_image_str, effective_registry_str, self.credentials)
actual_image = docker_image_str
print(f"Successfully pulled and tagged as: {docker_image_str}")
except Exception as e:
print(f"Failed to pull from registry, falling back to local image: {e}")
actual_image = docker_image
else:
# Registry image exists but no valid registry found, try to pull as-is and tag
print(f"Attempting to pull registry image as-is: {registry_image}")
try:
registry_image_str = str(registry_image) if registry_image else ""
Expand Down
71 changes: 49 additions & 22 deletions src/madengine/tools/docker_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> N
print(f"Failed to login to registry {registry}: {e}")
raise

def push_image(self, docker_image: str, registry: str = None, credentials: typing.Dict = None) -> str:
def push_image(self, docker_image: str, registry: str = None, credentials: typing.Dict = None, explicit_registry_image: str = None) -> str:
"""Push the built image to a registry.

Args:
Expand All @@ -290,46 +290,70 @@ def push_image(self, docker_image: str, registry: str = None, credentials: typin
self.login_to_registry(registry, credentials)

# Determine registry image name (this should match what was already determined)
registry_image = self._determine_registry_image_name(docker_image, registry, credentials)
if explicit_registry_image:
registry_image = explicit_registry_image
else:
registry_image = self._determine_registry_image_name(docker_image, registry, credentials)
print(f"[DEBUG] push_image: docker_image='{docker_image}', registry='{registry}', registry_image='{registry_image}'")

try:
# Tag the image if different from local name
if registry_image != docker_image:
print(f"[DEBUG] Tagging image: docker tag {docker_image} {registry_image}")
tag_command = f"docker tag {docker_image} {registry_image}"
print(f"🏷️ Tagging image: {tag_command}")
self.console.sh(tag_command)

else:
print(f"[DEBUG] No tag needed, docker_image and registry_image are the same: {docker_image}")

# Push the image
print(f"[DEBUG] Pushing image: docker push {registry_image}")
push_command = f"docker push {registry_image}"
print(f"\n🚀 Starting docker push to registry...")
print(f"📤 Registry: {registry}")
print(f"🏷️ Image: {registry_image}")
self.console.sh(push_command)

print(f"✅ Successfully pushed image to registry: {registry_image}")
print(f"{'='*80}")
return registry_image

except Exception as e:
print(f"Failed to push image {docker_image} to registry {registry}: {e}")
raise
def export_build_manifest(self, output_file: str = "build_manifest.json", registry: str = None) -> None:

def export_build_manifest(self, output_file: str = "build_manifest.json", registry: str = None, batch_build_metadata: typing.Optional[dict] = None) -> None:
"""Export enhanced build information to a manifest file.

This creates a comprehensive build manifest that includes all necessary
information for deployment, reducing the need for separate execution configs.

Args:
output_file: Path to output manifest file
registry: Registry used for building (added to manifest metadata)
registry: Registry used for building (added to each image entry)
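batch_build_metadata: Optional per-model metadata for batch builds, keyed by model name; a matching model's "registry" value replaces the one set from the registry argument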
"""
# Extract credentials from models
credentials_required = list(set([
model.get("cred", "") for model in self.built_models.values()
if model.get("cred", "") != ""
]))


print(f"[DEBUG] batch_build_metadata: {batch_build_metadata}")
print(f"[DEBUG] built_images: {self.built_images}")

# Set registry for each built image
for image_name, build_info in self.built_images.items():
# When a registry argument is provided, apply it to this image entry (overwrites any existing value)
if registry:
build_info["registry"] = registry

docker_file = build_info.get("dockerfile", "")
truncated_docker_file = docker_file.split("/")[-1].split(".Dockerfile")[0]
model_name = image_name.split("ci-")[1].split(truncated_docker_file)[0].rstrip("_")
if batch_build_metadata and model_name in batch_build_metadata:
print(f"[DEBUG] Overriding registry for {model_name} from batch_build_metadata")
build_info["registry"] = batch_build_metadata[model_name].get("registry")

manifest = {
"built_images": self.built_images,
"built_models": self.built_models,
Expand All @@ -342,15 +366,11 @@ def export_build_manifest(self, output_file: str = "build_manifest.json", regist
},
"credentials_required": credentials_required
}

# Add multi-node args to context if present
if "build_multi_node_args" in self.context.ctx:
manifest["context"]["multi_node_args"] = self.context.ctx["build_multi_node_args"]

# Add registry information to manifest metadata if provided
if registry:
manifest["registry"] = registry


# Add push failure summary if any pushes failed
push_failures = []
for image_name, build_info in self.built_images.items():
Expand All @@ -360,13 +380,13 @@ def export_build_manifest(self, output_file: str = "build_manifest.json", regist
"intended_registry_image": build_info.get("registry_image"),
"error": build_info.get("push_error")
})

if push_failures:
manifest["push_failures"] = push_failures

with open(output_file, 'w') as f:
json.dump(manifest, f, indent=2)

print(f"Build manifest exported to: {output_file}")
if push_failures:
print(f"Warning: {len(push_failures)} image(s) failed to push to registry")
Expand Down Expand Up @@ -438,24 +458,31 @@ def build_all_models(self, models: typing.List[typing.Dict],
model_info, dockerfile, credentials, clean_cache, phase_suffix
)

# Determine registry image name and add to manifest before push operations
# Determine registry image name for push/tag
registry_image = None
if model_registry_image:
registry_image = model_registry_image
elif model_registry:
registry_image = self._determine_registry_image_name(
build_info["docker_image"], model_registry, credentials
)
# Always use registry_image from batch_build_metadata if present
if batch_build_metadata and model_info["name"] in batch_build_metadata:
meta = batch_build_metadata[model_info["name"]]
if meta.get("registry_image"):
registry_image = meta["registry_image"]
if registry_image:
build_info["registry_image"] = registry_image
if build_info["docker_image"] in self.built_images:
self.built_images[build_info["docker_image"]]["registry_image"] = registry_image

# Now attempt to push to registry if registry is set
if model_registry and registry_image:
explicit_registry_image = registry_image
try:
# Use registry_image from batch_build_metadata for push/tag if present
actual_registry_image = self.push_image(
build_info["docker_image"], model_registry, credentials
build_info["docker_image"], model_registry, credentials, explicit_registry_image
)
if actual_registry_image != registry_image:
print(f"Warning: Pushed image name {actual_registry_image} differs from intended {registry_image}")
Expand Down