Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ For detailed command options, see the **[CLI Command Reference](docs/cli-referen
| Guide | Description |
|-------|-------------|
| [Installation](docs/installation.md) | Complete installation instructions |
| [Usage Guide](docs/usage.md) | Commands, workflows, and examples |
| [Usage Guide](docs/usage.md) | Commands, workflows, and examples ([`--skip-model-run`](docs/usage.md#skip-model-run-after-build)) |
| **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** |
| [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment |
| [Configuration](docs/configuration.md) | Advanced configuration options |
Expand Down Expand Up @@ -562,6 +562,7 @@ See [Installation Guide](docs/installation.md) for detailed instructions.
### Build & Deployment

- **Separate build and run phases** for distributed deployments
- **Build without executing:** `madengine run --tags … --skip-model-run` builds the images and writes the manifest, but skips container execution. The flag only applies when the build happens **in that same invocation**; it is ignored when an existing `--manifest-file` is used. See [Usage — Skip model run after build](docs/usage.md#skip-model-run-after-build).
- **Use registries** for multi-node execution (K8s/SLURM)
- **Use batch build mode** for CI/CD to optimize build times
- **Specify `--target-archs`** when building for multiple GPU architectures
Expand Down
2 changes: 1 addition & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Complete documentation for madengine - AI model automation and distributed bench
| Guide | Description |
|-------|-------------|
| [Installation](installation.md) | Complete installation instructions |
| [Usage Guide](usage.md) | Commands, configuration, and examples |
| [Usage Guide](usage.md) | Commands, configuration, and examples ([`--skip-model-run`](usage.md#skip-model-run-after-build)) |

### Configuration & Deployment

Expand Down
6 changes: 6 additions & 0 deletions docs/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ madengine run [OPTIONS]
| `--keep-alive` | | FLAG | `False` | Keep Docker containers alive after run |
| `--keep-model-dir` | | FLAG | `False` | Keep model directory after run |
| `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache (full workflow) |
| `--skip-model-run` | | FLAG | `False` | After a **build in this invocation**, skip executing models (manifest/images still produced). **Ignored** when using `--manifest-file` with an existing manifest (run-only), or when no build ran in this invocation. See [Usage — Skip model run](usage.md#skip-model-run-after-build). |
| `--manifest-output` | | TEXT | `build_manifest.json` | Output file for build manifest (full workflow) |
| `--summary-output` | `-s` | TEXT | `None` | Output file for summary JSON |
| `--live-output` | `-l` | FLAG | `False` | Print output in real-time |
Expand Down Expand Up @@ -246,6 +247,11 @@ madengine run --tags dummy --rocm-path /path/to/rocm \
# Run with pre-built images (manifest-based)
madengine run --manifest-file build_manifest.json

# Build in this invocation but skip executing containers (CI: images + manifest only)
madengine run --tags model \
--additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \
--skip-model-run

# Multi-GPU with torchrun
madengine run --tags model \
--additional-context '{
Expand Down
15 changes: 15 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,21 @@ madengine build --batch-manifest batch.json \

## Run Workflow

### Skip model run after build

When `madengine run` **builds** in the same invocation (no pre-existing `--manifest-file`), you can pass **`--skip-model-run`** to produce images and `build_manifest.json` **without** running model containers.

- **Ignored** (with a warning) when `--manifest-file` points at an existing manifest (execution-only mode). To run the models later, use plain `madengine run --manifest-file ...`.
- **Ignored** (with a warning) if this invocation did not perform a build — for example, a manifest was already present and no rebuild occurred.

```bash
madengine run --tags model \
--additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \
--skip-model-run
```

See [CLI Reference — `run`](cli-reference.md#run---execute-models) and `madengine run --help`.

### Local Execution

Run on local machine:
Expand Down
27 changes: 26 additions & 1 deletion src/madengine/cli/commands/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ def run(
help="Rebuild images without using cache (for full workflow)",
),
] = False,
skip_model_run: Annotated[
bool,
typer.Option(
"--skip-model-run",
help="After a build in this invocation, skip executing models (ignored when using an existing manifest).",
),
] = False,
manifest_output: Annotated[
str,
typer.Option(
Expand Down Expand Up @@ -193,6 +200,12 @@ def run(
manifest_exists = manifest_file and os.path.exists(manifest_file)

if manifest_exists:
if skip_model_run:
console.print(
"[yellow]⚠️ --skip-model-run applies only after a build in this invocation; "
"using an existing manifest. Ignoring --skip-model-run.[/yellow]"
)

console.print(
Panel(
f"🚀 [bold cyan]Running Models (Execution Only)[/bold cyan]\n"
Expand Down Expand Up @@ -225,6 +238,7 @@ def run(
verbose=verbose,
cleanup_perf=cleanup_perf,
rocm_path=rocm_path,
skip_model_run=skip_model_run,
_separate_phases=True,
)

Expand Down Expand Up @@ -290,12 +304,18 @@ def run(
f"⚠️ Manifest file [yellow]{manifest_file}[/yellow] not found, running complete workflow"
)

skip_note = (
"\nSkip run: [yellow]yes (--skip-model-run)[/yellow]"
if skip_model_run
else ""
)
console.print(
Panel(
f"🔨🚀 [bold cyan]Complete Workflow (Build + Run)[/bold cyan]\n"
f"Tags: [yellow]{', '.join(processed_tags) if processed_tags else 'All models'}[/yellow]\n"
f"Registry: [yellow]{registry or 'Local only'}[/yellow]\n"
f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s",
f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s"
f"{skip_note}",
title="Workflow Configuration",
border_style="magenta",
)
Expand Down Expand Up @@ -323,6 +343,7 @@ def run(
verbose=verbose,
cleanup_perf=cleanup_perf,
rocm_path=rocm_path,
skip_model_run=skip_model_run,
_separate_phases=False, # Full workflow uses .live.log (not .run.live.log)
)

Expand Down Expand Up @@ -385,6 +406,10 @@ def run(
save_summary_with_feedback(workflow_summary, summary_output, "Workflow")

if workflow_summary["overall_success"]:
if execution_summary.get("skipped_model_run"):
console.print(
"[cyan]Model run was skipped (--skip-model-run); build completed.[/cyan]"
)
console.print(
"🎉 [bold green]Complete workflow finished successfully![/bold green]"
)
Expand Down
32 changes: 32 additions & 0 deletions src/madengine/orchestration/run_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,10 @@ def execute(
1. Run-only: If manifest_file provided
2. Full workflow: If tags provided (build + run)

When args.skip_model_run is True (Policy A), the model execution step is
skipped only if this invocation ran a build (_did_build_phase). Otherwise
the flag is ignored with a warning.

Args:
manifest_file: Path to build_manifest.json
tags: Model tags to build (triggers build phase if no manifest)
Expand Down Expand Up @@ -258,6 +262,34 @@ def execute(

self.rich_console.print(f"[bold cyan]Deployment target: {target}[/bold cyan]\n")

# Use `is True` so MagicMock-based test doubles do not count as enabled.
skip_requested = getattr(self.args, "skip_model_run", False) is True
if skip_requested and not self._did_build_phase:
self.rich_console.print(
"[yellow]⚠️ --skip-model-run is ignored "
"(not a build+run workflow in this invocation).[/yellow]\n"
)

if skip_requested and self._did_build_phase:
self.rich_console.print(
"[bold cyan]Skipping model run (--skip-model-run) after build.[/bold cyan]\n"
)
results = {
"successful_runs": [],
"failed_runs": [],
"total_runs": 0,
"skipped_model_run": True,
}
results["session_start_row"] = session_start_row
results["session_row_count"] = (
self.session_tracker.get_session_row_count()
)
self.rich_console.print(
"\n[dim]🧹 Cleaning up madengine package files...[/dim]"
)
self._cleanup_model_dir_copies()
return results

# Step 4: Execute based on target
try:
if target == "local" or target == "docker":
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def mock_run_args():
args.force_mirror_local = False
args.disable_skip_gpu_arch = False
args.verbose = False
args.skip_model_run = False
args._separate_phases = True
return args

Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ def test_run_help_exits_zero(self, runner: CliRunner) -> None:
result = runner.invoke(app, ["run", "--help"])
assert result.exit_code == ExitCode.SUCCESS
assert "run" in result.stdout.lower() or "model" in result.stdout.lower()
assert "--skip-model-run" in result.stdout

def test_run_command_build_error_returns_build_failure_exit_code(
self, runner: CliRunner
Expand Down
81 changes: 81 additions & 0 deletions tests/unit/test_orchestration.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Unit tests for orchestration: image_filtering and orchestrator init/validation."""

import json

import pytest
from unittest.mock import MagicMock, patch

Expand Down Expand Up @@ -190,3 +192,82 @@ def test_run_without_manifest_or_tags_raises_error(self, mock_exists):

with pytest.raises(ConfigurationError):
orchestrator.execute(manifest_file=None, tags=None)


@pytest.mark.unit
class TestSkipModelRunPolicyA:
    """Policy A: --skip-model-run only skips execution after an internal build.

    Both tests drive ``RunOrchestrator.execute`` against a minimal manifest
    whose deployment target is ``local`` and patch out the build/run phases,
    so only the skip decision itself is exercised.
    """

    @staticmethod
    def _write_local_manifest(tmp_path):
        """Write a minimal local-target build manifest and return its path."""
        manifest_path = tmp_path / "build_manifest.json"
        manifest_path.write_text(
            json.dumps(
                {
                    "deployment_config": {"target": "local"},
                    "context": {},
                    "built_images": {},
                }
            )
        )
        return manifest_path

    @staticmethod
    def _make_skip_args(tmp_path):
        """Return mock CLI args that request ``--skip-model-run``."""
        mock_args = MagicMock()
        # Must be a real bool: the orchestrator checks `skip_model_run is True`
        # so that auto-created MagicMock attributes do not enable the flag.
        mock_args.skip_model_run = True
        mock_args.additional_context = None
        mock_args.live_output = False
        mock_args.output = str(tmp_path / "perf.csv")
        return mock_args

    @patch.object(RunOrchestrator, "_cleanup_model_dir_copies")
    def test_skip_after_build_skips_execute_local(self, mock_cleanup, tmp_path):
        """Full workflow: skip_model_run + build phase skips _execute_local."""
        manifest_path = self._write_local_manifest(tmp_path)
        orchestrator = RunOrchestrator(self._make_skip_args(tmp_path))

        # One flat `with` instead of four nested levels; continuation happens
        # inside the call parentheses, so this parses on pre-3.10 Pythons too.
        with patch.object(
            RunOrchestrator, "_build_phase", return_value=str(manifest_path)
        ), patch.object(
            RunOrchestrator, "_load_and_merge_manifest", side_effect=lambda f: f
        ), patch.object(
            RunOrchestrator, "_execute_local"
        ) as mock_local, patch.object(
            RunOrchestrator, "_combine_build_and_run_logs"
        ) as mock_combine:
            results = orchestrator.execute(
                manifest_file=None, tags=["dummy"], timeout=60
            )

        mock_local.assert_not_called()
        mock_combine.assert_not_called()
        mock_cleanup.assert_called()

        # The CLI reads `skipped_model_run` from the returned summary to print
        # its "model run was skipped" message, so pin the returned contract.
        assert results["skipped_model_run"] is True
        assert results["total_runs"] == 0
        assert results["successful_runs"] == []
        assert results["failed_runs"] == []

    @patch.object(RunOrchestrator, "_cleanup_model_dir_copies")
    def test_skip_ignored_when_run_only_still_calls_execute_local(
        self, mock_cleanup, tmp_path
    ):
        """Run-only: skip_model_run is ignored; _execute_local runs."""
        manifest_path = self._write_local_manifest(tmp_path)
        orchestrator = RunOrchestrator(self._make_skip_args(tmp_path))

        with patch.object(RunOrchestrator, "_execute_local") as mock_local:
            mock_local.return_value = {
                "successful_runs": [],
                "failed_runs": [],
            }
            orchestrator.execute(
                manifest_file=str(manifest_path), tags=None, timeout=60
            )

        mock_local.assert_called_once()
        mock_cleanup.assert_called()