From 552aad1546f460517ae631c42caaa29ebf6ede2a Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 7 Apr 2026 12:10:05 -0500 Subject: [PATCH 1/2] feat(cli): add --skip-model-run for build+run workflows Restore v1-style --skip-model-run on madengine run (Policy A): model execution is skipped only when this invocation ran a build (_did_build_phase). If an existing --manifest-file is used, the flag is ignored and a warning is printed. - CLI: new --skip-model-run, passed through args; workflow panel and success messaging when the run phase is skipped - RunOrchestrator: short-circuit before local/distributed execute; empty run summary with skipped_model_run; use (skip_model_run is True) so MagicMock test doubles do not enable the path by accident - Tests: unit coverage for skip vs run-only; help text; conftest skip_model_run --- src/madengine/cli/commands/run.py | 27 ++++++- .../orchestration/run_orchestrator.py | 32 ++++++++ tests/conftest.py | 1 + tests/unit/test_cli.py | 1 + tests/unit/test_orchestration.py | 81 +++++++++++++++++++ 5 files changed, 141 insertions(+), 1 deletion(-) diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py index a2b174f2..450d1a53 100644 --- a/src/madengine/cli/commands/run.py +++ b/src/madengine/cli/commands/run.py @@ -95,6 +95,13 @@ def run( help="Rebuild images without using cache (for full workflow)", ), ] = False, + skip_model_run: Annotated[ + bool, + typer.Option( + "--skip-model-run", + help="After a build in this invocation, skip executing models (ignored when using an existing manifest).", + ), + ] = False, manifest_output: Annotated[ str, typer.Option( @@ -193,6 +200,12 @@ def run( manifest_exists = manifest_file and os.path.exists(manifest_file) if manifest_exists: + if skip_model_run: + console.print( + "[yellow]โš ๏ธ --skip-model-run applies only after a build in this invocation; " + "using an existing manifest. 
Ignoring --skip-model-run.[/yellow]" + ) + console.print( Panel( f"๐Ÿš€ [bold cyan]Running Models (Execution Only)[/bold cyan]\n" @@ -225,6 +238,7 @@ def run( verbose=verbose, cleanup_perf=cleanup_perf, rocm_path=rocm_path, + skip_model_run=skip_model_run, _separate_phases=True, ) @@ -290,12 +304,18 @@ def run( f"โš ๏ธ Manifest file [yellow]{manifest_file}[/yellow] not found, running complete workflow" ) + skip_note = ( + "\nSkip run: [yellow]yes (--skip-model-run)[/yellow]" + if skip_model_run + else "" + ) console.print( Panel( f"๐Ÿ”จ๐Ÿš€ [bold cyan]Complete Workflow (Build + Run)[/bold cyan]\n" f"Tags: [yellow]{', '.join(processed_tags) if processed_tags else 'All models'}[/yellow]\n" f"Registry: [yellow]{registry or 'Local only'}[/yellow]\n" - f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s", + f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s" + f"{skip_note}", title="Workflow Configuration", border_style="magenta", ) @@ -323,6 +343,7 @@ def run( verbose=verbose, cleanup_perf=cleanup_perf, rocm_path=rocm_path, + skip_model_run=skip_model_run, _separate_phases=False, # Full workflow uses .live.log (not .run.live.log) ) @@ -385,6 +406,10 @@ def run( save_summary_with_feedback(workflow_summary, summary_output, "Workflow") if workflow_summary["overall_success"]: + if execution_summary.get("skipped_model_run"): + console.print( + "[cyan]Model run was skipped (--skip-model-run); build completed.[/cyan]" + ) console.print( "๐ŸŽ‰ [bold green]Complete workflow finished successfully![/bold green]" ) diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index b3aeb6cb..01b499b6 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -143,6 +143,10 @@ def execute( 1. Run-only: If manifest_file provided 2. 
Full workflow: If tags provided (build + run) + When args.skip_model_run is True (Policy A), the model execution step is + skipped only if this invocation ran a build (_did_build_phase). Otherwise + the flag is ignored with a warning. + Args: manifest_file: Path to build_manifest.json tags: Model tags to build (triggers build phase if no manifest) @@ -258,6 +262,34 @@ def execute( self.rich_console.print(f"[bold cyan]Deployment target: {target}[/bold cyan]\n") + # Use `is True` so MagicMock-based test doubles do not count as enabled. + skip_requested = getattr(self.args, "skip_model_run", False) is True + if skip_requested and not self._did_build_phase: + self.rich_console.print( + "[yellow]โš ๏ธ --skip-model-run is ignored " + "(not a build+run workflow in this invocation).[/yellow]\n" + ) + + if skip_requested and self._did_build_phase: + self.rich_console.print( + "[bold cyan]Skipping model run (--skip-model-run) after build.[/bold cyan]\n" + ) + results = { + "successful_runs": [], + "failed_runs": [], + "total_runs": 0, + "skipped_model_run": True, + } + results["session_start_row"] = session_start_row + results["session_row_count"] = ( + self.session_tracker.get_session_row_count() + ) + self.rich_console.print( + "\n[dim]๐Ÿงน Cleaning up madengine package files...[/dim]" + ) + self._cleanup_model_dir_copies() + return results + # Step 4: Execute based on target try: if target == "local" or target == "docker": diff --git a/tests/conftest.py b/tests/conftest.py index fd02e425..91241e01 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -140,6 +140,7 @@ def mock_run_args(): args.force_mirror_local = False args.disable_skip_gpu_arch = False args.verbose = False + args.skip_model_run = False args._separate_phases = True return args diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 81b74a4b..32899dae 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -453,6 +453,7 @@ def test_run_help_exits_zero(self, runner: CliRunner) -> 
None: result = runner.invoke(app, ["run", "--help"]) assert result.exit_code == ExitCode.SUCCESS assert "run" in result.stdout.lower() or "model" in result.stdout.lower() + assert "--skip-model-run" in result.stdout def test_run_command_build_error_returns_build_failure_exit_code( self, runner: CliRunner diff --git a/tests/unit/test_orchestration.py b/tests/unit/test_orchestration.py index 7c1506c4..ece59067 100644 --- a/tests/unit/test_orchestration.py +++ b/tests/unit/test_orchestration.py @@ -1,5 +1,7 @@ """Unit tests for orchestration: image_filtering and orchestrator init/validation.""" +import json + import pytest from unittest.mock import MagicMock, patch @@ -190,3 +192,82 @@ def test_run_without_manifest_or_tags_raises_error(self, mock_exists): with pytest.raises(ConfigurationError): orchestrator.execute(manifest_file=None, tags=None) + + +@pytest.mark.unit +class TestSkipModelRunPolicyA: + """Policy A: --skip-model-run only skips execution after an internal build.""" + + @patch.object(RunOrchestrator, "_cleanup_model_dir_copies") + def test_skip_after_build_skips_execute_local(self, mock_cleanup, tmp_path): + """Full workflow: skip_model_run + build phase skips _execute_local.""" + perf = tmp_path / "perf.csv" + manifest_path = tmp_path / "build_manifest.json" + manifest_path.write_text( + json.dumps( + { + "deployment_config": {"target": "local"}, + "context": {}, + "built_images": {}, + } + ) + ) + + mock_args = MagicMock() + mock_args.skip_model_run = True + mock_args.additional_context = None + mock_args.live_output = False + mock_args.output = str(perf) + + orchestrator = RunOrchestrator(mock_args) + + with patch.object(RunOrchestrator, "_build_phase", return_value=str(manifest_path)): + with patch.object( + RunOrchestrator, "_load_and_merge_manifest", side_effect=lambda f: f + ): + with patch.object(RunOrchestrator, "_execute_local") as mock_local: + with patch.object( + RunOrchestrator, "_combine_build_and_run_logs" + ) as mock_combine: + 
orchestrator.execute( + manifest_file=None, tags=["dummy"], timeout=60 + ) + + mock_local.assert_not_called() + mock_combine.assert_not_called() + mock_cleanup.assert_called() + + @patch.object(RunOrchestrator, "_cleanup_model_dir_copies") + def test_skip_ignored_when_run_only_still_calls_execute_local( + self, mock_cleanup, tmp_path + ): + """Run-only: skip_model_run is ignored; _execute_local runs.""" + perf = tmp_path / "perf.csv" + manifest_path = tmp_path / "build_manifest.json" + manifest_path.write_text( + json.dumps( + { + "deployment_config": {"target": "local"}, + "context": {}, + "built_images": {}, + } + ) + ) + + mock_args = MagicMock() + mock_args.skip_model_run = True + mock_args.additional_context = None + mock_args.live_output = False + mock_args.output = str(perf) + + orchestrator = RunOrchestrator(mock_args) + + with patch.object(RunOrchestrator, "_execute_local") as mock_local: + mock_local.return_value = { + "successful_runs": [], + "failed_runs": [], + } + orchestrator.execute(manifest_file=str(manifest_path), tags=None, timeout=60) + + mock_local.assert_called_once() + mock_cleanup.assert_called() From 221b445a1d20b29dc3129c0e249e34554e72eeae Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 8 Apr 2026 11:03:24 -0500 Subject: [PATCH 2/2] Updated docs: --skip-model-run is documented in the CLI reference, usage guide, main README tips --- README.md | 3 ++- docs/README.md | 2 +- docs/cli-reference.md | 6 ++++++ docs/usage.md | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b7bf745e..4e0dce9f 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ For detailed command options, see the **[CLI Command Reference](docs/cli-referen | Guide | Description | |-------|-------------| | [Installation](docs/installation.md) | Complete installation instructions | -| [Usage Guide](docs/usage.md) | Commands, workflows, and examples | +| [Usage Guide](docs/usage.md) | Commands, workflows, and 
examples ([`--skip-model-run`](docs/usage.md#skip-model-run-after-build)) | | **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** | | [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment | | [Configuration](docs/configuration.md) | Advanced configuration options | @@ -562,6 +562,7 @@ See [Installation Guide](docs/installation.md) for detailed instructions. ### Build & Deployment - **Separate build and run phases** for distributed deployments +- **Build without executing:** `madengine run --tags … --skip-model-run` skips container execution **after a build in that same invocation** (ignored when using an existing `--manifest-file`). See [Usage — Skip model run after build](docs/usage.md#skip-model-run-after-build). - **Use registries** for multi-node execution (K8s/SLURM) - **Use batch build mode** for CI/CD to optimize build times - **Specify `--target-archs`** when building for multiple GPU architectures diff --git a/docs/README.md b/docs/README.md index 61e5d2d8..9796e8b9 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,7 +9,7 @@ Complete documentation for madengine - AI model automation and distributed bench | Guide | Description | |-------|-------------| | [Installation](installation.md) | Complete installation instructions | -| [Usage Guide](usage.md) | Commands, configuration, and examples | +| [Usage Guide](usage.md) | Commands, configuration, and examples ([`--skip-model-run`](usage.md#skip-model-run-after-build)) | ### Configuration & Deployment diff --git a/docs/cli-reference.md b/docs/cli-reference.md index bfd1ca51..4758b8fe 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -219,6 +219,7 @@ madengine run [OPTIONS] | `--keep-alive` | | FLAG | `False` | Keep Docker containers alive after run | | `--keep-model-dir` | | FLAG | `False` | Keep model directory after run | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache (full workflow) | +| 
`--skip-model-run` | | FLAG | `False` | After a **build in this invocation**, skip executing models (manifest/images still produced). **Ignored** when using `--manifest-file` with an existing manifest (run-only), or when no build ran in this invocation. See [Usage — Skip model run](usage.md#skip-model-run-after-build). | | `--manifest-output` | | TEXT | `build_manifest.json` | Output file for build manifest (full workflow) | | `--summary-output` | `-s` | TEXT | `None` | Output file for summary JSON | | `--live-output` | `-l` | FLAG | `False` | Print output in real-time | @@ -246,6 +247,11 @@ madengine run --tags dummy --rocm-path /path/to/rocm \ # Run with pre-built images (manifest-based) madengine run --manifest-file build_manifest.json +# Build in this invocation but skip executing containers (CI: images + manifest only) +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \ + --skip-model-run + # Multi-GPU with torchrun madengine run --tags model \ --additional-context '{ diff --git a/docs/usage.md b/docs/usage.md index f29f8426..6b1391e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -281,6 +281,21 @@ madengine build --batch-manifest batch.json \ ## Run Workflow +### Skip model run after build + +When `madengine run` **builds** in the same invocation (no pre-existing `--manifest-file`), you can pass **`--skip-model-run`** to produce images and `build_manifest.json` **without** running model containers. + +- **Ignored** when `--manifest-file` points at an existing manifest (execution-only mode): use plain `madengine run --manifest-file ...` to run later. +- **Ignored** with a warning if this invocation did not perform a build (for example a manifest was already present and no rebuild occurred). 
+ +```bash +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \ + --skip-model-run +``` + +See [CLI Reference — `run`](cli-reference.md#run---execute-models) and `madengine run --help`. + ### Local Execution Run on local machine: