From 552aad1546f460517ae631c42caaa29ebf6ede2a Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Tue, 7 Apr 2026 12:10:05 -0500
Subject: [PATCH 1/2] feat(cli): add --skip-model-run for build+run workflows

Restore v1-style --skip-model-run on madengine run (Policy A): model execution
is skipped only when this invocation ran a build (_did_build_phase). If an
existing --manifest-file is used, the flag is ignored and a warning is printed.

- CLI: new --skip-model-run, passed through args; workflow panel and success
  messaging when the run phase is skipped
- RunOrchestrator: short-circuit before local/distributed execute; empty run
  summary with skipped_model_run; use (skip_model_run is True) so MagicMock test
  doubles do not enable the path by accident
- Tests: unit coverage for skip vs run-only; help text; conftest skip_model_run
---
 src/madengine/cli/commands/run.py             | 27 ++++++-
 .../orchestration/run_orchestrator.py         | 32 ++++++++
 tests/conftest.py                             |  1 +
 tests/unit/test_cli.py                        |  1 +
 tests/unit/test_orchestration.py              | 81 +++++++++++++++++++
 5 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py
index a2b174f2..450d1a53 100644
--- a/src/madengine/cli/commands/run.py
+++ b/src/madengine/cli/commands/run.py
@@ -95,6 +95,13 @@ def run(
             help="Rebuild images without using cache (for full workflow)",
         ),
     ] = False,
+    skip_model_run: Annotated[
+        bool,
+        typer.Option(
+            "--skip-model-run",
+            help="After a build in this invocation, skip executing models (ignored when using an existing manifest).",
+        ),
+    ] = False,
     manifest_output: Annotated[
         str,
         typer.Option(
@@ -193,6 +200,12 @@ def run(
         manifest_exists = manifest_file and os.path.exists(manifest_file)
 
         if manifest_exists:
+            if skip_model_run:
+                console.print(
+                    "[yellow]⚠️  --skip-model-run applies only after a build in this invocation; "
+                    "using an existing manifest. Ignoring --skip-model-run.[/yellow]"
+                )
+
             console.print(
                 Panel(
                     f"🚀 [bold cyan]Running Models (Execution Only)[/bold cyan]\n"
@@ -225,6 +238,7 @@ def run(
                 verbose=verbose,
                 cleanup_perf=cleanup_perf,
                 rocm_path=rocm_path,
+                skip_model_run=skip_model_run,
                 _separate_phases=True,
             )
 
@@ -290,12 +304,18 @@ def run(
                     f"⚠️  Manifest file [yellow]{manifest_file}[/yellow] not found, running complete workflow"
                 )
 
+            skip_note = (
+                "\nSkip run: [yellow]yes (--skip-model-run)[/yellow]"
+                if skip_model_run
+                else ""
+            )
             console.print(
                 Panel(
                     f"🔨🚀 [bold cyan]Complete Workflow (Build + Run)[/bold cyan]\n"
                     f"Tags: [yellow]{', '.join(processed_tags) if processed_tags else 'All models'}[/yellow]\n"
                     f"Registry: [yellow]{registry or 'Local only'}[/yellow]\n"
-                    f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s",
+                    f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s"
+                    f"{skip_note}",
                     title="Workflow Configuration",
                     border_style="magenta",
                 )
@@ -323,6 +343,7 @@ def run(
                 verbose=verbose,
                 cleanup_perf=cleanup_perf,
                 rocm_path=rocm_path,
+                skip_model_run=skip_model_run,
                 _separate_phases=False,  # Full workflow uses .live.log (not .run.live.log)
             )
 
@@ -385,6 +406,10 @@ def run(
             save_summary_with_feedback(workflow_summary, summary_output, "Workflow")
 
             if workflow_summary["overall_success"]:
+                if execution_summary.get("skipped_model_run"):
+                    console.print(
+                        "[cyan]Model run was skipped (--skip-model-run); build completed.[/cyan]"
+                    )
                 console.print(
                     "🎉 [bold green]Complete workflow finished successfully![/bold green]"
                 )
diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py
index b3aeb6cb..01b499b6 100644
--- a/src/madengine/orchestration/run_orchestrator.py
+++ b/src/madengine/orchestration/run_orchestrator.py
@@ -143,6 +143,10 @@ def execute(
         1. Run-only: If manifest_file provided
         2. Full workflow: If tags provided (build + run)
 
+        When args.skip_model_run is True (Policy A), the model execution step is
+        skipped only if this invocation ran a build (_did_build_phase). Otherwise
+        the flag is ignored with a warning.
+
         Args:
             manifest_file: Path to build_manifest.json
             tags: Model tags to build (triggers build phase if no manifest)
@@ -258,6 +262,34 @@ def execute(
             
             self.rich_console.print(f"[bold cyan]Deployment target: {target}[/bold cyan]\n")
 
+            # Use `is True` so MagicMock-based test doubles do not count as enabled.
+            skip_requested = getattr(self.args, "skip_model_run", False) is True
+            if skip_requested and not self._did_build_phase:
+                self.rich_console.print(
+                    "[yellow]⚠️  --skip-model-run is ignored "
+                    "(not a build+run workflow in this invocation).[/yellow]\n"
+                )
+
+            if skip_requested and self._did_build_phase:
+                self.rich_console.print(
+                    "[bold cyan]Skipping model run (--skip-model-run) after build.[/bold cyan]\n"
+                )
+                results = {
+                    "successful_runs": [],
+                    "failed_runs": [],
+                    "total_runs": 0,
+                    "skipped_model_run": True,
+                }
+                results["session_start_row"] = session_start_row
+                results["session_row_count"] = (
+                    self.session_tracker.get_session_row_count()
+                )
+                self.rich_console.print(
+                    "\n[dim]🧹 Cleaning up madengine package files...[/dim]"
+                )
+                self._cleanup_model_dir_copies()
+                return results
+
             # Step 4: Execute based on target
             try:
                 if target == "local" or target == "docker":
diff --git a/tests/conftest.py b/tests/conftest.py
index fd02e425..91241e01 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -140,6 +140,7 @@ def mock_run_args():
     args.force_mirror_local = False
     args.disable_skip_gpu_arch = False
     args.verbose = False
+    args.skip_model_run = False
     args._separate_phases = True
     return args
 
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 81b74a4b..32899dae 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -453,6 +453,7 @@ def test_run_help_exits_zero(self, runner: CliRunner) -> None:
         result = runner.invoke(app, ["run", "--help"])
         assert result.exit_code == ExitCode.SUCCESS
         assert "run" in result.stdout.lower() or "model" in result.stdout.lower()
+        assert "--skip-model-run" in result.stdout
 
     def test_run_command_build_error_returns_build_failure_exit_code(
         self, runner: CliRunner
diff --git a/tests/unit/test_orchestration.py b/tests/unit/test_orchestration.py
index 7c1506c4..ece59067 100644
--- a/tests/unit/test_orchestration.py
+++ b/tests/unit/test_orchestration.py
@@ -1,5 +1,7 @@
 """Unit tests for orchestration: image_filtering and orchestrator init/validation."""
 
+import json
+
 import pytest
 from unittest.mock import MagicMock, patch
 
@@ -190,3 +192,82 @@ def test_run_without_manifest_or_tags_raises_error(self, mock_exists):
 
         with pytest.raises(ConfigurationError):
             orchestrator.execute(manifest_file=None, tags=None)
+
+
+@pytest.mark.unit
+class TestSkipModelRunPolicyA:
+    """Policy A: --skip-model-run only skips execution after an internal build."""
+
+    @patch.object(RunOrchestrator, "_cleanup_model_dir_copies")
+    def test_skip_after_build_skips_execute_local(self, mock_cleanup, tmp_path):
+        """Full workflow: skip_model_run + build phase skips _execute_local."""
+        perf = tmp_path / "perf.csv"
+        manifest_path = tmp_path / "build_manifest.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "deployment_config": {"target": "local"},
+                    "context": {},
+                    "built_images": {},
+                }
+            )
+        )
+
+        mock_args = MagicMock()
+        mock_args.skip_model_run = True
+        mock_args.additional_context = None
+        mock_args.live_output = False
+        mock_args.output = str(perf)
+
+        orchestrator = RunOrchestrator(mock_args)
+
+        with patch.object(RunOrchestrator, "_build_phase", return_value=str(manifest_path)):
+            with patch.object(
+                RunOrchestrator, "_load_and_merge_manifest", side_effect=lambda f: f
+            ):
+                with patch.object(RunOrchestrator, "_execute_local") as mock_local:
+                    with patch.object(
+                        RunOrchestrator, "_combine_build_and_run_logs"
+                    ) as mock_combine:
+                        orchestrator.execute(
+                            manifest_file=None, tags=["dummy"], timeout=60
+                        )
+
+        mock_local.assert_not_called()
+        mock_combine.assert_not_called()
+        mock_cleanup.assert_called()
+
+    @patch.object(RunOrchestrator, "_cleanup_model_dir_copies")
+    def test_skip_ignored_when_run_only_still_calls_execute_local(
+        self, mock_cleanup, tmp_path
+    ):
+        """Run-only: skip_model_run is ignored; _execute_local runs."""
+        perf = tmp_path / "perf.csv"
+        manifest_path = tmp_path / "build_manifest.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "deployment_config": {"target": "local"},
+                    "context": {},
+                    "built_images": {},
+                }
+            )
+        )
+
+        mock_args = MagicMock()
+        mock_args.skip_model_run = True
+        mock_args.additional_context = None
+        mock_args.live_output = False
+        mock_args.output = str(perf)
+
+        orchestrator = RunOrchestrator(mock_args)
+
+        with patch.object(RunOrchestrator, "_execute_local") as mock_local:
+            mock_local.return_value = {
+                "successful_runs": [],
+                "failed_runs": [],
+            }
+            orchestrator.execute(manifest_file=str(manifest_path), tags=None, timeout=60)
+
+        mock_local.assert_called_once()
+        mock_cleanup.assert_called()

From 221b445a1d20b29dc3129c0e249e34554e72eeae Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Wed, 8 Apr 2026 11:03:24 -0500
Subject: [PATCH 2/2] docs: document --skip-model-run in the CLI reference,
 usage guide, and main README tips

---
 README.md             |  3 ++-
 docs/README.md        |  2 +-
 docs/cli-reference.md |  6 ++++++
 docs/usage.md         | 15 +++++++++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b7bf745e..4e0dce9f 100644
--- a/README.md
+++ b/README.md
@@ -119,7 +119,7 @@ For detailed command options, see the **[CLI Command Reference](docs/cli-referen
 | Guide | Description |
 |-------|-------------|
 | [Installation](docs/installation.md) | Complete installation instructions |
-| [Usage Guide](docs/usage.md) | Commands, workflows, and examples |
+| [Usage Guide](docs/usage.md) | Commands, workflows, and examples ([`--skip-model-run`](docs/usage.md#skip-model-run-after-build)) |
 | **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** |
 | [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment |
 | [Configuration](docs/configuration.md) | Advanced configuration options |
@@ -562,6 +562,7 @@ See [Installation Guide](docs/installation.md) for detailed instructions.
 ### Build & Deployment
 
 - **Separate build and run phases** for distributed deployments
+- **Build without executing:** `madengine run --tags … --skip-model-run` skips container execution **after a build in that same invocation** (ignored when using an existing `--manifest-file`). See [Usage — Skip model run after build](docs/usage.md#skip-model-run-after-build).
 - **Use registries** for multi-node execution (K8s/SLURM)
 - **Use batch build mode** for CI/CD to optimize build times
 - **Specify `--target-archs`** when building for multiple GPU architectures
diff --git a/docs/README.md b/docs/README.md
index 61e5d2d8..9796e8b9 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -9,7 +9,7 @@ Complete documentation for madengine - AI model automation and distributed bench
 | Guide | Description |
 |-------|-------------|
 | [Installation](installation.md) | Complete installation instructions |
-| [Usage Guide](usage.md) | Commands, configuration, and examples |
+| [Usage Guide](usage.md) | Commands, configuration, and examples ([`--skip-model-run`](usage.md#skip-model-run-after-build)) |
 
 ### Configuration & Deployment
 
diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index bfd1ca51..4758b8fe 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -219,6 +219,7 @@ madengine run [OPTIONS]
 | `--keep-alive` | | FLAG | `False` | Keep Docker containers alive after run |
 | `--keep-model-dir` | | FLAG | `False` | Keep model directory after run |
 | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache (full workflow) |
+| `--skip-model-run` | | FLAG | `False` | After a **build in this invocation**, skip executing models (manifest/images still produced). **Ignored** when using `--manifest-file` with an existing manifest (run-only), or when no build ran in this invocation. See [Usage — Skip model run](usage.md#skip-model-run-after-build). |
 | `--manifest-output` | | TEXT | `build_manifest.json` | Output file for build manifest (full workflow) |
 | `--summary-output` | `-s` | TEXT | `None` | Output file for summary JSON |
 | `--live-output` | `-l` | FLAG | `False` | Print output in real-time |
@@ -246,6 +247,11 @@ madengine run --tags dummy --rocm-path /path/to/rocm \
 # Run with pre-built images (manifest-based)
 madengine run --manifest-file build_manifest.json
 
+# Build in this invocation but skip executing containers (CI: images + manifest only)
+madengine run --tags model \
+  --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \
+  --skip-model-run
+
 # Multi-GPU with torchrun
 madengine run --tags model \
   --additional-context '{
diff --git a/docs/usage.md b/docs/usage.md
index f29f8426..6b1391e4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -281,6 +281,21 @@ madengine build --batch-manifest batch.json \
 
 ## Run Workflow
 
+### Skip model run after build
+
+When `madengine run` **builds** in the same invocation (no pre-existing `--manifest-file`), you can pass **`--skip-model-run`** to produce images and `build_manifest.json` **without** running model containers.
+
+- **Ignored with a warning** when `--manifest-file` points at an existing manifest (execution-only mode): use plain `madengine run --manifest-file ...` to run later.
+- **Ignored** with a warning if this invocation did not perform a build (for example a manifest was already present and no rebuild occurred).
+
+```bash
+madengine run --tags model \
+  --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' \
+  --skip-model-run
+```
+
+See [CLI Reference — `run`](cli-reference.md#run---execute-models) and `madengine run --help`.
+
 ### Local Execution
 
 Run on local machine: