From 637b2024744d815d17d85030405a70882ac1fc48 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 7 Apr 2026 17:26:42 -0500 Subject: [PATCH 1/3] feat(execution): configurable log error pattern scan (additional_context) Allow disabling or tuning post-run log grep via additional_context and/or model_info: - log_error_pattern_scan / disable_log_error_scan to skip substring scan - log_error_benign_patterns to extend exclusions (e.g. noisy DLM logs) - log_error_patterns to override default pattern list Extract resolution into container_runner_helpers.resolve_log_error_scan_config; validate new keys in CLI validators; add unit tests. Motivation: avoid false FAILURE from benign RuntimeError: lines in suites already validated by pytest/JUnit (e.g. pyt_dl_tool_unit_test / AIMODELS-571). --- src/madengine/cli/validators.py | 36 +++++++ src/madengine/execution/container_runner.py | 38 ++++--- .../execution/container_runner_helpers.py | 100 ++++++++++++++++++ tests/unit/test_container_runner_helpers.py | 98 +++++++++++++++++ 4 files changed, 252 insertions(+), 20 deletions(-) create mode 100644 tests/unit/test_container_runner_helpers.py diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index d2d0a288..25706f94 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -170,6 +170,42 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: if "guest_os" in context and not isinstance(context["guest_os"], str): _fail_structure("guest_os", "a string") + if "log_error_pattern_scan" in context and not isinstance( + context["log_error_pattern_scan"], (bool, str, int, float, type(None)) + ): + _fail_structure( + "log_error_pattern_scan", + "a boolean, string, number, or null", + ) + + if "disable_log_error_scan" in context and not isinstance( + context["disable_log_error_scan"], (bool, str, int, float, type(None)) + ): + _fail_structure( + "disable_log_error_scan", + "a boolean, string, number, or null", + ) + + if "log_error_benign_patterns" in context: + lebp = context["log_error_benign_patterns"] + if not isinstance(lebp, list) or not all( + isinstance(x, str) for x in lebp + ): + _fail_structure( + "log_error_benign_patterns", + "an array of strings", + ) + + if "log_error_patterns" in context: + lep = context["log_error_patterns"] + if not isinstance(lep, list) or not lep or not all( + isinstance(x, str) for x in lep + ): + _fail_structure( + "log_error_patterns", + "a non-empty array of strings", + ) + def _normalize_docker_build_arg_values(context: Dict[str, Any]) -> None: dba = context.get("docker_build_arg") diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index cfa5e523..e9a87f6f 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -35,6 +35,7 @@ from madengine.utils.run_details import get_build_number, get_pipeline from madengine.execution.container_runner_helpers import ( make_run_log_file_path, + resolve_log_error_scan_config, resolve_run_timeout, ) @@ -1249,27 +1250,18 @@ def run_container( # Set status based on performance and error patterns # First check for obvious failure patterns in the logs try: - # Check for common failure patterns in the log file - # Note: Patterns should be specific enough to avoid false positives - # from profiling tools (rocprof, etc.) that use "Error:" as log level - error_patterns = [ - "OutOfMemoryError", - "HIP out of memory", - "CUDA out of memory", - "RuntimeError:", # More specific with colon - "AssertionError:", - "ValueError:", - "SystemExit", - "failed (exitcode:", # Literal text in logs - "Traceback (most recent call last)", # Python tracebacks - "FAILED", - "Exception:", - "ImportError:", - "ModuleNotFoundError:", - ] + scan_logs, error_patterns, extra_benign = ( + resolve_log_error_scan_config( + model_info, self.additional_context + ) + ) has_errors = False - if log_file_path and os.path.exists(log_file_path): + if ( + scan_logs + and log_file_path + and os.path.exists(log_file_path) + ): try: # Define benign patterns to exclude from error detection # These are known warnings/info messages that should not trigger failures @@ -1289,7 +1281,8 @@ def run_container( "rocpd_op:", # ROCProf operation logs "rpd_tracer:", # ROCProf tracer logs ] - + benign_patterns.extend(extra_benign) + # Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns). # Use subprocess (not console.sh) so the check runs silently and does not clutter console output. for pattern in error_patterns: @@ -1326,6 +1319,11 @@ def run_container( pass # Error checking is optional; treat as no match except Exception: pass # Error checking is optional + elif not scan_logs: + self.rich_console.print( + "[dim]ℹ️ Log error pattern scan disabled " + "(log_error_pattern_scan / disable_log_error_scan).[/dim]" + ) # Status logic: Must have performance AND no errors to be considered success # Exception: Worker nodes in multi-node training (MAD_COLLECT_METRICS=false) diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py index c6f7a25f..7a92251b 100644 --- a/src/madengine/execution/container_runner_helpers.py +++ b/src/madengine/execution/container_runner_helpers.py @@ -7,6 +7,106 @@ import typing +# Default substrings matched in container run logs post-hoc (see ContainerRunner). +DEFAULT_LOG_ERROR_PATTERNS: typing.Tuple[str, ...] = ( + "OutOfMemoryError", + "HIP out of memory", + "CUDA out of memory", + "RuntimeError:", + "AssertionError:", + "ValueError:", + "SystemExit", + "failed (exitcode:", + "Traceback (most recent call last)", + "FAILED", + "Exception:", + "ImportError:", + "ModuleNotFoundError:", +) + + +def _coerce_bool(value: typing.Any, *, default: bool) -> bool: + """Interpret JSON/CLI scalars as bool; fall back to *default* if None.""" + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, (int, float)) and not isinstance(value, bool): + return value != 0 + if isinstance(value, str): + s = value.strip().lower() + if s in ("0", "false", "no", "off", ""): + return False + if s in ("1", "true", "yes", "on"): + return True + return default + + +def _pick_context_over_model( + model_info: typing.Dict, + additional_context: typing.Dict, + key: str, + default: typing.Any = None, +) -> typing.Any: + """Resolve key from model_info, overridden by additional_context when present.""" + ctx = additional_context or {} + mi = model_info or {} + if key in ctx: + return ctx[key] + if key in mi: + return mi[key] + return default + + +def resolve_log_error_scan_config( + model_info: typing.Dict, + additional_context: typing.Optional[typing.Dict] = None, +) -> typing.Tuple[bool, typing.List[str], typing.List[str]]: + """ + Resolve whether to scan run logs for error substrings and which patterns to use. + + Keys (in ``additional_context`` and/or ``model_info``; context wins): + + - ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection. + - ``disable_log_error_scan`` (default False): if True, same as disabling scan (alias). + - ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches. + - ``log_error_patterns``: non-empty list of strings replaces the default error pattern list. + + Returns: + (scan_enabled, error_patterns, extra_benign_patterns) + """ + ctx = additional_context if additional_context is not None else {} + mi = model_info if model_info is not None else {} + + disable = _coerce_bool( + _pick_context_over_model(mi, ctx, "disable_log_error_scan", False), + default=False, + ) + scan_on = _coerce_bool( + _pick_context_over_model(mi, ctx, "log_error_pattern_scan", True), + default=True, + ) + scan_enabled = (not disable) and scan_on + + raw_benign_mi = mi.get("log_error_benign_patterns") + raw_benign_ctx = ctx.get("log_error_benign_patterns") + extra_benign: typing.List[str] = [] + for part in (raw_benign_mi, raw_benign_ctx): + if isinstance(part, list): + extra_benign.extend(str(x) for x in part if x is not None) + + custom_patterns = _pick_context_over_model(mi, ctx, "log_error_patterns", None) + if ( + isinstance(custom_patterns, list) + and len(custom_patterns) > 0 + and all(isinstance(x, str) for x in custom_patterns) + ): + error_patterns = list(custom_patterns) + else: + error_patterns = list(DEFAULT_LOG_ERROR_PATTERNS) + + return scan_enabled, error_patterns, extra_benign + def resolve_run_timeout( model_info: typing.Dict, diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py new file mode 100644 index 00000000..a389daa8 --- /dev/null +++ b/tests/unit/test_container_runner_helpers.py @@ -0,0 +1,98 @@ +"""Unit tests for container_runner_helpers (including log error scan config).""" + +import pytest + +from madengine.execution.container_runner_helpers import ( + DEFAULT_LOG_ERROR_PATTERNS, + resolve_log_error_scan_config, +) + + +class TestResolveLogErrorScanConfig: + def test_defaults_enable_scan_and_default_patterns(self): + enabled, patterns, extra = resolve_log_error_scan_config({}, {}) + assert enabled is True + assert patterns == list(DEFAULT_LOG_ERROR_PATTERNS) + assert extra == [] + + def test_disable_log_error_scan(self): + enabled, _, extra = resolve_log_error_scan_config( + {}, {"disable_log_error_scan": True} + ) + assert enabled is False + assert extra == [] + + def test_log_error_pattern_scan_false_string(self): + enabled, _, _ = resolve_log_error_scan_config( + {}, {"log_error_pattern_scan": "false"} + ) + assert enabled is False + + def test_disable_wins_over_scan_true(self): + enabled, _, _ = resolve_log_error_scan_config( + {}, + {"disable_log_error_scan": True, "log_error_pattern_scan": True}, + ) + assert enabled is False + + def test_context_overrides_model(self): + enabled, _, _ = resolve_log_error_scan_config( + {"log_error_pattern_scan": True}, + {"log_error_pattern_scan": False}, + ) + assert enabled is False + + def test_model_only_false_when_no_context(self): + enabled, _, _ = resolve_log_error_scan_config( + {"log_error_pattern_scan": False}, + {}, + ) + assert enabled is False + + def test_extra_benign_merges_model_then_context(self): + _, _, extra = resolve_log_error_scan_config( + {"log_error_benign_patterns": ["a", "b"]}, + {"log_error_benign_patterns": ["c"]}, + ) + assert extra == ["a", "b", "c"] + + def test_custom_log_error_patterns(self): + enabled, patterns, _ = resolve_log_error_scan_config( + {}, + {"log_error_patterns": ["OOM", "Killed"]}, + ) + assert enabled is True + assert patterns == ["OOM", "Killed"] + + def test_invalid_custom_patterns_falls_back_to_default(self): + enabled, patterns, _ = resolve_log_error_scan_config( + {}, + {"log_error_patterns": []}, + ) + assert enabled is True + assert patterns == list(DEFAULT_LOG_ERROR_PATTERNS) + + def test_invalid_benign_type_skipped(self): + _, _, extra = resolve_log_error_scan_config( + {"log_error_benign_patterns": "not-a-list"}, + {}, + ) + assert extra == [] + + @pytest.mark.parametrize( + "raw,expected", + [ + (False, False), + ("no", False), + ("off", False), + ("0", False), + (True, True), + ("true", True), + (1, True), + ], + ) + def test_scan_toggle_coercion(self, raw, expected): + enabled, _, _ = resolve_log_error_scan_config( + {}, {"log_error_pattern_scan": raw} + ) + assert enabled is expected From 7caadad60f929a63f0b223d15a8c0d8c5f164cfa Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 8 Apr 2026 09:54:18 -0500 Subject: [PATCH 2/3] feat(execution): configurable log error pattern scan; document and drop duplicate flag Add optional additional_context / model keys to control post-run log substring checks (log_error_pattern_scan, log_error_benign_patterns, log_error_patterns). Resolve settings in container_runner_helpers; wire into ContainerRunner; validate CLI types. Remove disable_log_error_scan in favor of log_error_pattern_scan only. Document behavior in README, docs/configuration.md, cli-reference, usage, profiling, and docs index. Cross-link troubleshooting for benign RuntimeError: false failures. Tests: extend test_container_runner_helpers; unit suite passes. --- README.md | 12 ++++++- docs/README.md | 2 +- docs/cli-reference.md | 10 ++++++ docs/configuration.md | 35 +++++++++++++++++++ docs/profiling.md | 2 ++ docs/usage.md | 2 ++ src/madengine/cli/validators.py | 8 ----- src/madengine/execution/container_runner.py | 2 +- .../execution/container_runner_helpers.py | 8 +---- tests/unit/test_container_runner_helpers.py | 14 -------- 10 files changed, 63 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index b7bf745e..ce0cfbd4 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - [Reporting and Database](#-reporting-and-database) - [Installation](#-installation) - [Tips & Best Practices](#-tips--best-practices) + - [Log error pattern scan](#log-error-pattern-scan) - [Exit codes and CI](#exit-codes-and-ci) - [Contributing](#-contributing) - [License](#-license) @@ -46,6 +47,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - **🎯 ROCprofv3 Profiles** - 8 pre-configured profiles for compute/memory/communication bottleneck analysis - **🔍 Environment Validation** - TheRock ROCm detection and validation tools - **⚙️ Intelligent Defaults** - Minimal K8s configs with automatic preset application +- **📋 Configurable log scan** - Optional `--additional-context` keys to disable or tune post-run log substring checks (see [Log error pattern scan](#log-error-pattern-scan)) ## 🚀 Quick Start @@ -122,7 +124,7 @@ For detailed command options, see the **[CLI Command Reference](docs/cli-referen | [Usage Guide](docs/usage.md) | Commands, workflows, and examples | | **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** | | [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment | -| [Configuration](docs/configuration.md) | Advanced configuration options | +| [Configuration](docs/configuration.md) | Advanced options; [run log error pattern scan](docs/configuration.md#run-phase-log-error-pattern-scan) | | [Batch Build](docs/batch-build.md) | Selective builds for CI/CD | | [Launchers](docs/launchers.md) | Distributed training frameworks | | [Profiling](docs/profiling.md) | Performance analysis tools | @@ -553,6 +555,13 @@ See [Installation Guide](docs/installation.md) for detailed instructions. - **Enable verbose logging** (`--verbose`) when debugging issues - **Use `--live-output`** for real-time monitoring of long-running operations +### Log error pattern scan + +After a local Docker run, madengine can scan the captured **run log** for common failure substrings (for example `RuntimeError:`, `CUDA out of memory`, `Traceback`). That helps catch hard failures when exit codes are ambiguous, but some workloads log benign `RuntimeError:` text while tests still pass. + +- **Disable** the scan when another signal is authoritative (e.g. pytest/JUnit inside the image): set `"log_error_pattern_scan": false` in `--additional-context` or in the model entry in `models.json`. See [Configuration — Run phase: log error pattern scan](docs/configuration.md#run-phase-log-error-pattern-scan). +- **Extend exclusions** with `log_error_benign_patterns` (list of strings), or **replace** the default pattern list with `log_error_patterns` (non-empty list of strings) for advanced cases. + ### CI / Jenkins - **Exit codes:** The CLI uses fixed exit codes (`ExitCode` in `madengine.cli.constants`, e.g. `SUCCESS=0`, `RUN_FAILURE=3`, `INVALID_ARGS=4`). Pipelines should treat **non-zero** as failure; no log scraping is required for pass/fail. @@ -597,6 +606,7 @@ madengine build --tags model --clean-docker-cache --verbose **Common Issues:** - **False failures with profiling**: If models show FAILURE but have performance metrics, see [Profiling Troubleshooting](docs/profiling.md#false-failure-detection-with-rocprof) +- **False failures from `RuntimeError:` in logs**: If the workload logs expected exception text but tests pass, disable or tune the scan with `log_error_pattern_scan` / `log_error_benign_patterns` — see [Configuration](docs/configuration.md#run-phase-log-error-pattern-scan) - **ROCProf log errors**: Messages like `E20251230` are informational logs, not errors (fixed in v2.0+) - **Configuration errors**: Validate JSON with `python -m json.tool your-config.json` diff --git a/docs/README.md b/docs/README.md index 61e5d2d8..d29d121b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -15,7 +15,7 @@ Complete documentation for madengine - AI model automation and distributed bench | Guide | Description | |-------|-------------| -| [Configuration](configuration.md) | Advanced configuration options | +| [Configuration](configuration.md) | Advanced configuration options (includes [run log error pattern scan](configuration.md#run-phase-log-error-pattern-scan)) | | [Batch Build](batch-build.md) | Selective builds with batch manifests | | [Deployment](deployment.md) | Kubernetes and SLURM deployment | | [Launchers](launchers.md) | Multi-node training frameworks | diff --git a/docs/cli-reference.md b/docs/cli-reference.md index bfd1ca51..99eee192 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -592,6 +592,16 @@ For complex configurations, use JSON files with `--additional-context-file`: To run on specific nodes, add `"nodelist": "node01,node02"` to the `slurm` section. When set, the job runs only on those nodes and node health preflight is skipped. See [examples/slurm-configs/basic/03-multi-node-basic-nodelist.json](../examples/slurm-configs/basic/03-multi-node-basic-nodelist.json). +### Run phase: log error pattern scan (optional) + +These keys apply to **local Docker runs** when madengine post-processes the run log. Use them when substring matches cause false `FAILURE` status (for example benign `RuntimeError:` lines). Full details: [Configuration — Run phase: log error pattern scan](configuration.md#run-phase-log-error-pattern-scan). + +| Key | Description | +|-----|-------------| +| `log_error_pattern_scan` | Default `true`. Set `false` to skip grep-based log failure detection. | +| `log_error_benign_patterns` | Array of extra strings to exclude from matching (merged with built-in benign list). | +| `log_error_patterns` | Non-empty array replaces the default substring list (advanced). | + --- ## Environment Variables diff --git a/docs/configuration.md b/docs/configuration.md index c1d4d13d..f6ae79a1 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -84,6 +84,41 @@ For production deployments: The **run** command does NOT require these values because it can detect GPU vendor at runtime. Defaults only apply to the **build** command where Dockerfile selection requires them. +## Run phase: log error pattern scan + +After a successful container run, madengine may scan the **run log file** for fixed substrings (for example `RuntimeError:`, `OutOfMemoryError`, `Traceback (most recent call last)`). If a match is found, the run can be marked `FAILURE` even when performance metrics exist—intended as a safety net when logs show obvious Python or OOM errors. + +Some suites (for example layer unit tests) intentionally print benign `RuntimeError:` text while pytest still passes. In those cases you can **disable** the scan or **narrow** what counts as an error. + +Keys can be set in `--additional-context` / `--additional-context-file`, or on the **model** entry in `models.json` (same keys). **Runtime context overrides the model** when both are set. + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `log_error_pattern_scan` | bool or string/number (coerced) | `true` | If `false`, skip substring-based log failure detection entirely (rely on exit codes and other signals). | +| `log_error_benign_patterns` | array of strings | `[]` | Extra lines to **exclude** before matching (appended to built-in exclusions such as ROCProf/metrics noise). Model list is merged first, then context list. | +| `log_error_patterns` | array of strings (non-empty) | (built-in list) | If set, **replaces** the default pattern list. Use only when you need a custom allowlist of failure substrings. | + +**Example — disable scan for a tag (pytest is authoritative):** + +```bash +madengine run --tags my_unit_test_suite \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "log_error_pattern_scan": false}' +``` + +**Example — extra benign substrings (prefer stable strings from real logs):** + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_benign_patterns": [ + "expected benign fragment from workload log" + ] +} +``` + +Disabling the scan does **not** change performance metric extraction from the log; it only affects the post-hoc grep used to set `has_errors` for status. + ## Basic Configuration **gpu_vendor** (case-insensitive): diff --git a/docs/profiling.md b/docs/profiling.md index f4323d69..fe926844 100644 --- a/docs/profiling.md +++ b/docs/profiling.md @@ -841,6 +841,8 @@ ROCProf uses glog-style logging where `E` prefix means "Error level log" (not an **Fixed in:** madengine v2.0+ +For false failures **not** caused by ROCProf (for example workloads that print benign `RuntimeError:` text), see [Configuration — Run phase: log error pattern scan](configuration.md#run-phase-log-error-pattern-scan) (`log_error_pattern_scan`, `log_error_benign_patterns`). + **Verification:** ```bash # Run with profiling - should show SUCCESS status diff --git a/docs/usage.md b/docs/usage.md index f29f8426..6cf4c160 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -385,6 +385,8 @@ madengine run --tags model --verbose --live-output madengine run --tags model --keep-alive --verbose --live-output ``` +If the run is marked `FAILURE` because the log contains benign substrings (for example `RuntimeError:`) while the workload actually passed, configure [log error pattern scan](configuration.md#run-phase-log-error-pattern-scan) (`log_error_pattern_scan`, `log_error_benign_patterns`). + ### Clean Rebuild ```bash diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index 25706f94..d99e87f7 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -178,14 +178,6 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: "a boolean, string, number, or null", ) - if "disable_log_error_scan" in context and not isinstance( - context["disable_log_error_scan"], (bool, str, int, float, type(None)) - ): - _fail_structure( - "disable_log_error_scan", - "a boolean, string, number, or null", - ) - if "log_error_benign_patterns" in context: lebp = context["log_error_benign_patterns"] if not isinstance(lebp, list) or not all( diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index e9a87f6f..40550248 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -1322,7 +1322,7 @@ def run_container( elif not scan_logs: self.rich_console.print( "[dim]ℹ️ Log error pattern scan disabled " - "(log_error_pattern_scan / disable_log_error_scan).[/dim]" + "(log_error_pattern_scan).[/dim]" ) # Status logic: Must have performance AND no errors to be considered success diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py index 7a92251b..35f4fba4 100644 --- a/src/madengine/execution/container_runner_helpers.py +++ b/src/madengine/execution/container_runner_helpers.py @@ -68,7 +68,6 @@ def resolve_log_error_scan_config( Keys (in ``additional_context`` and/or ``model_info``; context wins): - ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection. - - ``disable_log_error_scan`` (default False): if True, same as disabling scan (alias). - ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches. - ``log_error_patterns``: non-empty list of strings replaces the default error pattern list. @@ -78,15 +77,10 @@ def resolve_log_error_scan_config( ctx = additional_context if additional_context is not None else {} mi = model_info if model_info is not None else {} - disable = _coerce_bool( - _pick_context_over_model(mi, ctx, "disable_log_error_scan", False), - default=False, - ) - scan_on = _coerce_bool( + scan_enabled = _coerce_bool( _pick_context_over_model(mi, ctx, "log_error_pattern_scan", True), default=True, ) - scan_enabled = (not disable) and scan_on raw_benign_mi = mi.get("log_error_benign_patterns") raw_benign_ctx = ctx.get("log_error_benign_patterns") diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py index a389daa8..fcf58b10 100644 --- a/tests/unit/test_container_runner_helpers.py +++ b/tests/unit/test_container_runner_helpers.py @@ -15,26 +15,12 @@ def test_defaults_enable_scan_and_default_patterns(self): assert patterns == list(DEFAULT_LOG_ERROR_PATTERNS) assert extra == [] - def test_disable_log_error_scan(self): - enabled, _, extra = resolve_log_error_scan_config( - {}, {"disable_log_error_scan": True} - ) - assert enabled is False - assert extra == [] - def test_log_error_pattern_scan_false_string(self): enabled, _, _ = resolve_log_error_scan_config( {}, {"log_error_pattern_scan": "false"} ) assert enabled is False - def test_disable_wins_over_scan_true(self): - enabled, _, _ = resolve_log_error_scan_config( - {}, - {"disable_log_error_scan": True, "log_error_pattern_scan": True}, - ) - assert enabled is False - def test_context_overrides_model(self): enabled, _, _ = resolve_log_error_scan_config( {"log_error_pattern_scan": True}, From b62e8b81df86ab4b5c1b5a913c321947cb170d5a Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 8 Apr 2026 16:18:13 -0500 Subject: [PATCH 3/3] refactor: scan container logs in Python; validate log_error_* in tests Replace sh/grep-based log error detection with in-process scanning to avoid shell quoting and injection issues when patterns come from model_info or additional_context. - Add log_text_has_error_pattern() with literal benign substrings vs explicit regex benign rules (ROCProf). - Wire ContainerRunner to read the log once and use the helper. - Document log_error_benign_patterns as literal substrings in helpers. - Extend validate_additional_context tests for log_error_pattern_scan, log_error_benign_patterns, and log_error_patterns (valid + invalid). --- src/madengine/execution/container_runner.py | 81 ++++++------ .../execution/container_runner_helpers.py | 49 +++++++- tests/unit/test_container_runner_helpers.py | 48 +++++++ tests/unit/test_validators.py | 117 ++++++++++++++++++ 4 files changed, 248 insertions(+), 47 deletions(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 40550248..0b561f09 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -34,6 +34,7 @@ from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline from madengine.execution.container_runner_helpers import ( + log_text_has_error_pattern, make_run_log_file_path, resolve_log_error_scan_config, resolve_run_timeout, @@ -1263,60 +1264,48 @@ def run_container( and os.path.exists(log_file_path) ): try: - # Define benign patterns to exclude from error detection - # These are known warnings/info messages that should not trigger failures - benign_patterns = [ + # Benign: literal substrings (incl. user extra_benign) vs regex (ROCProf lines). + benign_substrings = [ "Failed to establish connection to the metrics exporter agent", "RpcError: Running out of retries to initialize the metrics agent", "Metrics will not be exported", "FutureWarning", - # ROCProf/glog logging patterns (E/W/I prefixes are log levels, not errors) - r"^E[0-9]{8}.*generateRocpd\.cpp", # ROCProf error-level logs - r"^W[0-9]{8}.*simple_timer\.cpp", # ROCProf warning-level logs - r"^W[0-9]{8}.*generateRocpd\.cpp", # ROCProf warning-level logs - r"^E[0-9]{8}.*tool\.cpp", # ROCProf tool logs - "Opened result file:", # ROCProf result file messages - "SQLite3 generation ::", # ROCProf SQLite messages - r"\[rocprofv3\]", # ROCProf v3 messages - "rocpd_op:", # ROCProf operation logs - "rpd_tracer:", # ROCProf tracer logs + "Opened result file:", + "SQLite3 generation ::", + "rocpd_op:", + "rpd_tracer:", ] - benign_patterns.extend(extra_benign) + benign_substrings.extend(extra_benign) + benign_regexes = [ + # ROCProf/glog: E/W prefixes are log levels, not app errors + r"^E[0-9]{8}.*generateRocpd\.cpp", + r"^W[0-9]{8}.*simple_timer\.cpp", + r"^W[0-9]{8}.*generateRocpd\.cpp", + r"^E[0-9]{8}.*tool\.cpp", + r"\[rocprofv3\]", + ] + + # Scan in Python (no shell; literals vs regex benign rules are explicit). + with open( + log_file_path, + "r", + encoding="utf-8", + errors="ignore", + ) as _lf: + log_scan_text = _lf.read() - # Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns). - # Use subprocess (not console.sh) so the check runs silently and does not clutter console output. for pattern in error_patterns: - # Build exclusion regex: our own commands, output messages, and benign patterns. - # Use re.escape(pattern) so parentheses and other special chars are safe in grep -E. - pattern_escaped = re.escape(pattern) - exclusions = f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped}" - for benign in benign_patterns: - # Escape special regex characters in benign patterns - escaped_benign = benign.replace(".", r"\.").replace("(", r"\(").replace(")", r"\)") - exclusions += f"|{escaped_benign}" - exclusions += ")" - # Match pattern literally in the filtered log (grep -F avoids regex issues) - error_check_cmd = [ - "sh", - "-c", - f"grep -v -E '{exclusions}' {log_file_path} | grep -F -q -- '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'", - ] - try: - proc = subprocess.run( - error_check_cmd, - capture_output=True, - text=True, - timeout=60, + if log_text_has_error_pattern( + log_scan_text, + pattern, + benign_substrings, + benign_regexes, + ): + has_errors = True + print( + f"Found error pattern '{pattern}' in logs" ) - result = (proc.stdout or "").strip() - if result == "FOUND": - has_errors = True - print( - f"Found error pattern '{pattern}' in logs" - ) - break - except (subprocess.TimeoutExpired, OSError): - pass # Error checking is optional; treat as no match + break except Exception: pass # Error checking is optional elif not scan_logs: diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py index 35f4fba4..4f60b45a 100644 --- a/src/madengine/execution/container_runner_helpers.py +++ b/src/madengine/execution/container_runner_helpers.py @@ -5,6 +5,7 @@ Extracted so run_container logic is easier to test and maintain. """ +import re import typing # Default substrings matched in container run logs post-hoc (see ContainerRunner). @@ -68,7 +69,8 @@ def resolve_log_error_scan_config( Keys (in ``additional_context`` and/or ``model_info``; context wins): - ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection. - - ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches. + - ``log_error_benign_patterns``: list of extra **literal** substrings; a log line containing + any of them is excluded from error matching (not interpreted as regex). - ``log_error_patterns``: non-empty list of strings replaces the default error pattern list. Returns: @@ -102,6 +104,51 @@ def resolve_log_error_scan_config( return scan_enabled, error_patterns, extra_benign +def log_text_has_error_pattern( + log_text: str, + pattern: str, + benign_substrings: typing.Sequence[str], + benign_regexes: typing.Sequence[str] = (), +) -> bool: + """ + Whether *log_text* contains a literal *pattern* on some line that is not excluded. + + Exclusions (same intent as the old ``grep -v -E | grep -F`` pipeline): + + - Meta lines mentioning our own ``grep`` / "Found error pattern" machinery. + - *benign_substrings*: line skipped if any string appears as a **literal** substring. + - *benign_regexes*: line skipped if any compiled regex matches (for built-in ROCProf rules). + + User-supplied benign entries should use *benign_substrings* only so regex metacharacters + in config are not interpreted unless explicitly added to *benign_regexes*. + """ + pattern_escaped = re.escape(pattern) + try: + meta_excl = re.compile( + f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped})" + ) + except re.error: + return False + + compiled_benign: typing.List[re.Pattern[str]] = [] + for rx in benign_regexes: + try: + compiled_benign.append(re.compile(rx)) + except re.error: + continue + + for line in log_text.splitlines(): + if meta_excl.search(line): + continue + if any(s in line for s in benign_substrings): + continue + if any(br.search(line) for br in compiled_benign): + continue + if pattern in line: + return True + return False + + def resolve_run_timeout( model_info: typing.Dict, cli_timeout: int, diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py index fcf58b10..a4d96539 100644 --- a/tests/unit/test_container_runner_helpers.py +++ b/tests/unit/test_container_runner_helpers.py @@ -4,6 +4,7 @@ from madengine.execution.container_runner_helpers import ( DEFAULT_LOG_ERROR_PATTERNS, + log_text_has_error_pattern, resolve_log_error_scan_config, ) @@ -82,3 +83,50 @@ def test_scan_toggle_coercion(self, raw, expected): {}, {"log_error_pattern_scan": raw} ) assert enabled is expected + + +class TestLogTextHasErrorPattern: + def test_finds_literal_pattern(self): + log = "line1\nRuntimeError: boom\n" + assert log_text_has_error_pattern(log, "RuntimeError:", []) + + def test_respects_benign_substring(self): + log = "ignore FutureWarning in this line\n" + assert not log_text_has_error_pattern( + log, + "FutureWarning", + ["FutureWarning"], + (), + ) + + def test_quotes_in_pattern_no_shell(self): + """Patterns with quotes must match literally; must not raise or crash.""" + log = "msg: can't happen\n" + assert log_text_has_error_pattern(log, "can't happen", []) + + def test_excludes_grep_meta_line(self): + log = "some grep -q stuff RuntimeError: x\nreal RuntimeError: bad\n" + # First line matches exclusion grep -q.*RuntimeError + assert log_text_has_error_pattern(log, "RuntimeError:", [], ()) + + def test_regex_benign_excludes_rocprof_style_line(self): + log = ( + "E12345678 generateRocpd.cpp: noise\n" + "clean RuntimeError: real issue\n" + ) + assert log_text_has_error_pattern( + log, + "RuntimeError:", + [], + (r"^E[0-9]{8}.*generateRocpd\.cpp",), + ) + + def test_user_benign_literal_parentheses(self): + # User-config benign strings must be literal substrings (not broken by ad-hoc regex escaping). + log = "info (benign marker) ok\nRuntimeError: real\n" + assert log_text_has_error_pattern( + log, + "RuntimeError:", + ["(benign marker)"], + (), + ) diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py index cb74cce0..6a78c52d 100644 --- a/tests/unit/test_validators.py +++ b/tests/unit/test_validators.py @@ -194,3 +194,120 @@ def test_validate_additional_context_tools_list_ok(self): ctx = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": []}' result = validate_additional_context(additional_context=ctx) assert result["tools"] == [] + + def test_validate_additional_context_log_error_keys_accepted(self): + """Valid types for log scan keys pass structure validation.""" + ctx = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": False, + "log_error_benign_patterns": ["noise", "(ok)"], + "log_error_patterns": ["OOM", "Killed"], + } + ) + result = validate_additional_context(additional_context=ctx) + assert result["log_error_pattern_scan"] is False + assert result["log_error_benign_patterns"] == ["noise", "(ok)"] + assert result["log_error_patterns"] == ["OOM", "Killed"] + + @pytest.mark.parametrize( + "leps", + [ + True, + "true", + 0, + 1, + None, + ], + ) + def test_validate_additional_context_log_error_pattern_scan_coercible_types( + self, leps + ): + ctx = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": leps, + } + ) + result = validate_additional_context(additional_context=ctx) + assert result["log_error_pattern_scan"] == leps + + def test_validate_additional_context_log_error_pattern_scan_rejects_bad_type(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": [], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_benign_patterns_rejects_non_list( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_benign_patterns": "not-a-list", + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_benign_patterns_rejects_non_strings( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_benign_patterns": ["a", 1], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_empty_list(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": [], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_non_list(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": {}, + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_non_string_element( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": ["ok", 2], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS