ROCm · coketaste · Apr 8, 2026 · Apr 7, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py
@@ -34,6 +34,7 @@
 from madengine.utils.path_utils import scripts_base_dir_from
 from madengine.utils.run_details import get_build_number, get_pipeline
 from madengine.execution.container_runner_helpers import (
+    log_text_has_error_pattern,
     make_run_log_file_path,
     resolve_log_error_scan_config,
     resolve_run_timeout,
@@ -1263,60 +1264,48 @@ def run_container(
                                 and os.path.exists(log_file_path)
                             ):
                                 try:
-                                    # Define benign patterns to exclude from error detection
-                                    # These are known warnings/info messages that should not trigger failures
-                                    benign_patterns = [
+                                    # Benign: literal substrings (incl. user extra_benign) vs regex (ROCProf lines).
+                                    benign_substrings = [
                                         "Failed to establish connection to the metrics exporter agent",
                                         "RpcError: Running out of retries to initialize the metrics agent",
                                         "Metrics will not be exported",
                                         "FutureWarning",
-                                        # ROCProf/glog logging patterns (E/W/I prefixes are log levels, not errors)
-                                        r"^E[0-9]{8}.*generateRocpd\.cpp",  # ROCProf error-level logs
-                                        r"^W[0-9]{8}.*simple_timer\.cpp",    # ROCProf warning-level logs
-                                        r"^W[0-9]{8}.*generateRocpd\.cpp",   # ROCProf warning-level logs
-                                        r"^E[0-9]{8}.*tool\.cpp",            # ROCProf tool logs
-                                        "Opened result file:",                # ROCProf result file messages
-                                        "SQLite3 generation ::",              # ROCProf SQLite messages
-                                        r"\[rocprofv3\]",                     # ROCProf v3 messages
-                                        "rocpd_op:",                          # ROCProf operation logs
-                                        "rpd_tracer:",                        # ROCProf tracer logs
+                                        "Opened result file:",
+                                        "SQLite3 generation ::",
+                                        "rocpd_op:",
+                                        "rpd_tracer:",
                                     ]
-                                    benign_patterns.extend(extra_benign)
+                                    benign_substrings.extend(extra_benign)
+                                    benign_regexes = [
+                                        # ROCProf/glog: E/W prefixes are log levels, not app errors
+                                        r"^E[0-9]{8}.*generateRocpd\.cpp",
+                                        r"^W[0-9]{8}.*simple_timer\.cpp",
+                                        r"^W[0-9]{8}.*generateRocpd\.cpp",
+                                        r"^E[0-9]{8}.*tool\.cpp",
+                                        r"\[rocprofv3\]",
+                                    ]
+
+                                    # Scan in Python (no shell; literals vs regex benign rules are explicit).
+                                    with open(
+                                        log_file_path,
+                                        "r",
+                                        encoding="utf-8",
+                                        errors="ignore",
+                                    ) as _lf:
+                                        log_scan_text = _lf.read()
 
-                                    # Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns).
-                                    # Use subprocess (not console.sh) so the check runs silently and does not clutter console output.
                                     for pattern in error_patterns:
-                                        # Build exclusion regex: our own commands, output messages, and benign patterns.
-                                        # Use re.escape(pattern) so parentheses and other special chars are safe in grep -E.
-                                        pattern_escaped = re.escape(pattern)
-                                        exclusions = f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped}"
-                                        for benign in benign_patterns:
-                                            # Escape special regex characters in benign patterns
-                                            escaped_benign = benign.replace(".", r"\.").replace("(", r"\(").replace(")", r"\)")
-                                            exclusions += f"|{escaped_benign}"
-                                        exclusions += ")"
-                                        # Match pattern literally in the filtered log (grep -F avoids regex issues)
-                                        error_check_cmd = [
-                                            "sh",
-                                            "-c",
-                                            f"grep -v -E '{exclusions}' {log_file_path} | grep -F -q -- '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'",
-                                        ]
-                                        try:
-                                            proc = subprocess.run(
-                                                error_check_cmd,
-                                                capture_output=True,
-                                                text=True,
-                                                timeout=60,
+                                        if log_text_has_error_pattern(
+                                            log_scan_text,
+                                            pattern,
+                                            benign_substrings,
+                                            benign_regexes,
+                                        ):
+                                            has_errors = True
+                                            print(
+                                                f"Found error pattern '{pattern}' in logs"
                                             )
-                                            result = (proc.stdout or "").strip()
-                                            if result == "FOUND":
-                                                has_errors = True
-                                                print(
-                                                    f"Found error pattern '{pattern}' in logs"
-                                                )
-                                                break
-                                        except (subprocess.TimeoutExpired, OSError):
-                                            pass  # Error checking is optional; treat as no match
+                                            break
                                 except Exception:
                                     pass  # Error checking is optional
                             elif not scan_logs:

diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py
@@ -5,6 +5,7 @@
 Extracted so run_container logic is easier to test and maintain.
 """
 
+import re
 import typing
 
 # Default substrings matched in container run logs post-hoc (see ContainerRunner).
@@ -67,8 +68,9 @@ def resolve_log_error_scan_config(
 
     Keys (in ``additional_context`` and/or ``model_info``; context wins):
 
-    - ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection.
-    - ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches.
+    - ``log_error_pattern_scan`` (default True): set False to skip post-run log error detection.
+    - ``log_error_benign_patterns``: list of extra **literal** substrings; a log line containing
+      any of them is excluded from error matching (not interpreted as regex).
     - ``log_error_patterns``: non-empty list of strings replaces the default error pattern list.
 
     Returns:
@@ -102,6 +104,51 @@ def resolve_log_error_scan_config(
     return scan_enabled, error_patterns, extra_benign
 
 
+def log_text_has_error_pattern(
+    log_text: str,
+    pattern: str,
+    benign_substrings: typing.Sequence[str],
+    benign_regexes: typing.Sequence[str] = (),
+) -> bool:
+    """
+    Whether *log_text* contains a literal *pattern* on some line that is not excluded.
+
+    Exclusions (same intent as the old ``grep -v -E | grep -F`` pipeline):
+
+    - Meta lines mentioning our own ``grep`` / "Found error pattern" machinery.
+    - *benign_substrings*: line skipped if any string appears as a **literal** substring.
+    - *benign_regexes*: line skipped if any compiled regex matches (for built-in ROCProf rules).
+
+    User-supplied benign entries should use *benign_substrings* only so regex metacharacters
+    in config are not interpreted unless explicitly added to *benign_regexes*.
+    """
+    pattern_escaped = re.escape(pattern)
+    try:
+        meta_excl = re.compile(
+            f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped})"
+        )
+    except re.error:
+        return False
+
+    compiled_benign: typing.List[re.Pattern[str]] = []
+    for rx in benign_regexes:
+        try:
+            compiled_benign.append(re.compile(rx))
+        except re.error:
+            continue
+
+    for line in log_text.splitlines():
+        if meta_excl.search(line):
+            continue
+        if any(s in line for s in benign_substrings):
+            continue
+        if any(br.search(line) for br in compiled_benign):
+            continue
+        if pattern in line:
+            return True
+    return False
+
+
 def resolve_run_timeout(
     model_info: typing.Dict,
     cli_timeout: int,

diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py
@@ -4,6 +4,7 @@
 
 from madengine.execution.container_runner_helpers import (
     DEFAULT_LOG_ERROR_PATTERNS,
+    log_text_has_error_pattern,
     resolve_log_error_scan_config,
 )
 
@@ -82,3 +83,50 @@ def test_scan_toggle_coercion(self, raw, expected):
             {}, {"log_error_pattern_scan": raw}
         )
         assert enabled is expected
+
+
+class TestLogTextHasErrorPattern:
+    def test_finds_literal_pattern(self):
+        log = "line1\nRuntimeError: boom\n"
+        assert log_text_has_error_pattern(log, "RuntimeError:", [])
+
+    def test_respects_benign_substring(self):
+        log = "ignore FutureWarning in this line\n"
+        assert not log_text_has_error_pattern(
+            log,
+            "FutureWarning",
+            ["FutureWarning"],
+            (),
+        )
+
+    def test_quotes_in_pattern_no_shell(self):
+        """Patterns with quotes must match literally; must not raise or crash."""
+        log = "msg: can't happen\n"
+        assert log_text_has_error_pattern(log, "can't happen", [])
+
+    def test_excludes_grep_meta_line(self):
+        log = "some grep -q stuff RuntimeError: x\nreal RuntimeError: bad\n"
+        # First line matches exclusion grep -q.*RuntimeError
+        assert log_text_has_error_pattern(log, "RuntimeError:", [], ())
+
+    def test_regex_benign_excludes_rocprof_style_line(self):
+        log = (
+            "E12345678  generateRocpd.cpp: noise\n"
+            "clean RuntimeError: real issue\n"
+        )
+        assert log_text_has_error_pattern(
+            log,
+            "RuntimeError:",
+            [],
+            (r"^E[0-9]{8}.*generateRocpd\.cpp",),
+        )
+
+    def test_user_benign_literal_parentheses(self):
+        # User-config benign strings must be literal substrings (not broken by ad-hoc regex escaping).
+        log = "info (benign marker) ok\nRuntimeError: real\n"
+        assert log_text_has_error_pattern(
+            log,
+            "RuntimeError:",
+            ["(benign marker)"],
+            (),
+        )
diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py
@@ -194,3 +194,120 @@ def test_validate_additional_context_tools_list_ok(self):
         ctx = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": []}'
         result = validate_additional_context(additional_context=ctx)
         assert result["tools"] == []
+
+    def test_validate_additional_context_log_error_keys_accepted(self):
+        """Valid types for log scan keys pass structure validation."""
+        ctx = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_pattern_scan": False,
+                "log_error_benign_patterns": ["noise", "(ok)"],
+                "log_error_patterns": ["OOM", "Killed"],
+            }
+        )
+        result = validate_additional_context(additional_context=ctx)
+        assert result["log_error_pattern_scan"] is False
+        assert result["log_error_benign_patterns"] == ["noise", "(ok)"]
+        assert result["log_error_patterns"] == ["OOM", "Killed"]
+
+    @pytest.mark.parametrize(
+        "leps",
+        [
+            True,
+            "true",
+            0,
+            1,
+            None,
+        ],
+    )
+    def test_validate_additional_context_log_error_pattern_scan_coercible_types(
+        self, leps
+    ):
+        ctx = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_pattern_scan": leps,
+            }
+        )
+        result = validate_additional_context(additional_context=ctx)
+        assert result["log_error_pattern_scan"] == leps
+
+    def test_validate_additional_context_log_error_pattern_scan_rejects_bad_type(self):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_pattern_scan": [],
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS
+
+    def test_validate_additional_context_log_error_benign_patterns_rejects_non_list(
+        self,
+    ):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_benign_patterns": "not-a-list",
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS
+
+    def test_validate_additional_context_log_error_benign_patterns_rejects_non_strings(
+        self,
+    ):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_benign_patterns": ["a", 1],
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS
+
+    def test_validate_additional_context_log_error_patterns_rejects_empty_list(self):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_patterns": [],
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS
+
+    def test_validate_additional_context_log_error_patterns_rejects_non_list(self):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_patterns": {},
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS
+
+    def test_validate_additional_context_log_error_patterns_rejects_non_string_element(
+        self,
+    ):
+        bad = json.dumps(
+            {
+                "gpu_vendor": "AMD",
+                "guest_os": "UBUNTU",
+                "log_error_patterns": ["ok", 2],
+            }
+        )
+        with pytest.raises(typer.Exit) as exc_info:
+            validate_additional_context(additional_context=bad)
+        assert exc_info.value.exit_code == ExitCode.INVALID_ARGS