diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 40550248..0b561f09 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -34,6 +34,7 @@ from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline from madengine.execution.container_runner_helpers import ( + log_text_has_error_pattern, make_run_log_file_path, resolve_log_error_scan_config, resolve_run_timeout, @@ -1263,60 +1264,48 @@ def run_container( and os.path.exists(log_file_path) ): try: - # Define benign patterns to exclude from error detection - # These are known warnings/info messages that should not trigger failures - benign_patterns = [ + # Benign: literal substrings (incl. user extra_benign) vs regex (ROCProf lines). + benign_substrings = [ "Failed to establish connection to the metrics exporter agent", "RpcError: Running out of retries to initialize the metrics agent", "Metrics will not be exported", "FutureWarning", - # ROCProf/glog logging patterns (E/W/I prefixes are log levels, not errors) - r"^E[0-9]{8}.*generateRocpd\.cpp", # ROCProf error-level logs - r"^W[0-9]{8}.*simple_timer\.cpp", # ROCProf warning-level logs - r"^W[0-9]{8}.*generateRocpd\.cpp", # ROCProf warning-level logs - r"^E[0-9]{8}.*tool\.cpp", # ROCProf tool logs - "Opened result file:", # ROCProf result file messages - "SQLite3 generation ::", # ROCProf SQLite messages - r"\[rocprofv3\]", # ROCProf v3 messages - "rocpd_op:", # ROCProf operation logs - "rpd_tracer:", # ROCProf tracer logs + "Opened result file:", + "SQLite3 generation ::", + "rocpd_op:", + "rpd_tracer:", ] - benign_patterns.extend(extra_benign) + benign_substrings.extend(extra_benign) + benign_regexes = [ + # ROCProf/glog: E/W prefixes are log levels, not app errors + r"^E[0-9]{8}.*generateRocpd\.cpp", + r"^W[0-9]{8}.*simple_timer\.cpp", + r"^W[0-9]{8}.*generateRocpd\.cpp", + r"^E[0-9]{8}.*tool\.cpp", + r"\[rocprofv3\]", + ] + + # Scan in Python (no shell; literals vs regex benign rules are explicit). + with open( + log_file_path, + "r", + encoding="utf-8", + errors="ignore", + ) as _lf: + log_scan_text = _lf.read() - # Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns). - # Use subprocess (not console.sh) so the check runs silently and does not clutter console output. for pattern in error_patterns: - # Build exclusion regex: our own commands, output messages, and benign patterns. - # Use re.escape(pattern) so parentheses and other special chars are safe in grep -E. - pattern_escaped = re.escape(pattern) - exclusions = f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped}" - for benign in benign_patterns: - # Escape special regex characters in benign patterns - escaped_benign = benign.replace(".", r"\.").replace("(", r"\(").replace(")", r"\)") - exclusions += f"|{escaped_benign}" - exclusions += ")" - # Match pattern literally in the filtered log (grep -F avoids regex issues) - error_check_cmd = [ - "sh", - "-c", - f"grep -v -E '{exclusions}' {log_file_path} | grep -F -q -- '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'", - ] - try: - proc = subprocess.run( - error_check_cmd, - capture_output=True, - text=True, - timeout=60, + if log_text_has_error_pattern( + log_scan_text, + pattern, + benign_substrings, + benign_regexes, + ): + has_errors = True + print( + f"Found error pattern '{pattern}' in logs" ) - result = (proc.stdout or "").strip() - if result == "FOUND": - has_errors = True - print( - f"Found error pattern '{pattern}' in logs" - ) - break - except (subprocess.TimeoutExpired, OSError): - pass # Error checking is optional; treat as no match + break except Exception: pass # Error checking is optional elif not scan_logs: diff --git a/src/madengine/execution/container_runner_helpers.py b/src/madengine/execution/container_runner_helpers.py index 35f4fba4..64d499f6 100644 --- a/src/madengine/execution/container_runner_helpers.py +++ b/src/madengine/execution/container_runner_helpers.py @@ -5,6 +5,7 @@ Extracted so run_container logic is easier to test and maintain. """ +import re import typing # Default substrings matched in container run logs post-hoc (see ContainerRunner). @@ -67,8 +68,9 @@ def resolve_log_error_scan_config( Keys (in ``additional_context`` and/or ``model_info``; context wins): - - ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection. - - ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches. + - ``log_error_pattern_scan`` (default True): set False to skip post-run log error detection. + - ``log_error_benign_patterns``: list of extra **literal** substrings; a log line containing + any of them is excluded from error matching (not interpreted as regex). - ``log_error_patterns``: non-empty list of strings replaces the default error pattern list. Returns: @@ -102,6 +104,51 @@ def resolve_log_error_scan_config( return scan_enabled, error_patterns, extra_benign +def log_text_has_error_pattern( + log_text: str, + pattern: str, + benign_substrings: typing.Sequence[str], + benign_regexes: typing.Sequence[str] = (), +) -> bool: + """ + Whether *log_text* contains a literal *pattern* on some line that is not excluded. + + Exclusions (same intent as the old ``grep -v -E | grep -F`` pipeline): + + - Meta lines mentioning our own ``grep`` / "Found error pattern" machinery. + - *benign_substrings*: line skipped if any string appears as a **literal** substring. + - *benign_regexes*: line skipped if any compiled regex matches (for built-in ROCProf rules). + + User-supplied benign entries should use *benign_substrings* only so regex metacharacters + in config are not interpreted unless explicitly added to *benign_regexes*. + """ + pattern_escaped = re.escape(pattern) + try: + meta_excl = re.compile( + f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped})" + ) + except re.error: + return False + + compiled_benign: typing.List[re.Pattern[str]] = [] + for rx in benign_regexes: + try: + compiled_benign.append(re.compile(rx)) + except re.error: + continue + + for line in log_text.splitlines(): + if meta_excl.search(line): + continue + if any(s in line for s in benign_substrings): + continue + if any(br.search(line) for br in compiled_benign): + continue + if pattern in line: + return True + return False + + def resolve_run_timeout( model_info: typing.Dict, cli_timeout: int, diff --git a/tests/unit/test_container_runner_helpers.py b/tests/unit/test_container_runner_helpers.py index fcf58b10..a4d96539 100644 --- a/tests/unit/test_container_runner_helpers.py +++ b/tests/unit/test_container_runner_helpers.py @@ -4,6 +4,7 @@ from madengine.execution.container_runner_helpers import ( DEFAULT_LOG_ERROR_PATTERNS, + log_text_has_error_pattern, resolve_log_error_scan_config, ) @@ -82,3 +83,50 @@ def test_scan_toggle_coercion(self, raw, expected): {}, {"log_error_pattern_scan": raw} ) assert enabled is expected + + +class TestLogTextHasErrorPattern: + def test_finds_literal_pattern(self): + log = "line1\nRuntimeError: boom\n" + assert log_text_has_error_pattern(log, "RuntimeError:", []) + + def test_respects_benign_substring(self): + log = "ignore FutureWarning in this line\n" + assert not log_text_has_error_pattern( + log, + "FutureWarning", + ["FutureWarning"], + (), + ) + + def test_quotes_in_pattern_no_shell(self): + """Patterns with quotes must match literally; must not raise or crash.""" + log = "msg: can't happen\n" + assert log_text_has_error_pattern(log, "can't happen", []) + + def test_excludes_grep_meta_line(self): + log = "some grep -q stuff RuntimeError: x\nreal RuntimeError: bad\n" + # First line matches exclusion grep -q.*RuntimeError + assert log_text_has_error_pattern(log, "RuntimeError:", [], ()) + + def test_regex_benign_excludes_rocprof_style_line(self): + log = ( + "E12345678 generateRocpd.cpp: noise\n" + "clean RuntimeError: real issue\n" + ) + assert log_text_has_error_pattern( + log, + "RuntimeError:", + [], + (r"^E[0-9]{8}.*generateRocpd\.cpp",), + ) + + def test_user_benign_literal_parentheses(self): + # User-config benign strings must be literal substrings (not broken by ad-hoc regex escaping). + log = "info (benign marker) ok\nRuntimeError: real\n" + assert log_text_has_error_pattern( + log, + "RuntimeError:", + ["(benign marker)"], + (), + ) diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py index cb74cce0..6a78c52d 100644 --- a/tests/unit/test_validators.py +++ b/tests/unit/test_validators.py @@ -194,3 +194,120 @@ def test_validate_additional_context_tools_list_ok(self): ctx = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": []}' result = validate_additional_context(additional_context=ctx) assert result["tools"] == [] + + def test_validate_additional_context_log_error_keys_accepted(self): + """Valid types for log scan keys pass structure validation.""" + ctx = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": False, + "log_error_benign_patterns": ["noise", "(ok)"], + "log_error_patterns": ["OOM", "Killed"], + } + ) + result = validate_additional_context(additional_context=ctx) + assert result["log_error_pattern_scan"] is False + assert result["log_error_benign_patterns"] == ["noise", "(ok)"] + assert result["log_error_patterns"] == ["OOM", "Killed"] + + @pytest.mark.parametrize( + "leps", + [ + True, + "true", + 0, + 1, + None, + ], + ) + def test_validate_additional_context_log_error_pattern_scan_coercible_types( + self, leps + ): + ctx = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": leps, + } + ) + result = validate_additional_context(additional_context=ctx) + assert result["log_error_pattern_scan"] == leps + + def test_validate_additional_context_log_error_pattern_scan_rejects_bad_type(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_pattern_scan": [], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_benign_patterns_rejects_non_list( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_benign_patterns": "not-a-list", + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_benign_patterns_rejects_non_strings( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_benign_patterns": ["a", 1], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_empty_list(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": [], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_non_list(self): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": {}, + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_log_error_patterns_rejects_non_string_element( + self, + ): + bad = json.dumps( + { + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "log_error_patterns": ["ok", 2], + } + ) + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context(additional_context=bad) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS