Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 35 additions & 46 deletions src/madengine/execution/container_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from madengine.utils.path_utils import scripts_base_dir_from
from madengine.utils.run_details import get_build_number, get_pipeline
from madengine.execution.container_runner_helpers import (
log_text_has_error_pattern,
make_run_log_file_path,
resolve_log_error_scan_config,
resolve_run_timeout,
Expand Down Expand Up @@ -1263,60 +1264,48 @@ def run_container(
and os.path.exists(log_file_path)
):
try:
# Define benign patterns to exclude from error detection
# These are known warnings/info messages that should not trigger failures
benign_patterns = [
# Benign: literal substrings (incl. user extra_benign) vs regex (ROCProf lines).
benign_substrings = [
"Failed to establish connection to the metrics exporter agent",
"RpcError: Running out of retries to initialize the metrics agent",
"Metrics will not be exported",
"FutureWarning",
# ROCProf/glog logging patterns (E/W/I prefixes are log levels, not errors)
r"^E[0-9]{8}.*generateRocpd\.cpp", # ROCProf error-level logs
r"^W[0-9]{8}.*simple_timer\.cpp", # ROCProf warning-level logs
r"^W[0-9]{8}.*generateRocpd\.cpp", # ROCProf warning-level logs
r"^E[0-9]{8}.*tool\.cpp", # ROCProf tool logs
"Opened result file:", # ROCProf result file messages
"SQLite3 generation ::", # ROCProf SQLite messages
r"\[rocprofv3\]", # ROCProf v3 messages
"rocpd_op:", # ROCProf operation logs
"rpd_tracer:", # ROCProf tracer logs
"Opened result file:",
"SQLite3 generation ::",
"rocpd_op:",
"rpd_tracer:",
]
benign_patterns.extend(extra_benign)
benign_substrings.extend(extra_benign)
benign_regexes = [
# ROCProf/glog: E/W prefixes are log levels, not app errors
r"^E[0-9]{8}.*generateRocpd\.cpp",
r"^W[0-9]{8}.*simple_timer\.cpp",
r"^W[0-9]{8}.*generateRocpd\.cpp",
r"^E[0-9]{8}.*tool\.cpp",
r"\[rocprofv3\]",
]

# Scan in Python (no shell; literals vs regex benign rules are explicit).
with open(
log_file_path,
"r",
encoding="utf-8",
errors="ignore",
) as _lf:
log_scan_text = _lf.read()

# Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns).
# Use subprocess (not console.sh) so the check runs silently and does not clutter console output.
for pattern in error_patterns:
# Build exclusion regex: our own commands, output messages, and benign patterns.
# Use re.escape(pattern) so parentheses and other special chars are safe in grep -E.
pattern_escaped = re.escape(pattern)
exclusions = f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped}"
for benign in benign_patterns:
# Escape special regex characters in benign patterns
escaped_benign = benign.replace(".", r"\.").replace("(", r"\(").replace(")", r"\)")
exclusions += f"|{escaped_benign}"
exclusions += ")"
# Match pattern literally in the filtered log (grep -F avoids regex issues)
error_check_cmd = [
"sh",
"-c",
f"grep -v -E '{exclusions}' {log_file_path} | grep -F -q -- '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'",
]
try:
proc = subprocess.run(
error_check_cmd,
capture_output=True,
text=True,
timeout=60,
if log_text_has_error_pattern(
log_scan_text,
pattern,
benign_substrings,
benign_regexes,
):
has_errors = True
print(
f"Found error pattern '{pattern}' in logs"
)
result = (proc.stdout or "").strip()
if result == "FOUND":
has_errors = True
print(
f"Found error pattern '{pattern}' in logs"
)
break
except (subprocess.TimeoutExpired, OSError):
pass # Error checking is optional; treat as no match
break
except Exception:
pass # Error checking is optional
elif not scan_logs:
Expand Down
51 changes: 49 additions & 2 deletions src/madengine/execution/container_runner_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Extracted so run_container logic is easier to test and maintain.
"""

import re
import typing

# Default substrings matched in container run logs post-hoc (see ContainerRunner).
Expand Down Expand Up @@ -67,8 +68,9 @@ def resolve_log_error_scan_config(

Keys (in ``additional_context`` and/or ``model_info``; context wins):

- ``log_error_pattern_scan`` (default True): set False to skip grep-based failure detection.
- ``log_error_benign_patterns``: list of extra substrings/regex fragments excluded from matches.
- ``log_error_pattern_scan`` (default True): set False to skip post-run log error detection.
- ``log_error_benign_patterns``: list of extra **literal** substrings; a log line containing
any of them is excluded from error matching (not interpreted as regex).
- ``log_error_patterns``: non-empty list of strings replaces the default error pattern list.

Returns:
Expand Down Expand Up @@ -102,6 +104,51 @@ def resolve_log_error_scan_config(
return scan_enabled, error_patterns, extra_benign


def log_text_has_error_pattern(
log_text: str,
pattern: str,
benign_substrings: typing.Sequence[str],
benign_regexes: typing.Sequence[str] = (),
) -> bool:
"""
Whether *log_text* contains a literal *pattern* on some line that is not excluded.

Exclusions (same intent as the old ``grep -v -E | grep -F`` pipeline):

- Meta lines mentioning our own ``grep`` / "Found error pattern" machinery.
- *benign_substrings*: line skipped if any string appears as a **literal** substring.
- *benign_regexes*: line skipped if any compiled regex matches (for built-in ROCProf rules).

User-supplied benign entries should use *benign_substrings* only so regex metacharacters
in config are not interpreted unless explicitly added to *benign_regexes*.
"""
pattern_escaped = re.escape(pattern)
try:
meta_excl = re.compile(
f"(grep -q.*{pattern_escaped}|Found error pattern.*{pattern_escaped})"
)
except re.error:
return False

compiled_benign: typing.List[re.Pattern[str]] = []
for rx in benign_regexes:
try:
compiled_benign.append(re.compile(rx))
except re.error:
continue

for line in log_text.splitlines():
if meta_excl.search(line):
continue
if any(s in line for s in benign_substrings):
continue
if any(br.search(line) for br in compiled_benign):
continue
if pattern in line:
return True
return False


def resolve_run_timeout(
model_info: typing.Dict,
cli_timeout: int,
Expand Down
48 changes: 48 additions & 0 deletions tests/unit/test_container_runner_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from madengine.execution.container_runner_helpers import (
DEFAULT_LOG_ERROR_PATTERNS,
log_text_has_error_pattern,
resolve_log_error_scan_config,
)

Expand Down Expand Up @@ -82,3 +83,50 @@ def test_scan_toggle_coercion(self, raw, expected):
{}, {"log_error_pattern_scan": raw}
)
assert enabled is expected


class TestLogTextHasErrorPattern:
def test_finds_literal_pattern(self):
log = "line1\nRuntimeError: boom\n"
assert log_text_has_error_pattern(log, "RuntimeError:", [])

def test_respects_benign_substring(self):
log = "ignore FutureWarning in this line\n"
assert not log_text_has_error_pattern(
log,
"FutureWarning",
["FutureWarning"],
(),
)

def test_quotes_in_pattern_no_shell(self):
"""Patterns with quotes must match literally; must not raise or crash."""
log = "msg: can't happen\n"
assert log_text_has_error_pattern(log, "can't happen", [])

def test_excludes_grep_meta_line(self):
log = "some grep -q stuff RuntimeError: x\nreal RuntimeError: bad\n"
# First line matches exclusion grep -q.*RuntimeError
assert log_text_has_error_pattern(log, "RuntimeError:", [], ())

def test_regex_benign_excludes_rocprof_style_line(self):
log = (
"E12345678 generateRocpd.cpp: noise\n"
"clean RuntimeError: real issue\n"
)
assert log_text_has_error_pattern(
log,
"RuntimeError:",
[],
(r"^E[0-9]{8}.*generateRocpd\.cpp",),
)

def test_user_benign_literal_parentheses(self):
# User-config benign strings must be literal substrings (not broken by ad-hoc regex escaping).
log = "info (benign marker) ok\nRuntimeError: real\n"
assert log_text_has_error_pattern(
log,
"RuntimeError:",
["(benign marker)"],
(),
)
117 changes: 117 additions & 0 deletions tests/unit/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,120 @@ def test_validate_additional_context_tools_list_ok(self):
ctx = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "tools": []}'
result = validate_additional_context(additional_context=ctx)
assert result["tools"] == []

def test_validate_additional_context_log_error_keys_accepted(self):
"""Valid types for log scan keys pass structure validation."""
ctx = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_pattern_scan": False,
"log_error_benign_patterns": ["noise", "(ok)"],
"log_error_patterns": ["OOM", "Killed"],
}
)
result = validate_additional_context(additional_context=ctx)
assert result["log_error_pattern_scan"] is False
assert result["log_error_benign_patterns"] == ["noise", "(ok)"]
assert result["log_error_patterns"] == ["OOM", "Killed"]

@pytest.mark.parametrize(
"leps",
[
True,
"true",
0,
1,
None,
],
)
def test_validate_additional_context_log_error_pattern_scan_coercible_types(
self, leps
):
ctx = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_pattern_scan": leps,
}
)
result = validate_additional_context(additional_context=ctx)
assert result["log_error_pattern_scan"] == leps

def test_validate_additional_context_log_error_pattern_scan_rejects_bad_type(self):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_pattern_scan": [],
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS

def test_validate_additional_context_log_error_benign_patterns_rejects_non_list(
self,
):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_benign_patterns": "not-a-list",
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS

def test_validate_additional_context_log_error_benign_patterns_rejects_non_strings(
self,
):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_benign_patterns": ["a", 1],
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS

def test_validate_additional_context_log_error_patterns_rejects_empty_list(self):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_patterns": [],
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS

def test_validate_additional_context_log_error_patterns_rejects_non_list(self):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_patterns": {},
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS

def test_validate_additional_context_log_error_patterns_rejects_non_string_element(
self,
):
bad = json.dumps(
{
"gpu_vendor": "AMD",
"guest_os": "UBUNTU",
"log_error_patterns": ["ok", 2],
}
)
with pytest.raises(typer.Exit) as exc_info:
validate_additional_context(additional_context=bad)
assert exc_info.value.exit_code == ExitCode.INVALID_ARGS