diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index df10feb2..519db432 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -14,6 +14,7 @@
 )
 from benchmarks.commit0.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
@@ -507,6 +508,14 @@ def evaluate_instance(
         # Final debug log
         logger.info(f"Final eval_result: {eval_result}")
 
+        # Log instance summary
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            git_patch=git_patch or "",
+            logger=logger,
+        )
+
         # Save workspace as zip (if supported by workspace implementation)
         zip_dest = os.path.join(
             self.metadata.eval_output_dir, "repos", repo_name, f"{repo_name}.zip"
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 8f52d171..9882dfd8 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -14,6 +14,7 @@
 
 from benchmarks.gaia.config import INFER_DEFAULTS
 from benchmarks.gaia.scorer import question_scorer
 from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
@@ -364,6 +365,12 @@ def evaluate_instance(
             f"model_answer='{model_answer}', ground_truth='{ground_truth}'"
         )
 
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            logger=logger,
+        )
+
         # Collect history
 
         # Return evaluation output
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index afeb7f6e..c4166565 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -15,6 +15,7 @@
 from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
@@ -399,6 +400,14 @@ def evaluate_instance(
         )
         git_patch = git_patch_result.stdout
 
+        # Log instance summary
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            git_patch=git_patch or "",
+            logger=logger,
+        )
+
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index b9afff4f..5ba53811 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -14,6 +14,7 @@
 
 from benchmarks.openagentsafety.build_images import build_workspace_image
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
@@ -513,6 +514,12 @@ def event_callback(event) -> None:
             logger.warning(f"No evaluator_code for {instance.id}")
             eval_result = {"error": "No evaluator code provided"}
 
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            logger=logger,
+        )
+
         # Collect cost metrics from LLM
         metrics = None
         if hasattr(self.metadata.llm, "metrics"):
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index daafe8ad..e9a1abb1 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -16,6 +16,7 @@
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
@@ -306,6 +307,14 @@ def evaluate_instance(
         )
         git_patch = git_patch_result.stdout
 
+        # Log instance summary
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            git_patch=git_patch,
+            logger=logger,
+        )
+
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 02101697..89e1e44b 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -13,6 +13,7 @@
 from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
@@ -394,6 +395,14 @@ def evaluate_instance(
         )
         git_patch = git_patch_result.stdout
 
+        # Log instance summary
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            git_patch=git_patch or "",
+            logger=logger,
+        )
+
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index af2724bb..d28a68d4 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -7,6 +7,7 @@
 
 from benchmarks.swtbench.config import INFER_DEFAULTS
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
@@ -327,6 +328,14 @@ def evaluate_instance(
         )
         git_patch = git_patch_result.stdout
 
+        # Log instance summary
+        summarize_instance(
+            instance_id=instance.id,
+            conversation=conversation,
+            git_patch=git_patch or "",
+            logger=logger,
+        )
+
         out = EvalOutput(
             instance_id=instance.id,
             attempt=self.current_attempt,
diff --git a/benchmarks/utils/console_logging.py b/benchmarks/utils/console_logging.py
new file mode 100644
index 00000000..b8653bdc
--- /dev/null
+++ b/benchmarks/utils/console_logging.py
@@ -0,0 +1,577 @@
+from __future__ import annotations
+
+import logging
+import os
+import re
+import sys
+from datetime import datetime
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from openhands.sdk import Event
+
+# ---------------------------------------------------------------------------
+# Logging configuration
+# ---------------------------------------------------------------------------
+RICH_LOGGING_ENV_VAR = "RICH_LOGGING"
+
+
+def _rich_logging_enabled() -> bool:
+    """Check if rich logging is enabled via environment variable."""
+    return os.getenv(RICH_LOGGING_ENV_VAR, "").lower() in ("1", "true", "yes")
+
+
+# ---------------------------------------------------------------------------
+# ANSI color constants
+# ---------------------------------------------------------------------------
+RESET = "\033[0m"
+DIM = "\033[2m"
+
+BLACK = "\033[30m"
+WHITE = "\033[37m"
+RED = "\033[31m"
+YELLOW = "\033[33m"
+MAGENTA = "\033[35m"
+CYAN_BRIGHT = "\033[96m"
+WHITE_BOLD = "\033[1;37m"
+
+BG_BLACK = "\033[40m"
+BG_RED = "\033[41m"
+BG_YELLOW = "\033[43m"
+BG_BLUE = "\033[44m"
+BG_MAGENTA = "\033[45m"
+BG_CYAN = "\033[46m"
+BG_WHITE = "\033[47m"
+
+_ANSI_RE = re.compile(r"\x1b\[[0-9;]*m")
+_WS_RE = re.compile(r"\s+")
+
+
+def strip_ansi(s: str) -> str:
+    """Remove ANSI escape sequences."""
+    return _ANSI_RE.sub("", s)
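+
+
+# Example (illustrative):
+#
+#     strip_ansi(f"{RED}boom{RESET}") -> "boom"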
+
+
+# ---------------------------------------------------------------------------
+# Line formatting helpers
+# ---------------------------------------------------------------------------
+
+
+def _format_timestamp(dt: datetime | None = None) -> str:
+    dt = dt or datetime.now()
+    return f"{DIM}{dt.strftime('%Y-%m-%d %H:%M:%S')}{RESET}"
+
+
+def _format_sample_id(short_id: str) -> str:
+    return f"{DIM}[{short_id}]{RESET}"
+
+
+def _format_tag(tag: str, *, bg: str, fg: str = WHITE_BOLD) -> str:
+    return f"{bg}{fg} {tag} {RESET}"
+
+
+def format_line(
+    *,
+    short_id: str,
+    tag: str,
+    message: str,
+    tag_bg: str,
+    tag_fg: str = WHITE_BOLD,
+    message_color: str = CYAN_BRIGHT,
+    newline_before: bool = False,
+    newline_after: bool = False,
+) -> str:
+    line = (
+        f"{_format_timestamp()} {_format_sample_id(short_id)} "
+        f"{_format_tag(tag, bg=tag_bg, fg=tag_fg)} {message_color}{message}{RESET}"
+    )
+    if newline_before:
+        line = "\n" + line
+    if newline_after:
+        line = line + "\n"
+    return line
+
+
+def _format_box_prefix(*, short_id: str, tag: str, tag_bg: str) -> str:
+    return (
+        f"{_format_timestamp()} {_format_sample_id(short_id)} "
+        f"{_format_tag(tag, bg=tag_bg)} │"
+    )
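+
+
+# Example (illustrative; the ANSI escapes are omitted from the rendered result):
+#
+#     format_line(
+#         short_id="django-12345", tag="WARN", message="retrying", tag_bg=BG_YELLOW
+#     )
+#     -> "2025-02-03 10:30:45 [django-12345]  WARN  retrying"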
+
+
+# ---------------------------------------------------------------------------
+# Trajectory line formatting (per-event console output)
+# ---------------------------------------------------------------------------
+
+
+def _one_line(s: str) -> str:
+    return _WS_RE.sub(" ", s).strip()
+
+
+def _truncate(s: str, max_len: int) -> str:
+    if len(s) <= max_len:
+        return s
+    return s[: max_len - 3] + "..."
+
+
+def _extract_tool_input_preview(event: "Event") -> str:
+    action = getattr(event, "action", None)
+    if action is None:
+        return ""
+    for attr, label in (
+        ("command", "cmd"),
+        ("path", "path"),
+        ("file_path", "path"),
+        ("target_file", "path"),
+        ("query", "query"),
+    ):
+        if hasattr(action, attr):
+            v = getattr(action, attr)
+            if isinstance(v, str) and v.strip():
+                return f" {label}={_truncate(_one_line(v), 120)!r}"
+    for attr, label in (
+        ("new_string", "new"),
+        ("new_str", "new"),
+        ("old_string", "old"),
+    ):
+        if hasattr(action, attr):
+            v = getattr(action, attr)
+            if isinstance(v, str) and v.strip():
+                return f" {label}={_truncate(_one_line(v), 80)!r}"
+    return ""
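+
+
+# Example (illustrative): for a bash action with command="ls -la", the preview is
+# " cmd='ls -la'"; for an action exposing none of the known attributes, "".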
+
+
+def format_trajectory_line(
+    event: "Event", *, short_id: str, tool_call_index: int
+) -> str | None:
+    """Format a single-line trajectory summary for an event."""
+    # Avoid top-level import to prevent circular dependencies
+    from openhands.sdk.event import (
+        ActionEvent,
+        AgentErrorEvent,
+        MessageEvent,
+        ObservationEvent,
+    )
+
+    # -------------------------------------------------------------------------
+    # Tool call logs (ActionEvent with tool_name)
+    # #N = tool call counter within this instance (1st call, 2nd call, ...)
+    # Example log:
+    #   10:30:45 [django-12345] TOOL │ ▶ bash #1 cmd='ls'
+    # -------------------------------------------------------------------------
+    if isinstance(event, ActionEvent) and event.tool_name:
+        thought = ""
+        if event.action is not None and getattr(event.action, "thought", None):
+            thought = str(getattr(event.action, "thought") or "")[:50]
+            if thought:
+                thought = f" {DIM}// {thought}...{RESET}"
+        prefix = _format_box_prefix(short_id=short_id, tag="TOOL", tag_bg=BG_BLUE)
+        args = _extract_tool_input_preview(event)
+        args_dim = f"{DIM}{args}{RESET}" if args else ""
+        return (
+            f"{prefix} {WHITE}▶ {event.tool_name} #{tool_call_index}{RESET}"
+            f"{args_dim}{thought}"
+        )
+
+    # -------------------------------------------------------------------------
+    # Tool result log (ObservationEvent)
+    # Example log:
+    #   10:30:46 [django-12345] TOOL │ └─ ok            (exit_code=0)
+    #   10:30:47 [django-12345] WARN │ └─ exit=1        (exit_code!=0)
+    #   10:30:48 [django-12345] WARN │ └─ tool_error    (is_error=True)
+    # -------------------------------------------------------------------------
+    if isinstance(event, ObservationEvent):
+        obs = event.observation
+        exit_code = getattr(obs, "exit_code", None)
+        is_error = bool(getattr(obs, "is_error", False))
+        if exit_code is not None:
+            if exit_code == 0:
+                prefix = _format_box_prefix(
+                    short_id=short_id, tag="TOOL", tag_bg=BG_BLUE
+                )
+                return f"{prefix} {WHITE}└─ ok{RESET}"
+            prefix = _format_box_prefix(short_id=short_id, tag="WARN", tag_bg=BG_YELLOW)
+            return f"{prefix} {YELLOW}└─ exit={exit_code}{RESET}"
+        if is_error:
+            prefix = _format_box_prefix(short_id=short_id, tag="WARN", tag_bg=BG_YELLOW)
+            return f"{prefix} {YELLOW}└─ tool_error{RESET}"
+        return None  # Skip non-terminal observations (e.g., intermediate file reads)
+
+    # -------------------------------------------------------------------------
+    # Agent-side error log (AgentErrorEvent)
+    # Example output:
+    #   10:30:49 [django-12345] ERROR │ └─ error
+    # -------------------------------------------------------------------------
+    if isinstance(event, AgentErrorEvent):
+        prefix = _format_box_prefix(short_id=short_id, tag="ERROR", tag_bg=BG_RED)
+        return f"{prefix} {RED}└─ error{RESET}"
+
+    # -------------------------------------------------------------------------
+    # Agent text message (MessageEvent from agent)
+    # Displays the message content, or the full message object if content is empty
+    # Example output:
+    #   10:30:50 [django-12345] MESSAGE│ I found the bug in line 42...
+    #   10:30:51 [django-12345] MESSAGE│ (empty) {"tool_calls": [...]}
+    # -------------------------------------------------------------------------
+    if isinstance(event, MessageEvent) and event.source == "agent":
+        llm_msg = event.llm_message
+        prefix = _format_box_prefix(
+            short_id=short_id, tag="MESSAGE", tag_bg=BG_MAGENTA
+        )
+        if llm_msg and hasattr(llm_msg, "content") and llm_msg.content:
+            for block in llm_msg.content:
+                if hasattr(block, "text"):
+                    text = _truncate(_one_line(str(getattr(block, "text") or "")), 60)
+                    if text:
+                        return f"{prefix} {WHITE_BOLD}{text}{RESET}"
+        try:
+            payload = (
+                _one_line(llm_msg.model_dump_json(exclude_none=True))
+                if llm_msg
+                else "null"
+            )
+        except Exception:
+            payload = "null"
+        return f"{prefix} {WHITE_BOLD}(empty) {payload}{RESET}"
+
+    # Other events (user messages, system events, etc.) are not displayed
+    return None
+
+
+def print_trajectory_line(
+    event: "Event", *, short_id: str, tool_call_index: int
+) -> None:
+    """Print a trajectory line to the real stdout when rich logging is enabled."""
+    if not _rich_logging_enabled():
+        return
+    try:
+        line = format_trajectory_line(
+            event, short_id=short_id, tool_call_index=tool_call_index
+        )
+        if line:
+            print(line, file=sys.__stdout__)
+            sys.__stdout__.flush()
+    except Exception:
+        pass
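+
+
+# Example wiring (a minimal sketch; the real integration lives in
+# benchmarks.utils.conversation.build_event_persistence_callback, and
+# ActionEvent here is assumed to come from openhands.sdk.event):
+#
+#     tool_calls = 0
+#
+#     def on_event(event: "Event") -> None:
+#         global tool_calls
+#         if isinstance(event, ActionEvent) and event.tool_name:
+#             tool_calls += 1
+#         print_trajectory_line(event, short_id="demo", tool_call_index=tool_calls)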
+
+
+# ---------------------------------------------------------------------------
+# Logger setup for multiprocessing workers
+# ---------------------------------------------------------------------------
+
+
+class _ConsoleFilter(logging.Filter):
+    """Only pass important messages to the console."""
+
+    IMPORTANT_PATTERNS = (
+        "[INSTANCE]",
+        "[LLM_ANALYSIS]",
+        "[EMPTY_PATCH]",
+        "[PATCH_DIAG]",
+        "=== Evaluation",
+        "Docker workspace",
+        "repo_path:",
+    )
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        if record.levelno >= logging.WARNING:
+            return True
+        msg = record.getMessage()
+        return any(p in msg for p in self.IMPORTANT_PATTERNS)
+
+
+class _ColorFormatter(logging.Formatter):
+    """Formatter that applies colored tags based on message content/level."""
+
+    def __init__(self, instance_id: str) -> None:
+        super().__init__()
+        self._short_id = (
+            instance_id.split("__")[-1][:20]
+            if "__" in instance_id
+            else instance_id[:20]
+        )
+
+    def format(self, record: logging.LogRecord) -> str:
+        msg = record.getMessage()
+        if "[INSTANCE]" in msg:
+            pretty = msg
+            if "status=ERROR" in pretty:
+                pretty = pretty.replace("status=ERROR", f"status={RED}ERROR{RESET}")
+            return format_line(
+                short_id=self._short_id,
+                tag="RESULT",
+                message=pretty,
+                tag_bg=BG_BLUE,
+                message_color="",
+                newline_before=True,
+                newline_after=True,
+            )
+        if "[LLM_ANALYSIS]" in msg:
+            return format_line(
+                short_id=self._short_id,
+                tag="LLM",
+                message=msg,
+                tag_bg=BG_BLUE,
+                message_color=CYAN_BRIGHT,
+            )
+        if "[EMPTY_PATCH]" in msg or "[PATCH_DIAG]" in msg:
+            return format_line(
+                short_id=self._short_id,
+                tag="PATCH",
+                message=msg,
+                tag_bg=BG_MAGENTA,
+                message_color=MAGENTA,
+            )
+        if record.levelno >= logging.ERROR:
+            return format_line(
+                short_id=self._short_id,
+                tag="ERROR",
+                message=msg,
+                tag_bg=BG_RED,
+                message_color=RED,
+            )
+        if record.levelno >= logging.WARNING:
+            return format_line(
+                short_id=self._short_id,
+                tag="WARN",
+                message=msg,
+                tag_bg=BG_YELLOW,
+                message_color=YELLOW,
+            )
+        return format_line(
+            short_id=self._short_id,
+            tag="INFO",
+            message=msg,
+            tag_bg=BG_WHITE,
+            tag_fg=BLACK,
+            message_color=WHITE,
+        )
+
+
+class _PlainFormatter(logging.Formatter):
+    """Plain formatter matching the original (non-rich) logging style."""
+
+    def __init__(self, instance_id: str) -> None:
+        super().__init__(
+            fmt=f"Instance {instance_id} - %(asctime)s - %(levelname)s - %(message)s"
+        )
+
+
+def setup_instance_logging(log_dir: str, instance_id: str) -> None:
+    """
+    Configure logging for a worker process handling a single instance.
+
+    Behavior depends on the RICH_LOGGING env var:
+    - RICH_LOGGING=1: colored console output with filtered messages
+    - RICH_LOGGING unset: original plain logging style
+
+    In both modes:
+    - All INFO+ logs go to the per-instance file (instance_<instance_id>.log)
+    - The console shows WARNING+ and selected important patterns
+    """
+    log_file = os.path.join(log_dir, f"instance_{instance_id}.log")
+    output_log_file = os.path.join(log_dir, f"instance_{instance_id}.output.log")
+    short_id = (
+        instance_id.split("__")[-1][:20] if "__" in instance_id else instance_id[:20]
+    )
+
+    root_logger = logging.getLogger()
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+
+    rich_mode = _rich_logging_enabled()
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.__stdout__ if rich_mode else None)
+    console_handler.setLevel(logging.INFO)
+
+    if rich_mode:
+        console_handler.addFilter(_ConsoleFilter())
+        console_handler.setFormatter(_ColorFormatter(instance_id))
+    else:
+        console_handler.setFormatter(_PlainFormatter(instance_id))
+
+    root_logger.addHandler(console_handler)
+    root_logger.setLevel(logging.DEBUG)
+
+    # Print startup message
+    if rich_mode:
+        print(
+            format_line(
+                short_id=short_id,
+                tag="START",
+                message=f"{instance_id} | Logs: {log_file}",
+                tag_bg=BG_BLUE,
+                message_color=CYAN_BRIGHT,
+                newline_before=True,
+            ),
+            file=sys.__stdout__,
+        )
+        sys.__stdout__.flush()
+    else:
+        # Original startup message style
+        root_logger.info(
+            f"""
+    === Evaluation Started (instance {instance_id}) ===
+    View live output:
+      • tail -f {log_file}  (logger)
+      • tail -f {output_log_file}  (stdout/stderr)
+    ===============================================
+    """.strip()
+        )
+        # Original behavior: set console to WARNING+ after the initial message
+        console_handler.setLevel(logging.WARNING)
+
+    # File handler for full logs
+    os.makedirs(log_dir, exist_ok=True)
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setFormatter(
+        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+    )
+    file_handler.setLevel(logging.INFO)
+    root_logger.addHandler(file_handler)
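+
+
+# Example (a sketch; the directory and instance id are hypothetical):
+#
+#     setup_instance_logging("/tmp/eval_logs", "django__django-12345")
+#     logging.getLogger(__name__).warning("visible on the console in both modes")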
+
+
+# ---------------------------------------------------------------------------
+# Instance summary (printed at end of each instance)
+# ---------------------------------------------------------------------------
+
+
+def _ansi(enabled: bool, code: str) -> str:
+    return f"\033[{code}m" if enabled else ""
+
+
+def summarize_instance(
+    *,
+    instance_id: str,
+    conversation: object,
+    git_patch: str | None = None,
+    commit_exit_code: int = 0,
+    repo_has_changes: bool = False,
+    logger: logging.Logger | None = None,
+) -> None:
+    """Log a one-line summary for a completed instance."""
+    # Lazy imports to avoid circular dependencies
+    from openhands.sdk.conversation.state import ConversationExecutionStatus
+    from openhands.sdk.event import ActionEvent, AgentErrorEvent, MessageEvent
+    from openhands.sdk.event.conversation_error import ConversationErrorEvent
+    from openhands.sdk.tool.builtins.finish import FinishAction
+
+    if logger is None:
+        logger = logging.getLogger(__name__)
+
+    # Extract events from conversation state
+    try:
+        events = list(conversation.state.events)  # type: ignore[attr-defined]
+    except Exception:
+        events = []
+
+    n_tool_calls = sum(
+        isinstance(e, ActionEvent)
+        and getattr(e, "source", None) == "agent"
+        and bool(getattr(e, "tool_name", None))
+        and getattr(e, "action", None) is not None
+        for e in events
+    )
+    n_agent_msgs = sum(
+        isinstance(e, MessageEvent) and getattr(e, "source", None) == "agent"
+        for e in events
+    )
+    n_user_msgs = sum(
+        isinstance(e, MessageEvent) and getattr(e, "source", None) == "user"
+        for e in events
+    )
+
+    # Error counts
+    n_agent_errors = sum(isinstance(e, AgentErrorEvent) for e in events)
+    n_conv_errors = sum(isinstance(e, ConversationErrorEvent) for e in events)
+
+    # Execution status
+    try:
+        status = conversation.state.execution_status
+    except Exception:
+        status = None
+
+    healthy = (
+        (n_agent_errors == 0)
+        and (n_conv_errors == 0)
+        and (status != ConversationExecutionStatus.ERROR if status else True)
+    )
+
+    # Check if the agent ended the run with the finish tool
+    finished_with_finish = any(
+        isinstance(e, ActionEvent)
+        and isinstance(getattr(e, "action", None), FinishAction)
+        for e in events
+    )
+
+    if status and status != ConversationExecutionStatus.FINISHED:
+        finish_reason = f"status={status.value}"
+    elif finished_with_finish:
+        finish_reason = "finish_tool"
+    else:
+        finish_reason = "finished_no_finish_tool"
+
+    # Patch preview
+    patch_preview = (git_patch or "").strip().replace("\n", "\\n")
+    if len(patch_preview) > 180:
+        patch_preview = patch_preview[:180] + "…"
+    patch_empty = not bool((git_patch or "").strip())
+
+    # Colors (respect the NO_COLOR env var)
+    color = os.getenv("NO_COLOR") is None
+    ok_c = _ansi(color, "32")
+    warn_c = _ansi(color, "33")
+    err_c = _ansi(color, "31")
+    white_c = _ansi(color, "37")
+    dim_c = _ansi(color, "2")
+    reset = _ansi(color, "0")
+
+    # Build colored tags
+    health_tag = f"{ok_c}OK{reset}" if healthy else f"{warn_c}WITH_ISSUES{reset}"
+    patch_tag = f"{warn_c}EMPTY{reset}" if patch_empty else f"{ok_c}NONEMPTY{reset}"
+    reason_tag = (
+        f"{ok_c}{finish_reason}{reset}"
+        if finish_reason == "finish_tool"
+        else f"{warn_c}{finish_reason}{reset}"
+    )
+    commit_tag = (
+        f"{ok_c}{commit_exit_code}{reset}"
+        if commit_exit_code == 0
+        else f"{warn_c}{commit_exit_code}{reset}"
+    )
+
+    # Tool call count coloring
+    if n_tool_calls == 0:
+        tool_calls_tag = f"{err_c}{n_tool_calls}{reset}"
+    elif n_tool_calls < n_agent_msgs:
+        tool_calls_tag = f"{warn_c}{n_tool_calls}{reset}"
+    else:
+        tool_calls_tag = f"{white_c}{n_tool_calls}{reset}"
+
+    # Error coloring
+    errors_tag_color = warn_c if (n_agent_errors > 0 or n_conv_errors > 0) else ok_c
+    errors_tag = f"{errors_tag_color}{n_agent_errors}/{n_conv_errors}{reset}"
+
+    if git_patch is not None:
+        # Prefix with [INSTANCE] so rich console logging:
+        # - passes the _ConsoleFilter (important pattern)
+        # - renders with the "RESULT" tag + surrounding newlines in _ColorFormatter
+        logger.info(
+            "[INSTANCE] %s patch=%s commit=%s changes=%s "
+            "msgs(a/u)=%d/%d tool_calls=%s "
+            "errors(agent/conv)=%s end=%s %spreview='%s'%s",
+            health_tag,
+            patch_tag,
+            commit_tag,
+            "Y" if repo_has_changes else "N",
+            n_agent_msgs,
+            n_user_msgs,
+            tool_calls_tag,
+            errors_tag,
+            reason_tag,
+            dim_c,
+            patch_preview,
+            reset,
+        )
+    else:
+        logger.info(
+            "[INSTANCE] %s msgs(a/u)=%d/%d tool_calls=%s "
+            "errors(agent/conv)=%s end=%s",
+            health_tag,
+            n_agent_msgs,
+            n_user_msgs,
+            tool_calls_tag,
+            errors_tag,
+            reason_tag,
+        )
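+
+
+# Example (mirrors the call sites added to the run_infer.py harnesses above):
+#
+#     summarize_instance(
+#         instance_id=instance.id,
+#         conversation=conversation,
+#         git_patch=git_patch or "",
+#         logger=logger,
+#     )
+#
+# Omitting git_patch (as GAIA and OpenAgentSafety do) produces the shorter
+# summary line without the patch/commit/changes/preview fields.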
diff --git a/benchmarks/utils/conversation.py b/benchmarks/utils/conversation.py
index 07578ad2..ffdf834c 100644
--- a/benchmarks/utils/conversation.py
+++ b/benchmarks/utils/conversation.py
@@ -3,6 +3,9 @@
 from typing import Any, Callable
 
 from openhands.sdk import Event, get_logger
+from openhands.sdk.event import ActionEvent
+
+from benchmarks.utils.console_logging import print_trajectory_line
 
 logger = get_logger(__name__)
@@ -67,7 +70,7 @@ def _truncate(s: str, max_len: int) -> str:
 
 
 def build_event_persistence_callback(
-    run_id: str, instance_id: str, attempt: int = 1
+    run_id: str, instance_id: str, attempt: int = 1, show_trajectory: bool = True
 ) -> ConversationCallback:
     """
     Create a callback that logs events for later retrieval.
@@ -81,15 +84,31 @@
         run_id: Unique identifier for this evaluation run (e.g., job name).
         instance_id: Identifier for the evaluation instance.
         attempt: Attempt number for retries (1-indexed).
+        show_trajectory: If True, print trajectory logs to the console.
 
     Returns:
         A callback function to be passed to Conversation.
     """
+    short_id = (
+        instance_id.split("__")[-1][:20] if "__" in instance_id else instance_id[:20]
+    )
+    tool_call_index = 0
     # if not bool(os.environ.get(CONVERSATION_EVENT_LOGGING_ENV_VAR, True)):
     #     return lambda event: None
     # TODO: Re-enable the above once we have debugged runtime issues
 
     def _persist_event(event: Event) -> None:
+        nonlocal tool_call_index
+
+        # Print trajectory line to console (uses console_logging helper)
+        if show_trajectory:
+            if isinstance(event, ActionEvent) and event.tool_name:
+                tool_call_index += 1
+            print_trajectory_line(
+                event, short_id=short_id, tool_call_index=tool_call_index
+            )
+
+        # Persist event to logs
         try:
            serialized = event.model_dump_json(exclude_none=True)
            event_size = len(serialized.encode("utf-8"))
diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
index 32177e38..8928b6ea 100644
--- a/benchmarks/utils/evaluation.py
+++ b/benchmarks/utils/evaluation.py
@@ -736,80 +736,11 @@ def _process_one_mp(
 
 def reset_logger_for_multiprocessing(log_dir: str, instance_id: str) -> None:
     """Reset the logger for multiprocessing with instance-specific logging.
 
-    Save logs to a separate file for each instance, instead of trying to write to the
-    same file/console from multiple processes. This provides:
-    - One INFO line to console at start with tail hint
-    - All subsequent logs go to instance-specific file
-    - Only WARNING+ messages go to console after initial message
-
-    Args:
-        log_dir: Directory to store log files
-        instance_id: Unique identifier for the instance being processed
+    See benchmarks.utils.console_logging.setup_instance_logging for details.
     """
-    import logging
-
-    # Set up logger
-    log_file = os.path.join(log_dir, f"instance_{instance_id}.log")
-    output_log_file = os.path.join(log_dir, f"instance_{instance_id}.output.log")
-
-    # Get root logger and remove all existing handlers
-    root_logger = logging.getLogger()
-    for handler in root_logger.handlers[:]:
-        root_logger.removeHandler(handler)
-
-    class ConversationEventFilter(logging.Filter):
-        def filter(self, record: logging.LogRecord) -> bool:  # type: ignore[override]
-            msg = record.getMessage()
-            return msg in {"conversation_event", "conversation_event_metadata"}
-
-    # Datadog/console handler for conversation events (bypasses stdout redirection)
-    # if bool(os.environ.get(CONVERSATION_EVENT_LOGGING_ENV_VAR, False)):
-    if True:  # force datadog logging
-        from pythonjsonlogger.json import JsonFormatter
-
-        dd_handler = logging.StreamHandler(sys.__stdout__)
-        dd_handler.setLevel(logging.INFO)
-        dd_handler.addFilter(ConversationEventFilter())
-        dd_handler.setFormatter(
-            JsonFormatter(
-                fmt="%(asctime)s %(levelname)s %(name)s %(message)s %(run_id)s %(instance_id)s %(attempt)s %(event_type)s %(event_size)s"
-            )
-        )
-        root_logger.addHandler(dd_handler)
-
-    # Create console handler for initial message
-    console_handler = logging.StreamHandler()
-    console_handler.setLevel(logging.INFO)
-    console_handler.setFormatter(
-        logging.Formatter(
-            f"Instance {instance_id} - " + "%(asctime)s - %(levelname)s - %(message)s"
-        )
-    )
-    root_logger.addHandler(console_handler)
-    root_logger.setLevel(logging.DEBUG)
-
-    # Print one INFO line with helpful hint
-    root_logger.info(
-        f"""
-    === Evaluation Started (instance {instance_id}) ===
-    View live output:
-      • tail -f {log_file}  (logger)
-      • tail -f {output_log_file}  (stdout/stderr)
-    ===============================================
-    """.strip()
-    )
+    from benchmarks.utils.console_logging import setup_instance_logging
 
-    # Now set console to WARNING+ only
-    console_handler.setLevel(logging.WARNING)
-
-    # Add file handler for detailed logs
-    os.makedirs(log_dir, exist_ok=True)
-    file_handler = logging.FileHandler(log_file)
-    file_handler.setFormatter(
-        logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
-    )
-    file_handler.setLevel(logging.INFO)
-    root_logger.addHandler(file_handler)
+    setup_instance_logging(log_dir, instance_id)
 
 
 @contextmanager
diff --git a/benchmarks/utils/logging.md b/benchmarks/utils/logging.md
new file mode 100644
index 00000000..0409d321
--- /dev/null
+++ b/benchmarks/utils/logging.md
@@ -0,0 +1,101 @@
+# Rich Logging
+
+## Enabling Rich Logs
+
+Set the environment variable:
+
+```bash
+export RICH_LOGGING=1
+```
+
+When disabled (the default), the original logging style is used.
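+
+For example (the invocation shown is illustrative; substitute your benchmark's
+usual arguments):
+
+```bash
+RICH_LOGGING=1 python benchmarks/swebench/run_infer.py <your-args>
+```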
""" - import logging - - # Set up logger - log_file = os.path.join(log_dir, f"instance_{instance_id}.log") - output_log_file = os.path.join(log_dir, f"instance_{instance_id}.output.log") - - # Get root logger and remove all existing handlers - root_logger = logging.getLogger() - for handler in root_logger.handlers[:]: - root_logger.removeHandler(handler) - - class ConversationEventFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: # type: ignore[override] - msg = record.getMessage() - return msg in {"conversation_event", "conversation_event_metadata"} - - # Datadog/console handler for conversation events (bypasses stdout redirection) - # if bool(os.environ.get(CONVERSATION_EVENT_LOGGING_ENV_VAR, False)): - if True: # force datadog logging - from pythonjsonlogger.json import JsonFormatter - - dd_handler = logging.StreamHandler(sys.__stdout__) - dd_handler.setLevel(logging.INFO) - dd_handler.addFilter(ConversationEventFilter()) - dd_handler.setFormatter( - JsonFormatter( - fmt="%(asctime)s %(levelname)s %(name)s %(message)s %(run_id)s %(instance_id)s %(attempt)s %(event_type)s %(event_size)s" - ) - ) - root_logger.addHandler(dd_handler) - - # Create console handler for initial message - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter( - logging.Formatter( - f"Instance {instance_id} - " + "%(asctime)s - %(levelname)s - %(message)s" - ) - ) - root_logger.addHandler(console_handler) - root_logger.setLevel(logging.DEBUG) - - # Print one INFO line with helpful hint - root_logger.info( - f""" - === Evaluation Started (instance {instance_id}) === - View live output: - • tail -f {log_file} (logger) - • tail -f {output_log_file} (stdout/stderr) - =============================================== - """.strip() - ) + from benchmarks.utils.console_logging import setup_instance_logging - # Now set console to WARNING+ only - console_handler.setLevel(logging.WARNING) - - # Add file handler for detailed logs - os.makedirs(log_dir, exist_ok=True) - file_handler = logging.FileHandler(log_file) - file_handler.setFormatter( - logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") - ) - file_handler.setLevel(logging.INFO) - root_logger.addHandler(file_handler) + setup_instance_logging(log_dir, instance_id) @contextmanager diff --git a/benchmarks/utils/logging.md b/benchmarks/utils/logging.md new file mode 100644 index 00000000..0409d321 --- /dev/null +++ b/benchmarks/utils/logging.md @@ -0,0 +1,101 @@ +# Rich Logging + +## Enabling Rich Logs + +Set the environment variable: + +```bash +export RICH_LOGGING=1 +``` + +When disabled (default), the original logging style is used. + +## Log Types + +### 1. Startup Log + +Displayed when an instance starts processing: + +``` +2025-02-03 10:30:45 [django-12345] START instance_id | Logs: /path/to/logs/instance_xxx.log +``` + +### 2. Trajectory Logs (Tool Calls) + +Shows agent actions in real-time: + +``` +10:30:45 [django-12345] TOOL │ ▶ bash #1 cmd='ls -la' +10:30:46 [django-12345] TOOL │ └─ ok +10:30:47 [django-12345] TOOL │ ▶ str_replace_editor #2 path='/workspace/file.py' +10:30:48 [django-12345] WARN │ └─ exit=1 +``` + +- `#N` = Tool call counter (1st call, 2nd call, ...) +- `└─ ok` = Tool succeeded (exit_code=0) +- `└─ exit=N` = Tool failed with exit code N +- `└─ tool_error` = Tool returned an error + +### 3. Message Logs + +Shows agent text messages: + +``` +10:30:50 [django-12345] MESSAGE│ I see the issue now! Looking at the test data: 1. band_duo ... 
+``` + +### 4. Error Logs + +Shows agent-side errors: + +``` +10:30:49 [django-12345] ERROR │ └─ error +``` + +### 5. Summary Log + +Displayed at the end of each instance evaluation + +**For benchmarks with git patches** (e.g. SWE-bench): + +``` +OK patch=NONEMPTY commit=0 changes=Y msgs(a/u)=8/3 tool_calls=12 errors(agent/conv)=0/0 end=finish_tool preview='diff --git ...' +``` + +**For benchmarks without git patches** (GAIA, OpenAgentSafety): + +``` +OK msgs(a/u)=8/3 tool_calls=12 errors(agent/conv)=0/0 end=finish_tool +``` + +#### Summary Fields + +| Field | Description | +|-------|-------------| +| `OK` / `WITH_ISSUES` | Health status (green/yellow). WITH_ISSUES if errors occurred or status is ERROR | +| `patch=NONEMPTY/EMPTY` | Whether the agent produced a non-empty git patch | +| `commit=N` | Git commit exit code (0 = success) | +| `changes=Y/N` | Whether repo had uncommitted changes after agent run | +| `msgs(a/u)=N/M` | Count of agent/user messages | +| `tool_calls=N` | Total number of tool calls made | +| `errors(agent/conv)=N/M` | Count of agent errors / conversation errors | +| `end=...` | How the run ended: `finish_tool`, `status=ERROR`, `finished_no_finish_tool` | +| `preview='...'` | First ~180 chars of the git diff (grey, truncated) | + +## Color Coding + +- **Green**: Success / healthy values +- **Yellow**: Warnings (non-zero exit codes, errors present, no finish tool) +- **Red**: Errors +- **Grey/Dim**: Metadata, previews, timestamps + +## Disabling Colors + +Set `NO_COLOR=1` to disable ANSI color codes in output. + +## File Logging + +Rich logging only affects console output. File logging behavior is unchanged: +- Full logs are written to `logs/instance_.log` +- Stdout/stderr captured in `logs/instance_.output.log` +