From 4483e3d736e71c310691220667e1380b5ba11272 Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 18 Feb 2026 23:15:39 +0000
Subject: [PATCH 1/2] Enable configurable context condensation in all
 benchmarks

This change enables context condensation in all benchmarks and makes it
configurable via config.py files and command-line arguments. The default
condenser from software-agent-sdk is now used, with max_size=80 and
keep_first=4.

Changes:
- Add condenser configuration fields to EvalMetadata
- Add CONDENSER_DEFAULTS to config.py files in swebench, swtbench, and
  swebenchmultimodal
- Add command-line arguments for controlling the condenser
  (--enable-condenser, --disable-condenser, --condenser-max-size,
  --condenser-keep-first)
- Update agent creation in all benchmarks to use LLMSummarizingCondenser
  when enabled
- Add comprehensive tests for condenser configuration

Fixes #407

Co-authored-by: openhands
---
 benchmarks/multiswebench/run_infer.py      |  27 +++-
 benchmarks/swebench/config.py              |  10 ++
 benchmarks/swebench/run_infer.py           |  27 +++-
 benchmarks/swebenchmultimodal/config.py    |  10 ++
 benchmarks/swebenchmultimodal/run_infer.py |  27 +++-
 benchmarks/swtbench/config.py              |  10 ++
 benchmarks/swtbench/run_infer.py           |  27 +++-
 benchmarks/utils/args_parser.py            |  20 +++
 benchmarks/utils/models.py                 |  14 +++
 tests/test_condenser_config.py             | 136 +++++++++++++++++++++
 10 files changed, 288 insertions(+), 20 deletions(-)
 create mode 100644 tests/test_condenser_config.py

diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index d29bd88b..68a4ab89 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -33,6 +33,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -306,15 +307,22 @@
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
 
@@ -469,6 +477,12 @@ def main() -> None:
     critic = create_critic(args)
     logger.info(f"Using critic: {type(critic).__name__}")
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = MultiSWEBenchEvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -486,6 +500,9 @@
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py
index ab6d711d..0d6c6fa2 100644
--- a/benchmarks/swebench/config.py
+++ b/benchmarks/swebench/config.py
@@ -4,11 +4,21 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 80,  # Maximum number of events before condensing
+    "condenser_keep_first": 4,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "princeton-nlp/SWE-bench_Verified",
     "split": "test",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Evaluation defaults (used by eval_infer.py)
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 7593ec0a..b6305b30 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -33,6 +33,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -235,15 +236,22 @@
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
 
@@ -371,6 +379,12 @@ def main() -> None:
     critic = create_critic(args)
     logger.info(f"Using critic: {type(critic).__name__}")
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = EvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -387,6 +401,9 @@
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py
index a0bcb772..1ec3239b 100644
--- a/benchmarks/swebenchmultimodal/config.py
+++ b/benchmarks/swebenchmultimodal/config.py
@@ -4,11 +4,21 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
""" +# Condenser configuration +# The condenser manages conversation context by automatically truncating history +# when it exceeds max_size and replacing dropped events with an LLM-generated summary. +CONDENSER_DEFAULTS = { + "enable_condenser": True, + "condenser_max_size": 80, # Maximum number of events before condensing + "condenser_keep_first": 4, # Number of initial events to always keep +} + # Inference defaults (used by run_infer.py) INFER_DEFAULTS = { "dataset": "princeton-nlp/SWE-bench_Multimodal", "split": "dev", "num_workers": 30, + **CONDENSER_DEFAULTS, } # Evaluation defaults (used by eval_infer.py) diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 64786a06..d031ad88 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -40,6 +40,7 @@ Tool, get_logger, ) +from openhands.sdk.context.condenser import LLMSummarizingCondenser from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools @@ -257,15 +258,22 @@ def evaluate_instance( ) if self.metadata.enable_delegation: tools.append(Tool(name=DelegateTool.name)) + + # Create condenser if enabled + condenser = None + if self.metadata.enable_condenser: + condenser = LLMSummarizingCondenser( + llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}), + max_size=self.metadata.condenser_max_size, + keep_first=self.metadata.condenser_keep_first, + ) + agent = Agent( llm=self.metadata.llm, tools=tools, system_prompt_kwargs={"cli_mode": True}, - # TODO: we can enable condenser and security analyzer later - # and have them configurable via EvalMetadata - # condenser=get_default_condenser( - # llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}) - # ), + condenser=condenser, + # TODO: we can enable security analyzer later # security_analyzer=LLMSecurityAnalyzer(), ) @@ -461,6 +469,12 @@ def main() -> None: critic = create_critic(args) logger.info(f"Using critic: {type(critic).__name__}") + # Handle condenser configuration + # --disable-condenser takes precedence over --enable-condenser and defaults + enable_condenser = args.enable_condenser + if args.disable_condenser: + enable_condenser = False + metadata = EvalMetadata( llm=llm, dataset=args.dataset, @@ -477,6 +491,9 @@ def main() -> None: max_retries=args.max_retries, workspace_type=args.workspace, enable_delegation=args.enable_delegation, + enable_condenser=enable_condenser, + condenser_max_size=args.condenser_max_size, + condenser_keep_first=args.condenser_keep_first, ) # Run orchestrator with a simple JSONL writer diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py index ad38f825..490065f3 100644 --- a/benchmarks/swtbench/config.py +++ b/benchmarks/swtbench/config.py @@ -4,11 +4,21 @@ Default values aligned with evaluation repository (OpenHands/evaluation). """ +# Condenser configuration +# The condenser manages conversation context by automatically truncating history +# when it exceeds max_size and replacing dropped events with an LLM-generated summary. 
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 80,  # Maximum number of events before condensing
+    "condenser_keep_first": 4,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp",
     "split": "test",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Evaluation defaults (used by eval_infer.py)
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index e7e33390..54829c86 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -26,6 +26,7 @@
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.agent_server.docker.build import _base_slug
 from openhands.sdk import LLM, Agent, Conversation, Tool, __version__, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -256,15 +257,22 @@
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
 
@@ -390,6 +398,12 @@ def main() -> None:
 
     critic = create_critic(args)
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = EvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -406,6 +420,9 @@
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 742e8566..8f84404a 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -90,4 +90,24 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         default=False,
         help="Enable sub-agent delegation tools for the agent",
     )
+    parser.add_argument(
+        "--enable-condenser",
+        action="store_true",
+        help="Enable the context condenser to manage conversation history",
+    )
+    parser.add_argument(
+        "--disable-condenser",
+        action="store_true",
+        help="Disable the context condenser",
+    )
+    parser.add_argument(
+        "--condenser-max-size",
+        type=int,
+        help="Maximum number of events before the condenser activates",
+    )
+    parser.add_argument(
+        "--condenser-keep-first",
+        type=int,
+        help="Number of initial events to always keep when condensing",
+    )
     return parser
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index 681dab3a..c06350dd 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -72,6 +72,20 @@ class EvalMetadata(BaseModel):
         default=False,
         description="Enable sub-agent delegation tools for the agent",
     )
+    enable_condenser: bool = Field(
+        default=True,
+        description="Enable the context condenser to manage conversation history",
+    )
+    condenser_max_size: int = Field(
+        default=80,
+        ge=1,
+        description="Maximum number of events before the condenser activates",
+    )
+    condenser_keep_first: int = Field(
+        default=4,
+        ge=0,
+        description="Number of initial events to always keep when condensing",
+    )
     lmnr: LaminarEvalMetadata | None = Field(
         default=None,
         description="Laminar evaluation metadata",
diff --git a/tests/test_condenser_config.py b/tests/test_condenser_config.py
new file mode 100644
index 00000000..b5ec18f6
--- /dev/null
+++ b/tests/test_condenser_config.py
@@ -0,0 +1,136 @@
+"""Tests for condenser configuration in benchmarks."""
+
+from benchmarks.swebench.config import CONDENSER_DEFAULTS, INFER_DEFAULTS
+from benchmarks.swebenchmultimodal.config import (
+    CONDENSER_DEFAULTS as SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as SWEBENCHMULTIMODAL_INFER_DEFAULTS,
+)
+from benchmarks.swtbench.config import (
+    CONDENSER_DEFAULTS as SWTBENCH_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as SWTBENCH_INFER_DEFAULTS,
+)
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.critics import PassCritic
+from benchmarks.utils.models import EvalMetadata
+from openhands.sdk import LLM
+
+
+def test_condenser_defaults_in_swebench_config():
+    """Test that condenser defaults are properly defined in swebench config."""
+    assert "enable_condenser" in CONDENSER_DEFAULTS
+    assert "condenser_max_size" in CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in CONDENSER_DEFAULTS
+    assert CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert CONDENSER_DEFAULTS["condenser_max_size"] == 80
+    assert CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+
+
+def test_condenser_defaults_in_swtbench_config():
+    """Test that condenser defaults are properly defined in swtbench config."""
+    assert "enable_condenser" in SWTBENCH_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in SWTBENCH_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in SWTBENCH_CONDENSER_DEFAULTS
+    assert SWTBENCH_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_max_size"] == 80
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+
+
+def test_condenser_defaults_in_swebenchmultimodal_config():
+    """Test that condenser defaults are properly defined in swebenchmultimodal config."""
+    assert "enable_condenser" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_max_size"] == 80
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+
+
+def test_condenser_defaults_in_infer_defaults():
+    """Test that condenser defaults are included in INFER_DEFAULTS."""
+    assert "enable_condenser" in INFER_DEFAULTS
+    assert "condenser_max_size" in INFER_DEFAULTS
+    assert "condenser_keep_first" in INFER_DEFAULTS
+    assert INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in SWTBENCH_INFER_DEFAULTS
+    assert "condenser_max_size" in SWTBENCH_INFER_DEFAULTS
+    assert "condenser_keep_first" in SWTBENCH_INFER_DEFAULTS
+    assert SWTBENCH_INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert "condenser_max_size" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert "condenser_keep_first" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert SWEBENCHMULTIMODAL_INFER_DEFAULTS["enable_condenser"] is True
+
+
+def test_eval_metadata_accepts_condenser_params():
+    """Test that EvalMetadata accepts condenser parameters."""
+    llm = LLM(model="test-model", api_key="test-key")
+    metadata = EvalMetadata(
+        llm=llm,
+        dataset="test-dataset",
+        max_iterations=10,
+        eval_output_dir="/tmp/test",
+        critic=PassCritic(),
+        enable_condenser=True,
+        condenser_max_size=100,
+        condenser_keep_first=5,
+    )
+    assert metadata.enable_condenser is True
+    assert metadata.condenser_max_size == 100
+    assert metadata.condenser_keep_first == 5
+
+
+def test_eval_metadata_condenser_defaults():
+    """Test that EvalMetadata uses correct defaults for condenser params."""
+    llm = LLM(model="test-model", api_key="test-key")
+    metadata = EvalMetadata(
+        llm=llm,
+        dataset="test-dataset",
+        max_iterations=10,
+        eval_output_dir="/tmp/test",
+        critic=PassCritic(),
+    )
+    # Should use default values defined in EvalMetadata
+    assert metadata.enable_condenser is True
+    assert metadata.condenser_max_size == 80
+    assert metadata.condenser_keep_first == 4
+
+
+def test_args_parser_has_condenser_args():
+    """Test that argument parser includes condenser arguments."""
+    parser = get_parser(add_llm_config=False)
+    # Parse empty args to get defaults
+    args = parser.parse_args([])
+    assert hasattr(args, "enable_condenser")
+    assert hasattr(args, "disable_condenser")
+    assert hasattr(args, "condenser_max_size")
+    assert hasattr(args, "condenser_keep_first")
+
+
+def test_condenser_enable_disable_flags():
+    """Test that enable/disable condenser flags work correctly."""
+    parser = get_parser(add_llm_config=False)
+
+    # Test enable flag
+    args = parser.parse_args(["--enable-condenser"])
+    assert args.enable_condenser is True
+
+    # Test disable flag
+    args = parser.parse_args(["--disable-condenser"])
+    assert args.disable_condenser is True
+
+    # Test both flags (disable should take precedence in implementation)
+    args = parser.parse_args(["--enable-condenser", "--disable-condenser"])
+    assert args.enable_condenser is True
+    assert args.disable_condenser is True
+
+
+def test_condenser_size_args():
+    """Test that condenser size arguments can be set."""
+    parser = get_parser(add_llm_config=False)
+    args = parser.parse_args(
+        ["--condenser-max-size", "120", "--condenser-keep-first", "10"]
+    )
+    assert args.condenser_max_size == 120
+    assert args.condenser_keep_first == 10

From b3ac04added4c5d9c72bfe50d7b8f31b9dab8530 Mon Sep 17 00:00:00 2001
From: openhands
Date: Thu, 19 Feb 2026 14:35:17 +0000
Subject: [PATCH 2/2] Update condenser configuration: max_size=240,
 keep_first=2, add GAIA and Commit0 support

- Update condenser_max_size from 80 to 240 and condenser_keep_first from 4
  to 2 across all benchmarks
- Update EvalMetadata defaults in models.py to use new values (240/2)
- Add condenser configuration to GAIA and Commit0 benchmarks:
  - Add CONDENSER_DEFAULTS to config.py files
  - Add LLMSummarizingCondenser import and agent creation logic to
    run_infer.py files
- Update tests to reflect new condenser defaults and add coverage for GAIA
  and Commit0
- All tests pass and pre-commit checks pass

Co-authored-by: openhands
---
 benchmarks/commit0/config.py            | 10 +++++
 benchmarks/commit0/run_infer.py         | 12 ++++++
 benchmarks/gaia/config.py               | 10 +++++
 benchmarks/gaia/run_infer.py            | 12 ++++++
 benchmarks/swebench/config.py           |  4 +-
 benchmarks/swebenchmultimodal/config.py |  4 +-
 benchmarks/swtbench/config.py           |  4 +-
 benchmarks/utils/models.py              |  4 +-
 tests/test_condenser_config.py          | 54 +++++++++++++++++++++----
 9 files changed, 98 insertions(+), 16 deletions(-)

diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py
index dc5e2bc8..a41a044f 100644
--- a/benchmarks/commit0/config.py
+++ b/benchmarks/commit0/config.py
@@ -4,6 +4,15 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 # Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3)
 INFER_DEFAULTS = {
@@ -13,6 +22,7 @@
     "num_workers": 16,
     "max_attempts": 1,
     "max_retries": 3,
+    **CONDENSER_DEFAULTS,
 }
 
 # Build defaults (used by build_images.py)
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 2fafcfb7..ea06cce9 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -31,6 +31,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -303,10 +304,21 @@
         tools = get_default_tools(enable_browser=False)
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
+            condenser=condenser,
         )
 
         assert isinstance(workspace, RemoteWorkspace)
diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py
index dadaa20a..ccfeb82c 100644
--- a/benchmarks/gaia/config.py
+++ b/benchmarks/gaia/config.py
@@ -4,12 +4,22 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "gaia-benchmark/GAIA",
     "split": "validation",
     "level": "2023_all",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Build defaults (used by build_images.py)
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index f2b25940..24e7121a 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -41,6 +41,7 @@
     Tool,
     get_logger,
 )
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.event import ActionEvent
 from openhands.sdk.tool.builtins.finish import FinishAction
 from openhands.sdk.workspace import RemoteWorkspace
@@ -310,10 +311,21 @@
             tools.append(Tool(name=DelegateTool.name))
         tavily_api_key = os.getenv("TAVILY_API_KEY", "")
         assert tavily_api_key, "TAVILY_API_KEY environment variable is not set"
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
+            condenser=condenser,
             mcp_config={
                 "mcpServers": {
                     "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]},
diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py
index 0d6c6fa2..d5072316 100644
--- a/benchmarks/swebench/config.py
+++ b/benchmarks/swebench/config.py
@@ -9,8 +9,8 @@
 # when it exceeds max_size and replacing dropped events with an LLM-generated summary.
 CONDENSER_DEFAULTS = {
     "enable_condenser": True,
-    "condenser_max_size": 80,  # Maximum number of events before condensing
-    "condenser_keep_first": 4,  # Number of initial events to always keep
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
 }
 
 # Inference defaults (used by run_infer.py)
diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py
index 1ec3239b..27249caf 100644
--- a/benchmarks/swebenchmultimodal/config.py
+++ b/benchmarks/swebenchmultimodal/config.py
@@ -9,8 +9,8 @@
 # when it exceeds max_size and replacing dropped events with an LLM-generated summary.
 CONDENSER_DEFAULTS = {
     "enable_condenser": True,
-    "condenser_max_size": 80,  # Maximum number of events before condensing
-    "condenser_keep_first": 4,  # Number of initial events to always keep
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
 }
 
 # Inference defaults (used by run_infer.py)
diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py
index 490065f3..605521d8 100644
--- a/benchmarks/swtbench/config.py
+++ b/benchmarks/swtbench/config.py
@@ -9,8 +9,8 @@
 # when it exceeds max_size and replacing dropped events with an LLM-generated summary.
 CONDENSER_DEFAULTS = {
     "enable_condenser": True,
-    "condenser_max_size": 80,  # Maximum number of events before condensing
-    "condenser_keep_first": 4,  # Number of initial events to always keep
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
 }
 
 # Inference defaults (used by run_infer.py)
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index c06350dd..a6d00735 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -77,12 +77,12 @@ class EvalMetadata(BaseModel):
         description="Enable the context condenser to manage conversation history",
     )
     condenser_max_size: int = Field(
-        default=80,
+        default=240,
         ge=1,
         description="Maximum number of events before the condenser activates",
     )
     condenser_keep_first: int = Field(
-        default=4,
+        default=2,
         ge=0,
         description="Number of initial events to always keep when condensing",
     )
diff --git a/tests/test_condenser_config.py b/tests/test_condenser_config.py
index b5ec18f6..f14c5a3a 100644
--- a/tests/test_condenser_config.py
+++ b/tests/test_condenser_config.py
@@ -1,5 +1,13 @@
 """Tests for condenser configuration in benchmarks."""
 
+from benchmarks.commit0.config import (
+    CONDENSER_DEFAULTS as COMMIT0_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as COMMIT0_INFER_DEFAULTS,
+)
+from benchmarks.gaia.config import (
+    CONDENSER_DEFAULTS as GAIA_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as GAIA_INFER_DEFAULTS,
+)
 from benchmarks.swebench.config import CONDENSER_DEFAULTS, INFER_DEFAULTS
 from benchmarks.swebenchmultimodal.config import (
     CONDENSER_DEFAULTS as SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS,
@@ -21,8 +29,8 @@ def test_condenser_defaults_in_swebench_config():
     assert "condenser_max_size" in CONDENSER_DEFAULTS
     assert "condenser_keep_first" in CONDENSER_DEFAULTS
     assert CONDENSER_DEFAULTS["enable_condenser"] is True
-    assert CONDENSER_DEFAULTS["condenser_max_size"] == 80
-    assert CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+    assert CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert CONDENSER_DEFAULTS["condenser_keep_first"] == 2
 
 
 def test_condenser_defaults_in_swtbench_config():
@@ -31,8 +39,8 @@ def test_condenser_defaults_in_swtbench_config():
     assert "condenser_max_size" in SWTBENCH_CONDENSER_DEFAULTS
     assert "condenser_keep_first" in SWTBENCH_CONDENSER_DEFAULTS
     assert SWTBENCH_CONDENSER_DEFAULTS["enable_condenser"] is True
-    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_max_size"] == 80
-    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
 
 
 def test_condenser_defaults_in_swebenchmultimodal_config():
@@ -41,8 +49,28 @@ def test_condenser_defaults_in_swebenchmultimodal_config():
     assert "condenser_max_size" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
     assert "condenser_keep_first" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
     assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["enable_condenser"] is True
-    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_max_size"] == 80
-    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_keep_first"] == 4
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_gaia_config():
+    """Test that condenser defaults are properly defined in gaia config."""
+    assert "enable_condenser" in GAIA_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in GAIA_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in GAIA_CONDENSER_DEFAULTS
+    assert GAIA_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert GAIA_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert GAIA_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_commit0_config():
+    """Test that condenser defaults are properly defined in commit0 config."""
+    assert "enable_condenser" in COMMIT0_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in COMMIT0_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in COMMIT0_CONDENSER_DEFAULTS
+    assert COMMIT0_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert COMMIT0_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert COMMIT0_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
 
 
 def test_condenser_defaults_in_infer_defaults():
@@ -62,6 +90,16 @@ def test_condenser_defaults_in_infer_defaults():
     assert "condenser_keep_first" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
     assert SWEBENCHMULTIMODAL_INFER_DEFAULTS["enable_condenser"] is True
 
+    assert "enable_condenser" in GAIA_INFER_DEFAULTS
+    assert "condenser_max_size" in GAIA_INFER_DEFAULTS
+    assert "condenser_keep_first" in GAIA_INFER_DEFAULTS
+    assert GAIA_INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in COMMIT0_INFER_DEFAULTS
+    assert "condenser_max_size" in COMMIT0_INFER_DEFAULTS
+    assert "condenser_keep_first" in COMMIT0_INFER_DEFAULTS
+    assert COMMIT0_INFER_DEFAULTS["enable_condenser"] is True
+
 
 def test_eval_metadata_accepts_condenser_params():
     """Test that EvalMetadata accepts condenser parameters."""
@@ -93,8 +131,8 @@ def test_eval_metadata_condenser_defaults():
     )
     # Should use default values defined in EvalMetadata
     assert metadata.enable_condenser is True
-    assert metadata.condenser_max_size == 80
-    assert metadata.condenser_keep_first == 4
+    assert metadata.condenser_max_size == 240
+    assert metadata.condenser_keep_first == 2
 
 
 def test_args_parser_has_condenser_args():
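
For reference, the agent wiring that both patches install in each run_infer.py can be exercised standalone. Below is a minimal sketch using only symbols that appear in the diffs above; the model name, API key, and empty tool list are placeholders, and the values shown (240/2) are the post-PATCH-2/2 defaults.

from openhands.sdk import LLM, Agent
from openhands.sdk.context.condenser import LLMSummarizingCondenser

# Placeholder LLM config (mirrors the fixtures in tests/test_condenser_config.py).
llm = LLM(model="test-model", api_key="test-key")

# Same construction as the patched evaluate_instance() methods: the condenser
# runs on a copy of the agent LLM under a distinct service_id so its calls are
# tracked separately; it condenses once history exceeds max_size events and
# always keeps the first keep_first events verbatim.
condenser = LLMSummarizingCondenser(
    llm=llm.model_copy(update={"service_id": "condenser"}),
    max_size=240,
    keep_first=2,
)

agent = Agent(
    llm=llm,
    tools=[],  # placeholder; the benchmarks pass get_default_tools(...)
    system_prompt_kwargs={"cli_mode": True},
    condenser=condenser,
)

On the command line this corresponds to the new flags: --disable-condenser forces condenser=None (it takes precedence over --enable-condenser), while --condenser-max-size and --condenser-keep-first override the CONDENSER_DEFAULTS from the relevant config.py.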