diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py
index dc5e2bc8..a41a044f 100644
--- a/benchmarks/commit0/config.py
+++ b/benchmarks/commit0/config.py
@@ -4,6 +4,15 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 # Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3)
 INFER_DEFAULTS = {
@@ -13,6 +22,7 @@
     "num_workers": 16,
     "max_attempts": 1,
     "max_retries": 3,
+    **CONDENSER_DEFAULTS,
 }
 
 # Build defaults (used by build_images.py)
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 2fafcfb7..ea06cce9 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -31,6 +31,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -303,10 +304,21 @@ def evaluate_instance(
         tools = get_default_tools(enable_browser=False)
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
+            condenser=condenser,
         )
 
         assert isinstance(workspace, RemoteWorkspace)
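The max_size/keep_first pair above decides when condensation fires and which events survive it. As a rough illustration of those semantics only — a toy sketch, not the SDK's actual LLMSummarizingCondenser algorithm; the summary string and the tail-retention rule are both stand-in assumptions:

def toy_condense(events: list[str], max_size: int = 240, keep_first: int = 2) -> list[str]:
    """Toy model: keep the first `keep_first` events, summarize the middle."""
    if len(events) <= max_size:
        return events  # below the threshold, history is left untouched
    head = events[:keep_first]            # always-kept prefix
    tail = events[-(max_size // 2):]      # assumed retention of a recent tail
    dropped = len(events) - len(head) - len(tail)
    # Stand-in for the LLM-generated summary of the dropped events
    return head + [f"<summary of {dropped} dropped events>"] + tail

# With the defaults above, a 300-event history condenses; a 240-event one does not.
assert len(toy_condense([f"e{i}" for i in range(300)])) == 2 + 1 + 120
assert len(toy_condense([f"e{i}" for i in range(240)])) == 240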
diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py
index dadaa20a..ccfeb82c 100644
--- a/benchmarks/gaia/config.py
+++ b/benchmarks/gaia/config.py
@@ -4,12 +4,22 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "gaia-benchmark/GAIA",
     "split": "validation",
     "level": "2023_all",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Build defaults (used by build_images.py)
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index f2b25940..24e7121a 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -41,6 +41,7 @@
     Tool,
     get_logger,
 )
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.event import ActionEvent
 from openhands.sdk.tool.builtins.finish import FinishAction
 from openhands.sdk.workspace import RemoteWorkspace
@@ -310,10 +311,21 @@ def evaluate_instance(
             tools.append(Tool(name=DelegateTool.name))
         tavily_api_key = os.getenv("TAVILY_API_KEY", "")
         assert tavily_api_key, "TAVILY_API_KEY environment variable is not set"
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
+            condenser=condenser,
             mcp_config={
                 "mcpServers": {
                     "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]},
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index d29bd88b..68a4ab89 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -33,6 +33,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -306,15 +307,22 @@ def evaluate_instance(
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
@@ -469,6 +477,12 @@ def main() -> None:
     critic = create_critic(args)
     logger.info(f"Using critic: {type(critic).__name__}")
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = MultiSWEBenchEvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -486,6 +500,9 @@ def main() -> None:
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/swebench/config.py b/benchmarks/swebench/config.py
index ab6d711d..d5072316 100644
--- a/benchmarks/swebench/config.py
+++ b/benchmarks/swebench/config.py
@@ -4,11 +4,21 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "princeton-nlp/SWE-bench_Verified",
     "split": "test",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Evaluation defaults (used by eval_infer.py)
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 7593ec0a..b6305b30 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -33,6 +33,7 @@
 )
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -235,15 +236,22 @@ def evaluate_instance(
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
@@ -371,6 +379,12 @@ def main() -> None:
     critic = create_critic(args)
     logger.info(f"Using critic: {type(critic).__name__}")
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = EvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -387,6 +401,9 @@ def main() -> None:
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/swebenchmultimodal/config.py b/benchmarks/swebenchmultimodal/config.py
index a0bcb772..27249caf 100644
--- a/benchmarks/swebenchmultimodal/config.py
+++ b/benchmarks/swebenchmultimodal/config.py
@@ -4,11 +4,21 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "princeton-nlp/SWE-bench_Multimodal",
     "split": "dev",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Evaluation defaults (used by eval_infer.py)
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 64786a06..d031ad88 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -40,6 +40,7 @@
     Tool,
     get_logger,
 )
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -257,15 +258,22 @@ def evaluate_instance(
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
@@ -461,6 +469,12 @@ def main() -> None:
     critic = create_critic(args)
     logger.info(f"Using critic: {type(critic).__name__}")
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = EvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -477,6 +491,9 @@ def main() -> None:
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/swtbench/config.py b/benchmarks/swtbench/config.py
index ad38f825..605521d8 100644
--- a/benchmarks/swtbench/config.py
+++ b/benchmarks/swtbench/config.py
@@ -4,11 +4,21 @@
 Default values aligned with evaluation repository (OpenHands/evaluation).
 """
 
+# Condenser configuration
+# The condenser manages conversation context by automatically truncating history
+# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
+CONDENSER_DEFAULTS = {
+    "enable_condenser": True,
+    "condenser_max_size": 240,  # Maximum number of events before condensing
+    "condenser_keep_first": 2,  # Number of initial events to always keep
+}
+
 # Inference defaults (used by run_infer.py)
 INFER_DEFAULTS = {
     "dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp",
     "split": "test",
     "num_workers": 30,
+    **CONDENSER_DEFAULTS,
 }
 
 # Evaluation defaults (used by eval_infer.py)
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index e7e33390..54829c86 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -26,6 +26,7 @@
 from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.agent_server.docker.build import _base_slug
 from openhands.sdk import LLM, Agent, Conversation, Tool, __version__, get_logger
+from openhands.sdk.context.condenser import LLMSummarizingCondenser
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
@@ -256,15 +257,22 @@ def evaluate_instance(
         )
         if self.metadata.enable_delegation:
             tools.append(Tool(name=DelegateTool.name))
+
+        # Create condenser if enabled
+        condenser = None
+        if self.metadata.enable_condenser:
+            condenser = LLMSummarizingCondenser(
+                llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
+                max_size=self.metadata.condenser_max_size,
+                keep_first=self.metadata.condenser_keep_first,
+            )
+
         agent = Agent(
             llm=self.metadata.llm,
             tools=tools,
             system_prompt_kwargs={"cli_mode": True},
-            # TODO: we can enable condenser and security analyzer later
-            # and have them configurable via EvalMetadata
-            # condenser=get_default_condenser(
-            #     llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
-            # ),
+            condenser=condenser,
+            # TODO: we can enable security analyzer later
             # security_analyzer=LLMSecurityAnalyzer(),
         )
@@ -390,6 +398,12 @@ def main() -> None:
     critic = create_critic(args)
 
+    # Handle condenser configuration
+    # --disable-condenser takes precedence over --enable-condenser and defaults
+    enable_condenser = args.enable_condenser
+    if args.disable_condenser:
+        enable_condenser = False
+
     metadata = EvalMetadata(
         llm=llm,
         dataset=args.dataset,
@@ -406,6 +420,9 @@ def main() -> None:
         max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
+        enable_condenser=enable_condenser,
+        condenser_max_size=args.condenser_max_size,
+        condenser_keep_first=args.condenser_keep_first,
     )
 
     # Run orchestrator with a simple JSONL writer
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 742e8566..8f84404a 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -90,4 +90,24 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         default=False,
         help="Enable sub-agent delegation tools for the agent",
     )
+    parser.add_argument(
+        "--enable-condenser",
+        action="store_true",
+        help="Enable the context condenser to manage conversation history",
+    )
+    parser.add_argument(
+        "--disable-condenser",
+        action="store_true",
+        help="Disable the context condenser",
+    )
+    parser.add_argument(
+        "--condenser-max-size",
+        type=int,
+        help="Maximum number of events before the condenser activates",
+    )
+    parser.add_argument(
+        "--condenser-keep-first",
+        type=int,
+        help="Number of initial events to always keep when condensing",
+    )
     return parser
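The two store_true flags above form a three-state switch: nothing passed (fall back to the benchmark default), --enable-condenser, or --disable-condenser, with disable winning. A minimal standalone sketch of that resolution, under the assumption — not shown in this patch — that each benchmark installs its INFER_DEFAULTS (which now carry CONDENSER_DEFAULTS) as parser defaults via parser.set_defaults():

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable-condenser", action="store_true")
parser.add_argument("--disable-condenser", action="store_true")
# Assumed wiring: benchmark defaults seed the parser, so a bare run enables
# the condenser even though store_true would otherwise default to False.
parser.set_defaults(enable_condenser=True)

for argv, expected in [
    ([], True),                        # no flags: benchmark default applies
    (["--enable-condenser"], True),    # explicit enable
    (["--disable-condenser"], False),  # explicit disable
    (["--enable-condenser", "--disable-condenser"], False),  # disable wins
]:
    args = parser.parse_args(argv)
    enable_condenser = args.enable_condenser
    if args.disable_condenser:  # same precedence rule as the main() hunks above
        enable_condenser = False
    assert enable_condenser is expected

Seeding the parser defaults this way is what keeps the bare store_true flag from silently overriding the enabled-by-default behavior declared in CONDENSER_DEFAULTS.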
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index 681dab3a..a6d00735 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -72,6 +72,20 @@ class EvalMetadata(BaseModel):
         default=False,
         description="Enable sub-agent delegation tools for the agent",
     )
+    enable_condenser: bool = Field(
+        default=True,
+        description="Enable the context condenser to manage conversation history",
+    )
+    condenser_max_size: int = Field(
+        default=240,
+        ge=1,
+        description="Maximum number of events before the condenser activates",
+    )
+    condenser_keep_first: int = Field(
+        default=2,
+        ge=0,
+        description="Number of initial events to always keep when condensing",
+    )
     lmnr: LaminarEvalMetadata | None = Field(
         default=None,
         description="Laminar evaluation metadata",
diff --git a/tests/test_condenser_config.py b/tests/test_condenser_config.py
new file mode 100644
index 00000000..f14c5a3a
--- /dev/null
+++ b/tests/test_condenser_config.py
@@ -0,0 +1,174 @@
+"""Tests for condenser configuration in benchmarks."""
+
+from benchmarks.commit0.config import (
+    CONDENSER_DEFAULTS as COMMIT0_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as COMMIT0_INFER_DEFAULTS,
+)
+from benchmarks.gaia.config import (
+    CONDENSER_DEFAULTS as GAIA_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as GAIA_INFER_DEFAULTS,
+)
+from benchmarks.swebench.config import CONDENSER_DEFAULTS, INFER_DEFAULTS
+from benchmarks.swebenchmultimodal.config import (
+    CONDENSER_DEFAULTS as SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as SWEBENCHMULTIMODAL_INFER_DEFAULTS,
+)
+from benchmarks.swtbench.config import (
+    CONDENSER_DEFAULTS as SWTBENCH_CONDENSER_DEFAULTS,
+    INFER_DEFAULTS as SWTBENCH_INFER_DEFAULTS,
+)
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.critics import PassCritic
+from benchmarks.utils.models import EvalMetadata
+from openhands.sdk import LLM
+
+
+def test_condenser_defaults_in_swebench_config():
+    """Test that condenser defaults are properly defined in swebench config."""
+    assert "enable_condenser" in CONDENSER_DEFAULTS
+    assert "condenser_max_size" in CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in CONDENSER_DEFAULTS
+    assert CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_swtbench_config():
+    """Test that condenser defaults are properly defined in swtbench config."""
+    assert "enable_condenser" in SWTBENCH_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in SWTBENCH_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in SWTBENCH_CONDENSER_DEFAULTS
+    assert SWTBENCH_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert SWTBENCH_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_swebenchmultimodal_config():
+    """Test that condenser defaults are properly defined in swebenchmultimodal config."""
+    assert "enable_condenser" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert SWEBENCHMULTIMODAL_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_gaia_config():
+    """Test that condenser defaults are properly defined in gaia config."""
+    assert "enable_condenser" in GAIA_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in GAIA_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in GAIA_CONDENSER_DEFAULTS
+    assert GAIA_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert GAIA_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert GAIA_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_commit0_config():
+    """Test that condenser defaults are properly defined in commit0 config."""
+    assert "enable_condenser" in COMMIT0_CONDENSER_DEFAULTS
+    assert "condenser_max_size" in COMMIT0_CONDENSER_DEFAULTS
+    assert "condenser_keep_first" in COMMIT0_CONDENSER_DEFAULTS
+    assert COMMIT0_CONDENSER_DEFAULTS["enable_condenser"] is True
+    assert COMMIT0_CONDENSER_DEFAULTS["condenser_max_size"] == 240
+    assert COMMIT0_CONDENSER_DEFAULTS["condenser_keep_first"] == 2
+
+
+def test_condenser_defaults_in_infer_defaults():
+    """Test that condenser defaults are included in INFER_DEFAULTS."""
+    assert "enable_condenser" in INFER_DEFAULTS
+    assert "condenser_max_size" in INFER_DEFAULTS
+    assert "condenser_keep_first" in INFER_DEFAULTS
+    assert INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in SWTBENCH_INFER_DEFAULTS
+    assert "condenser_max_size" in SWTBENCH_INFER_DEFAULTS
+    assert "condenser_keep_first" in SWTBENCH_INFER_DEFAULTS
+    assert SWTBENCH_INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert "condenser_max_size" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert "condenser_keep_first" in SWEBENCHMULTIMODAL_INFER_DEFAULTS
+    assert SWEBENCHMULTIMODAL_INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in GAIA_INFER_DEFAULTS
+    assert "condenser_max_size" in GAIA_INFER_DEFAULTS
+    assert "condenser_keep_first" in GAIA_INFER_DEFAULTS
+    assert GAIA_INFER_DEFAULTS["enable_condenser"] is True
+
+    assert "enable_condenser" in COMMIT0_INFER_DEFAULTS
+    assert "condenser_max_size" in COMMIT0_INFER_DEFAULTS
+    assert "condenser_keep_first" in COMMIT0_INFER_DEFAULTS
+    assert COMMIT0_INFER_DEFAULTS["enable_condenser"] is True
+
+
+def test_eval_metadata_accepts_condenser_params():
+    """Test that EvalMetadata accepts condenser parameters."""
+    llm = LLM(model="test-model", api_key="test-key")
+    metadata = EvalMetadata(
+        llm=llm,
+        dataset="test-dataset",
+        max_iterations=10,
+        eval_output_dir="/tmp/test",
+        critic=PassCritic(),
+        enable_condenser=True,
+        condenser_max_size=100,
+        condenser_keep_first=5,
+    )
+    assert metadata.enable_condenser is True
+    assert metadata.condenser_max_size == 100
+    assert metadata.condenser_keep_first == 5
+
+
+def test_eval_metadata_condenser_defaults():
+    """Test that EvalMetadata uses correct defaults for condenser params."""
+    llm = LLM(model="test-model", api_key="test-key")
+    metadata = EvalMetadata(
+        llm=llm,
+        dataset="test-dataset",
+        max_iterations=10,
+        eval_output_dir="/tmp/test",
+        critic=PassCritic(),
+    )
+    # Should use default values defined in EvalMetadata
+    assert metadata.enable_condenser is True
+    assert metadata.condenser_max_size == 240
+    assert metadata.condenser_keep_first == 2
+
+
+def test_args_parser_has_condenser_args():
+    """Test that argument parser includes condenser arguments."""
+    parser = get_parser(add_llm_config=False)
+    # Parse empty args to get defaults
+    args = parser.parse_args([])
+    assert hasattr(args, "enable_condenser")
+    assert hasattr(args, "disable_condenser")
+    assert hasattr(args, "condenser_max_size")
+    assert hasattr(args, "condenser_keep_first")
+
+
+def test_condenser_enable_disable_flags():
+    """Test that enable/disable condenser flags work correctly."""
+    parser = get_parser(add_llm_config=False)
+
+    # Test enable flag
+    args = parser.parse_args(["--enable-condenser"])
+    assert args.enable_condenser is True
+
+    # Test disable flag
+    args = parser.parse_args(["--disable-condenser"])
+    assert args.disable_condenser is True
+
+    # Test both flags (disable should take precedence in implementation)
+    args = parser.parse_args(["--enable-condenser", "--disable-condenser"])
+    assert args.enable_condenser is True
+    assert args.disable_condenser is True
+
+
+def test_condenser_size_args():
+    """Test that condenser size arguments can be set."""
+    parser = get_parser(add_llm_config=False)
+    args = parser.parse_args(
+        ["--condenser-max-size", "120", "--condenser-keep-first", "10"]
+    )
+    assert args.condenser_max_size == 120
+    assert args.condenser_keep_first == 10
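One gap the suite above leaves open is the bounds declared on EvalMetadata (ge=1 on condenser_max_size, ge=0 on condenser_keep_first). A possible companion test, sketched on the assumption that pydantic's standard ValidationError surfaces when a bound is violated:

import pytest
from pydantic import ValidationError

from benchmarks.utils.critics import PassCritic
from benchmarks.utils.models import EvalMetadata
from openhands.sdk import LLM


def test_condenser_bounds_are_enforced():
    """condenser_max_size is declared ge=1 and condenser_keep_first ge=0."""
    llm = LLM(model="test-model", api_key="test-key")
    common = dict(
        llm=llm,
        dataset="test-dataset",
        max_iterations=10,
        eval_output_dir="/tmp/test",
        critic=PassCritic(),
    )
    with pytest.raises(ValidationError):
        EvalMetadata(**common, condenser_max_size=0)  # violates ge=1
    with pytest.raises(ValidationError):
        EvalMetadata(**common, condenser_keep_first=-1)  # violates ge=0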