10 changes: 10 additions & 0 deletions benchmarks/commit0/config.py
@@ -4,6 +4,15 @@
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Condenser configuration
# The condenser manages conversation context by automatically truncating history
# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
CONDENSER_DEFAULTS = {
"enable_condenser": True,
"condenser_max_size": 240, # Maximum number of events before condensing
"condenser_keep_first": 2, # Number of initial events to always keep
}

# Inference defaults (used by run_infer.py)
# Note: commit0 uses max_attempts=1 (different from the default of 3)
INFER_DEFAULTS = {
@@ -13,6 +22,7 @@
"num_workers": 16,
"max_attempts": 1,
"max_retries": 3,
**CONDENSER_DEFAULTS,
}

# Build defaults (used by build_images.py)
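The defaults above describe when and how history gets condensed. A rough, self-contained sketch of that policy, using hypothetical names (the real implementation is the SDK's LLMSummarizingCondenser, wired up in run_infer.py below):

def condense(events: list, max_size: int = 240, keep_first: int = 2) -> list:
    """Once history grows past max_size, keep the first keep_first events,
    replace a block of older events with a single LLM-generated summary event,
    and keep the most recent events verbatim."""
    if len(events) <= max_size:
        return events  # under the limit: history is left untouched
    head = events[:keep_first]
    # How many older events to summarize away is an assumption here; the real
    # condenser decides this split internally.
    num_dropped = (len(events) - keep_first) // 2
    dropped = events[keep_first:keep_first + num_dropped]
    tail = events[keep_first + num_dropped:]
    summary = {"kind": "condensation", "text": f"<LLM summary of {len(dropped)} events>"}
    return [*head, summary, *tail]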
12 changes: 12 additions & 0 deletions benchmarks/commit0/run_infer.py
@@ -31,6 +31,7 @@
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
@@ -303,10 +304,21 @@ def evaluate_instance(
tools = get_default_tools(enable_browser=False)
if self.metadata.enable_delegation:
tools.append(Tool(name=DelegateTool.name))

# Create condenser if enabled
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)

agent = Agent(
llm=self.metadata.llm,
tools=tools,
system_prompt_kwargs={"cli_mode": True},
condenser=condenser,
)

assert isinstance(workspace, RemoteWorkspace)
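Stripped of the benchmark harness, the wiring this file now does looks roughly like the sketch below. The LLM constructor arguments are placeholders; the condenser and Agent parameters mirror the hunk above, and the separate service_id on the copied LLM presumably lets the condenser's usage be tracked apart from the agent's.

from openhands.sdk import LLM, Agent
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.tools.preset.default import get_default_tools

llm = LLM(model="some-provider/some-model", service_id="agent")  # placeholder arguments
condenser = LLMSummarizingCondenser(
    llm=llm.model_copy(update={"service_id": "condenser"}),  # separate service_id for the condenser's calls
    max_size=240,   # condense once the event history exceeds 240 events
    keep_first=2,   # always keep the first two events verbatim
)
agent = Agent(
    llm=llm,
    tools=get_default_tools(enable_browser=False),
    system_prompt_kwargs={"cli_mode": True},
    condenser=condenser,
)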
10 changes: 10 additions & 0 deletions benchmarks/gaia/config.py
@@ -4,12 +4,22 @@
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Condenser configuration
# The condenser manages conversation context by automatically truncating history
# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
CONDENSER_DEFAULTS = {
"enable_condenser": True,
"condenser_max_size": 240, # Maximum number of events before condensing
"condenser_keep_first": 2, # Number of initial events to always keep
}

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "gaia-benchmark/GAIA",
"split": "validation",
"level": "2023_all",
"num_workers": 30,
**CONDENSER_DEFAULTS,
}

# Build defaults (used by build_images.py)
12 changes: 12 additions & 0 deletions benchmarks/gaia/run_infer.py
@@ -41,6 +41,7 @@
Tool,
get_logger,
)
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.sdk.event import ActionEvent
from openhands.sdk.tool.builtins.finish import FinishAction
from openhands.sdk.workspace import RemoteWorkspace
@@ -310,10 +311,21 @@ def evaluate_instance(
tools.append(Tool(name=DelegateTool.name))
tavily_api_key = os.getenv("TAVILY_API_KEY", "")
assert tavily_api_key, "TAVILY_API_KEY environment variable is not set"

# Create condenser if enabled
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)

agent = Agent(
llm=self.metadata.llm,
tools=tools,
system_prompt_kwargs={"cli_mode": True},
condenser=condenser,
mcp_config={
"mcpServers": {
"fetch": {"command": "uvx", "args": ["mcp-server-fetch"]},
27 changes: 22 additions & 5 deletions benchmarks/multiswebench/run_infer.py
@@ -33,6 +33,7 @@
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
@@ -306,15 +307,22 @@ def evaluate_instance(
)
if self.metadata.enable_delegation:
tools.append(Tool(name=DelegateTool.name))

# Create condenser if enabled
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)

agent = Agent(
llm=self.metadata.llm,
tools=tools,
system_prompt_kwargs={"cli_mode": True},
# TODO: we can enable condenser and security analyzer later
# and have them configurable via EvalMetadata
# condenser=get_default_condenser(
# llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
# ),
condenser=condenser,
# TODO: we can enable security analyzer later
# security_analyzer=LLMSecurityAnalyzer(),
)

@@ -469,6 +477,12 @@ def main() -> None:
critic = create_critic(args)
logger.info(f"Using critic: {type(critic).__name__}")

# Handle condenser configuration
# --disable-condenser takes precedence over --enable-condenser and defaults
enable_condenser = args.enable_condenser
if args.disable_condenser:
enable_condenser = False

metadata = MultiSWEBenchEvalMetadata(
llm=llm,
dataset=args.dataset,
@@ -486,6 +500,9 @@
max_retries=args.max_retries,
workspace_type=args.workspace,
enable_delegation=args.enable_delegation,
enable_condenser=enable_condenser,
condenser_max_size=args.condenser_max_size,
condenser_keep_first=args.condenser_keep_first,
)

# Run orchestrator with a simple JSONL writer
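The precedence handling above reads flags that are not declared in this diff; they presumably come from a shared argument helper, with defaults drawn from each benchmark's CONDENSER_DEFAULTS. A hypothetical sketch of what that registration might look like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable-condenser", action="store_true", default=True,
                    help="Enable the LLM-summarizing condenser (on by default)")
parser.add_argument("--disable-condenser", action="store_true",
                    help="Turn the condenser off; takes precedence over --enable-condenser")
parser.add_argument("--condenser-max-size", type=int, default=240,
                    help="Number of events allowed before history is condensed")
parser.add_argument("--condenser-keep-first", type=int, default=2,
                    help="Number of initial events that are never condensed")
args = parser.parse_args([])  # empty argv so the sketch runs as-is

# Same rule as main() above: --disable-condenser wins over everything else.
enable_condenser = args.enable_condenser and not args.disable_condenser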
10 changes: 10 additions & 0 deletions benchmarks/swebench/config.py
@@ -4,11 +4,21 @@
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Condenser configuration
# The condenser manages conversation context by automatically truncating history
# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
CONDENSER_DEFAULTS = {
"enable_condenser": True,
"condenser_max_size": 240, # Maximum number of events before condensing
"condenser_keep_first": 2, # Number of initial events to always keep
}

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Verified",
"split": "test",
"num_workers": 30,
**CONDENSER_DEFAULTS,
}

# Evaluation defaults (used by eval_infer.py)
27 changes: 22 additions & 5 deletions benchmarks/swebench/run_infer.py
@@ -33,6 +33,7 @@
)
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import LLM, Agent, Conversation, Tool, get_logger
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
@@ -235,15 +236,22 @@ def evaluate_instance(
)
if self.metadata.enable_delegation:
tools.append(Tool(name=DelegateTool.name))

# Create condenser if enabled
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)

agent = Agent(
llm=self.metadata.llm,
tools=tools,
system_prompt_kwargs={"cli_mode": True},
# TODO: we can enable condenser and security analyzer later
# and have them configurable via EvalMetadata
# condenser=get_default_condenser(
# llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
# ),
condenser=condenser,
# TODO: we can enable security analyzer later
# security_analyzer=LLMSecurityAnalyzer(),
)

@@ -371,6 +379,12 @@ def main() -> None:
critic = create_critic(args)
logger.info(f"Using critic: {type(critic).__name__}")

# Handle condenser configuration
# --disable-condenser takes precedence over --enable-condenser and defaults
enable_condenser = args.enable_condenser
if args.disable_condenser:
enable_condenser = False

metadata = EvalMetadata(
llm=llm,
dataset=args.dataset,
@@ -387,6 +401,9 @@
max_retries=args.max_retries,
workspace_type=args.workspace,
enable_delegation=args.enable_delegation,
enable_condenser=enable_condenser,
condenser_max_size=args.condenser_max_size,
condenser_keep_first=args.condenser_keep_first,
)

# Run orchestrator with a simple JSONL writer
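The EvalMetadata construction above passes three new keyword arguments; the corresponding fields are presumably added to the shared metadata model elsewhere in the PR (not visible in this diff). A simplified, hypothetical sketch of those fields, assuming a Pydantic model:

from pydantic import BaseModel

class EvalMetadata(BaseModel):
    # Heavily simplified: the real model also carries dataset, llm, workers, etc.
    enable_condenser: bool = True
    condenser_max_size: int = 240
    condenser_keep_first: int = 2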
10 changes: 10 additions & 0 deletions benchmarks/swebenchmultimodal/config.py
@@ -4,11 +4,21 @@
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Condenser configuration
# The condenser manages conversation context by automatically truncating history
# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
CONDENSER_DEFAULTS = {
"enable_condenser": True,
"condenser_max_size": 240, # Maximum number of events before condensing
"condenser_keep_first": 2, # Number of initial events to always keep
}

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "princeton-nlp/SWE-bench_Multimodal",
"split": "dev",
"num_workers": 30,
**CONDENSER_DEFAULTS,
}

# Evaluation defaults (used by eval_infer.py)
27 changes: 22 additions & 5 deletions benchmarks/swebenchmultimodal/run_infer.py
@@ -40,6 +40,7 @@
Tool,
get_logger,
)
from openhands.sdk.context.condenser import LLMSummarizingCondenser
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
@@ -257,15 +258,22 @@ def evaluate_instance(
)
if self.metadata.enable_delegation:
tools.append(Tool(name=DelegateTool.name))

# Create condenser if enabled
condenser = None
if self.metadata.enable_condenser:
condenser = LLMSummarizingCondenser(
llm=self.metadata.llm.model_copy(update={"service_id": "condenser"}),
max_size=self.metadata.condenser_max_size,
keep_first=self.metadata.condenser_keep_first,
)

agent = Agent(
llm=self.metadata.llm,
tools=tools,
system_prompt_kwargs={"cli_mode": True},
# TODO: we can enable condenser and security analyzer later
# and have them configurable via EvalMetadata
# condenser=get_default_condenser(
# llm=self.metadata.llm.model_copy(update={"service_id": "condenser"})
# ),
condenser=condenser,
# TODO: we can enable security analyzer later
# security_analyzer=LLMSecurityAnalyzer(),
)

@@ -461,6 +469,12 @@ def main() -> None:
critic = create_critic(args)
logger.info(f"Using critic: {type(critic).__name__}")

# Handle condenser configuration
# --disable-condenser takes precedence over --enable-condenser and defaults
enable_condenser = args.enable_condenser
if args.disable_condenser:
enable_condenser = False

metadata = EvalMetadata(
llm=llm,
dataset=args.dataset,
@@ -477,6 +491,9 @@
max_retries=args.max_retries,
workspace_type=args.workspace,
enable_delegation=args.enable_delegation,
enable_condenser=enable_condenser,
condenser_max_size=args.condenser_max_size,
condenser_keep_first=args.condenser_keep_first,
)

# Run orchestrator with a simple JSONL writer
10 changes: 10 additions & 0 deletions benchmarks/swtbench/config.py
@@ -4,11 +4,21 @@
Default values aligned with evaluation repository (OpenHands/evaluation).
"""

# Condenser configuration
# The condenser manages conversation context by automatically truncating history
# when it exceeds max_size and replacing dropped events with an LLM-generated summary.
CONDENSER_DEFAULTS = {
"enable_condenser": True,
"condenser_max_size": 240, # Maximum number of events before condensing
"condenser_keep_first": 2, # Number of initial events to always keep
}

# Inference defaults (used by run_infer.py)
INFER_DEFAULTS = {
"dataset": "eth-sri/SWT-bench_Verified_bm25_27k_zsp",
"split": "test",
"num_workers": 30,
**CONDENSER_DEFAULTS,
}

# Evaluation defaults (used by eval_infer.py)