From 1012aa0e83a1de7966cdddb77773d8f5486b2636 Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 13:46:40 -0800 Subject: [PATCH 1/6] agent hyperparam interface --- src/cloudai/cli/handlers.py | 51 +++++++++++++++++-- src/cloudai/models/agent_config.py | 79 ++++++++++++++++++++++++++++++ src/cloudai/models/workload.py | 3 +- 3 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 src/cloudai/models/agent_config.py diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index d474ff421..df381c791 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,11 +21,12 @@ import signal from contextlib import contextmanager from pathlib import Path -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional from unittest.mock import Mock import toml import yaml +from pydantic import ValidationError from cloudai.core import ( BaseInstaller, @@ -40,6 +41,11 @@ TestParser, TestScenario, ) +from cloudai.models.agent_config import ( + BayesianOptimizationConfig, + GeneticAlgorithmConfig, + MultiArmedBanditConfig, +) from cloudai.models.scenario import ReportConfig from cloudai.models.workload import TestDefinition from cloudai.parser import HOOK_ROOT @@ -145,7 +151,19 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: continue env = CloudAIGymEnv(test_run=test_run, runner=runner.runner) - agent = agent_class(env) + + try: + agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) + except ValidationError as e: + logging.error(f"Invalid agent_config for agent '{agent_type}':") + for error in e.errors(): + field = ".".join(str(loc) for loc in error["loc"]) + logging.error(f" - {field}: {error['msg']}") + err = 1 + continue + + agent = agent_class(env, **agent_overrides) + for step in range(agent.max_steps): result = agent.select_action() if result is None: @@ -166,6 +184,33 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: return err +def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]]) -> dict[str, Any]: + """ + Validate and process agent configuration overrides. + """ + if not agent_config: + return {} + + config_class_map = { + "ga": GeneticAlgorithmConfig, + "bo": BayesianOptimizationConfig, + "mab": MultiArmedBanditConfig, + } + + config_class = config_class_map.get(agent_type) + if not config_class: + logging.debug(f"No config validation available for agent type '{agent_type}', using defaults.") + return {} + + validated_config = config_class.model_validate(agent_config) + agent_kwargs = validated_config.model_dump(exclude_none=True) + + if agent_kwargs: + logging.info(f"Applying agent config overrides for '{agent_type}': {agent_kwargs}") + + return agent_kwargs + + def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None: registry = Registry() diff --git a/src/cloudai/models/agent_config.py b/src/cloudai/models/agent_config.py new file mode 100644 index 000000000..6688baaa2 --- /dev/null +++ b/src/cloudai/models/agent_config.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC +from typing import Any, Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class AgentConfig(BaseModel, ABC): + """ + Base configuration for agent overrides. + """ + + model_config = ConfigDict(extra="forbid") + random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") + +class GeneticAlgorithmConfig(AgentConfig): + """ + Configuration overrides for Genetic Algorithm agent. + """ + + population_size: Optional[int] = Field(default=None, ge=2, description="Population size for the genetic algorithm") + n_offsprings: Optional[int] = Field(default=None, ge=1, description="Number of offsprings per generation") + crossover_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Crossover probability") + mutation_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Mutation probability") + + +class BayesianOptimizationConfig(AgentConfig): + """ + Configuration overrides for Bayesian Optimization agent. + """ + + sobol_num_trials: Optional[int] = Field(default=None, ge=1, description="Number of SOBOL initialization trials") + botorch_num_trials: Optional[int] = Field( + default=None, description="Number of BoTorch trials (-1 for unlimited until max_steps)" + ) + +class MultiArmedBanditConfig(AgentConfig): + """ + Configuration overrides for Multi-Armed Bandit agent. + """ + + algorithm: Optional[str] = Field( + default=None, + description="MAB algorithm: ucb1, ts (thompson_sampling), epsilon_greedy, softmax, or random", + ) + algorithm_params: Optional[dict[str, Any]] = Field( + default=None, description="Algorithm-specific parameters (e.g., alpha for UCB1, epsilon for epsilon_greedy)" + ) + seed_parameters: Optional[dict[str, Any]] = Field( + default=None, description="Initial seed configuration to evaluate first" + ) + max_arms: Optional[int] = Field(default=None, ge=1, description="Maximum number of arms in the action space") + warm_start_size: Optional[int] = Field( + default=None, ge=0, description="Number of arms to randomly explore initially" + ) + epsilon_override: Optional[float] = Field( + default=None, ge=0.0, le=1.0, description="Epsilon value for exploration (overrides algorithm epsilon)" + ) + max_explore_steps: Optional[int] = Field( + default=None, ge=0, description="Maximum steps for epsilon exploration (None for unlimited)" + ) + prefer_unseen_random: Optional[bool] = Field( + default=None, description="Prefer unseen arms during random exploration (epsilon)" + ) diff --git a/src/cloudai/models/workload.py b/src/cloudai/models/workload.py index 1745ae734..0a962cf59 100644 --- a/src/cloudai/models/workload.py +++ b/src/cloudai/models/workload.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -107,6 +107,7 @@ class TestDefinition(BaseModel, ABC): agent_steps: int = 1 agent_metrics: list[str] = Field(default=["default"]) agent_reward_function: str = "inverse" + agent_config: Optional[dict[str, Any]] = None @property def cmd_args_dict(self) -> Dict[str, Union[str, List[str]]]: From bd551e49cf05d2ce4cb9ae712c45e28bea1e4abf Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 13:50:13 -0800 Subject: [PATCH 2/6] fix formatting --- src/cloudai/cli/handlers.py | 14 ++++++-------- src/cloudai/models/agent_config.py | 18 ++++++------------ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index df381c791..674013f5e 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -151,7 +151,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: continue env = CloudAIGymEnv(test_run=test_run, runner=runner.runner) - + try: agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) except ValidationError as e: @@ -161,7 +161,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: logging.error(f" - {field}: {error['msg']}") err = 1 continue - + agent = agent_class(env, **agent_overrides) for step in range(agent.max_steps): @@ -185,9 +185,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]]) -> dict[str, Any]: - """ - Validate and process agent configuration overrides. - """ + """Validate and process agent configuration overrides.""" if not agent_config: return {} @@ -196,7 +194,7 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A "bo": BayesianOptimizationConfig, "mab": MultiArmedBanditConfig, } - + config_class = config_class_map.get(agent_type) if not config_class: logging.debug(f"No config validation available for agent type '{agent_type}', using defaults.") @@ -204,10 +202,10 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A validated_config = config_class.model_validate(agent_config) agent_kwargs = validated_config.model_dump(exclude_none=True) - + if agent_kwargs: logging.info(f"Applying agent config overrides for '{agent_type}': {agent_kwargs}") - + return agent_kwargs diff --git a/src/cloudai/models/agent_config.py b/src/cloudai/models/agent_config.py index 6688baaa2..3e090a622 100644 --- a/src/cloudai/models/agent_config.py +++ b/src/cloudai/models/agent_config.py @@ -21,17 +21,14 @@ class AgentConfig(BaseModel, ABC): - """ - Base configuration for agent overrides. - """ + """Base configuration for agent overrides.""" model_config = ConfigDict(extra="forbid") random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") + class GeneticAlgorithmConfig(AgentConfig): - """ - Configuration overrides for Genetic Algorithm agent. - """ + """Configuration overrides for Genetic Algorithm agent.""" population_size: Optional[int] = Field(default=None, ge=2, description="Population size for the genetic algorithm") n_offsprings: Optional[int] = Field(default=None, ge=1, description="Number of offsprings per generation") @@ -40,19 +37,16 @@ class GeneticAlgorithmConfig(AgentConfig): class BayesianOptimizationConfig(AgentConfig): - """ - Configuration overrides for Bayesian Optimization agent. - """ + """Configuration overrides for Bayesian Optimization agent.""" sobol_num_trials: Optional[int] = Field(default=None, ge=1, description="Number of SOBOL initialization trials") botorch_num_trials: Optional[int] = Field( default=None, description="Number of BoTorch trials (-1 for unlimited until max_steps)" ) + class MultiArmedBanditConfig(AgentConfig): - """ - Configuration overrides for Multi-Armed Bandit agent. - """ + """Configuration overrides for Multi-Armed Bandit agent.""" algorithm: Optional[str] = Field( default=None, From c1ecf2563120941d83c53b9d96150b211b39cdeb Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Tue, 27 Jan 2026 14:11:34 -0800 Subject: [PATCH 3/6] default pass no kwargs --- src/cloudai/cli/handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 674013f5e..c8ff8a74f 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -162,7 +162,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: err = 1 continue - agent = agent_class(env, **agent_overrides) + agent = agent_class(env, **agent_overrides) if agent_overrides else agent_class(env) for step in range(agent.max_steps): result = agent.select_action() From 9e7d6c35f5e59510fcbb335ca4754027fdb7c5ef Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Thu, 29 Jan 2026 13:27:52 -0800 Subject: [PATCH 4/6] update error logging --- src/cloudai/cli/handlers.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index c8ff8a74f..e7f7f205b 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -155,10 +155,12 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: try: agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) except ValidationError as e: - logging.error(f"Invalid agent_config for agent '{agent_type}':") - for error in e.errors(): - field = ".".join(str(loc) for loc in error["loc"]) - logging.error(f" - {field}: {error['msg']}") + items = ", ".join(str(loc) for error in e.errors() for loc in error["loc"]) + logging.error(f"Invalid agent_config for agent '{agent_type}': {items}") + valid_overrides = validate_agent_overrides(agent_type) + logging.error(f"Valid overrides: ") + for item in valid_overrides.items(): + logging.error(f" - {item[0]}: {item[1]}") err = 1 continue @@ -184,21 +186,30 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: return err -def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]]) -> dict[str, Any]: - """Validate and process agent configuration overrides.""" - if not agent_config: - return {} - +def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]] = None) -> dict[str, Any]: + """ + Validate and process agent configuration overrides. + If agent_config is empty, returns the available configuration fields for the agent type. + """ config_class_map = { "ga": GeneticAlgorithmConfig, - "bo": BayesianOptimizationConfig, + "bo_gp": BayesianOptimizationConfig, "mab": MultiArmedBanditConfig, } config_class = config_class_map.get(agent_type) if not config_class: - logging.debug(f"No config validation available for agent type '{agent_type}', using defaults.") - return {} + valid_types = ", ".join(f"'{t}'" for t in config_class_map.keys()) + raise ValueError( + f"Agent type '{agent_type}' does not support configuration overrides. " + f"Valid agent types are: {valid_types}. " + ) + + if not agent_config: + available_overrides = {} + for field_name, field_info in config_class.model_fields.items(): + available_overrides[field_name] = field_info.description + return available_overrides validated_config = config_class.model_validate(agent_config) agent_kwargs = validated_config.model_dump(exclude_none=True) From e7fee0c10a445310f0eb0c8722d94792aeb4258b Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Thu, 29 Jan 2026 13:28:47 -0800 Subject: [PATCH 5/6] fix docstring --- src/cloudai/cli/handlers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index e7f7f205b..44a506f66 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -158,7 +158,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: items = ", ".join(str(loc) for error in e.errors() for loc in error["loc"]) logging.error(f"Invalid agent_config for agent '{agent_type}': {items}") valid_overrides = validate_agent_overrides(agent_type) - logging.error(f"Valid overrides: ") + logging.error("Valid overrides: ") for item in valid_overrides.items(): logging.error(f" - {item[0]}: {item[1]}") err = 1 @@ -189,6 +189,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, Any]] = None) -> dict[str, Any]: """ Validate and process agent configuration overrides. + If agent_config is empty, returns the available configuration fields for the agent type. """ config_class_map = { @@ -199,7 +200,7 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A config_class = config_class_map.get(agent_type) if not config_class: - valid_types = ", ".join(f"'{t}'" for t in config_class_map.keys()) + valid_types = ", ".join(f"'{t}'" for t in config_class_map) raise ValueError( f"Agent type '{agent_type}' does not support configuration overrides. " f"Valid agent types are: {valid_types}. " From cfb38f5c8568ccad9299c0f2421bf28ceb12b448 Mon Sep 17 00:00:00 2001 From: Alex Manley Date: Fri, 30 Jan 2026 16:17:57 -0800 Subject: [PATCH 6/6] make agent interface abstract --- src/cloudai/cli/handlers.py | 45 ++++++++++-------------- src/cloudai/configurator/base_agent.py | 6 +++- src/cloudai/models/agent_config.py | 48 +------------------------- 3 files changed, 24 insertions(+), 75 deletions(-) diff --git a/src/cloudai/cli/handlers.py b/src/cloudai/cli/handlers.py index 44a506f66..475092451 100644 --- a/src/cloudai/cli/handlers.py +++ b/src/cloudai/cli/handlers.py @@ -41,11 +41,6 @@ TestParser, TestScenario, ) -from cloudai.models.agent_config import ( - BayesianOptimizationConfig, - GeneticAlgorithmConfig, - MultiArmedBanditConfig, -) from cloudai.models.scenario import ReportConfig from cloudai.models.workload import TestDefinition from cloudai.parser import HOOK_ROOT @@ -155,12 +150,12 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int: try: agent_overrides = validate_agent_overrides(agent_type, test_run.test.agent_config) except ValidationError as e: - items = ", ".join(str(loc) for error in e.errors() for loc in error["loc"]) - logging.error(f"Invalid agent_config for agent '{agent_type}': {items}") - valid_overrides = validate_agent_overrides(agent_type) + logging.error(f"Invalid agent_config for agent '{agent_type}': ") + for error in e.errors(): + logging.error(f" - {'.'.join(str(var_name) for var_name in error['loc'])}: {error['msg']}") logging.error("Valid overrides: ") - for item in valid_overrides.items(): - logging.error(f" - {item[0]}: {item[1]}") + for item, desc in validate_agent_overrides(agent_type).items(): + logging.error(f" - {item}: {desc}") err = 1 continue @@ -192,32 +187,28 @@ def validate_agent_overrides(agent_type: str, agent_config: Optional[dict[str, A If agent_config is empty, returns the available configuration fields for the agent type. """ - config_class_map = { - "ga": GeneticAlgorithmConfig, - "bo_gp": BayesianOptimizationConfig, - "mab": MultiArmedBanditConfig, - } + registry = Registry() + config_class_map = {} + for agent_name, agent_class in registry.agents_map.items(): + if agent_class.config: + config_class_map[agent_name] = agent_class.config config_class = config_class_map.get(agent_type) if not config_class: - valid_types = ", ".join(f"'{t}'" for t in config_class_map) + valid_types = ", ".join(f"'{agent_name}'" for agent_name in config_class_map) raise ValueError( f"Agent type '{agent_type}' does not support configuration overrides. " f"Valid agent types are: {valid_types}. " ) - if not agent_config: - available_overrides = {} - for field_name, field_info in config_class.model_fields.items(): - available_overrides[field_name] = field_info.description - return available_overrides - - validated_config = config_class.model_validate(agent_config) - agent_kwargs = validated_config.model_dump(exclude_none=True) - - if agent_kwargs: + if agent_config: + validated_config = config_class.model_validate(agent_config) + agent_kwargs = validated_config.model_dump(exclude_none=True) logging.info(f"Applying agent config overrides for '{agent_type}': {agent_kwargs}") - + else: + agent_kwargs = {} + for field_name, field_info in config_class.model_fields.items(): + agent_kwargs[field_name] = field_info.description return agent_kwargs diff --git a/src/cloudai/configurator/base_agent.py b/src/cloudai/configurator/base_agent.py index dbd397099..4b806a53c 100644 --- a/src/cloudai/configurator/base_agent.py +++ b/src/cloudai/configurator/base_agent.py @@ -15,7 +15,9 @@ # limitations under the License. from abc import ABC, abstractmethod -from typing import Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple + +from cloudai.models.agent_config import AgentConfig from .base_gym import BaseGym @@ -28,6 +30,8 @@ class BaseAgent(ABC): Automatically infers parameter types from TestRun's cmd_args. """ + config: Optional[AgentConfig] = None + def __init__(self, env: BaseGym): """ Initialize the agent with the environment. diff --git a/src/cloudai/models/agent_config.py b/src/cloudai/models/agent_config.py index 3e090a622..0b04059aa 100644 --- a/src/cloudai/models/agent_config.py +++ b/src/cloudai/models/agent_config.py @@ -15,7 +15,7 @@ # limitations under the License. from abc import ABC -from typing import Any, Optional +from typing import Optional from pydantic import BaseModel, ConfigDict, Field @@ -25,49 +25,3 @@ class AgentConfig(BaseModel, ABC): model_config = ConfigDict(extra="forbid") random_seed: Optional[int] = Field(default=None, description="Random seed for reproducibility") - - -class GeneticAlgorithmConfig(AgentConfig): - """Configuration overrides for Genetic Algorithm agent.""" - - population_size: Optional[int] = Field(default=None, ge=2, description="Population size for the genetic algorithm") - n_offsprings: Optional[int] = Field(default=None, ge=1, description="Number of offsprings per generation") - crossover_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Crossover probability") - mutation_prob: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Mutation probability") - - -class BayesianOptimizationConfig(AgentConfig): - """Configuration overrides for Bayesian Optimization agent.""" - - sobol_num_trials: Optional[int] = Field(default=None, ge=1, description="Number of SOBOL initialization trials") - botorch_num_trials: Optional[int] = Field( - default=None, description="Number of BoTorch trials (-1 for unlimited until max_steps)" - ) - - -class MultiArmedBanditConfig(AgentConfig): - """Configuration overrides for Multi-Armed Bandit agent.""" - - algorithm: Optional[str] = Field( - default=None, - description="MAB algorithm: ucb1, ts (thompson_sampling), epsilon_greedy, softmax, or random", - ) - algorithm_params: Optional[dict[str, Any]] = Field( - default=None, description="Algorithm-specific parameters (e.g., alpha for UCB1, epsilon for epsilon_greedy)" - ) - seed_parameters: Optional[dict[str, Any]] = Field( - default=None, description="Initial seed configuration to evaluate first" - ) - max_arms: Optional[int] = Field(default=None, ge=1, description="Maximum number of arms in the action space") - warm_start_size: Optional[int] = Field( - default=None, ge=0, description="Number of arms to randomly explore initially" - ) - epsilon_override: Optional[float] = Field( - default=None, ge=0.0, le=1.0, description="Epsilon value for exploration (overrides algorithm epsilon)" - ) - max_explore_steps: Optional[int] = Field( - default=None, ge=0, description="Maximum steps for epsilon exploration (None for unlimited)" - ) - prefer_unseen_random: Optional[bool] = Field( - default=None, description="Prefer unseen arms during random exploration (epsilon)" - )