From c3d819cc8bfcb97f8909c36196ec01721d460d19 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Mon, 24 Nov 2025 17:34:41 +0000 Subject: [PATCH 01/14] Adding optimizer registry and its test cases Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/optimizer.py | 22 +++++++ .../experimental/tests/test_optimizer.py | 62 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 QEfficient/finetune/experimental/tests/test_optimizer.py diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index d647b73a6..2304a1525 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -4,3 +4,25 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +""" +Optimizer components for the training system. +""" + +from typing import Type + +import torch.optim as optim +from torch.optim import Optimizer + +from QEfficient.finetune.experimental.core.component_registry import registry + +registry.optimizer("adam")(optim.Adam) +registry.optimizer("adamw")(optim.AdamW) +registry.optimizer("sgd")(optim.SGD) + + +def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: + optimizer_cls = registry.get_optimizer(optimizer_name) + if optimizer_cls is None: + raise ValueError(f"Unknown optimizer: {optimizer_name}") + return optimizer_cls diff --git a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py new file mode 100644 index 000000000..b1f6f82be --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_optimizer.py @@ -0,0 +1,62 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import sys +from pathlib import Path + +import pytest +import torch.optim as optim + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory + +sys.path.insert(0, str(Path(__file__).parent.parent)) +OPTIMIZER_CONFIGS = { + "adam": { + "name": "adam", + "opt_cls": optim.Adam, + "lr": 1e-4, + "weight_decay": 0.01, + "betas": (0.9, 0.999), + "eps": 1e-8, + "amsgrad": False, + }, + "adamw": { + "name": "adamw", + "opt_cls": optim.AdamW, + "lr": 1e-4, + "weight_decay": 0.01, + "betas": (0.9, 0.999), + "eps": 1e-8, + "amsgrad": False, + }, + "sgd": { + "name": "sgd", + "opt_cls": optim.SGD, + "lr": 1e-4, + "momentum": 0.9, + "weight_decay": 0.01, + "dampening": 0.0, + "nesterov": False, + }, +} + + +@pytest.fixture +def ref_model(): + return QEFFAutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") + + +@pytest.mark.parametrize("opt_name", OPTIMIZER_CONFIGS.keys()) +def test_optimizers(opt_name, ref_model): + """Test that all optimizers can be created with their configs.""" + # Create optimizer using the factory + config = OPTIMIZER_CONFIGS[opt_name] + opt_inst = ComponentFactory.create_optimizer(**config, model_params=ref_model.model.parameters()) + assert opt_inst is not None + assert isinstance(opt_inst, optim.Optimizer) + assert len(list(opt_inst.param_groups)) == 1 From 9e402bcfbaa69280d598eb95aa129dc09209a232 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 27 Nov 2025 07:00:25 +0000 Subject: [PATCH 02/14] Adding optimizer registry and its test cases Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/optimizer.py | 12 +++-- .../experimental/tests/test_optimizer.py | 47 ++++++++++++++----- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index 2304a1525..64c110ec5 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -16,9 +16,15 @@ from QEfficient.finetune.experimental.core.component_registry import registry -registry.optimizer("adam")(optim.Adam) -registry.optimizer("adamw")(optim.AdamW) -registry.optimizer("sgd")(optim.SGD) + +def register_optimizer(optimizer_name: str, cls: Type[Optimizer]) -> None: + """Register a new optimizer class.""" + registry.optimizer(optimizer_name)(cls) + + +register_optimizer("adam", optim.Adam) +register_optimizer("adamw", optim.AdamW) +register_optimizer("sgd", optim.SGD) def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: diff --git a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py index b1f6f82be..d84a2a524 100644 --- a/QEfficient/finetune/experimental/tests/test_optimizer.py +++ b/QEfficient/finetune/experimental/tests/test_optimizer.py @@ -5,19 +5,20 @@ # # ----------------------------------------------------------------------------- +import inspect import sys from pathlib import Path import pytest +import torch.nn as nn import torch.optim as optim -from QEfficient import QEFFAutoModelForCausalLM -from QEfficient.finetune.experimental.core.component_registry import ComponentFactory +from QEfficient.finetune.experimental.core.optimizer import get_optimizer_cls, register_optimizer sys.path.insert(0, str(Path(__file__).parent.parent)) OPTIMIZER_CONFIGS = { 
"adam": { - "name": "adam", + "optimizer_name": "adam", "opt_cls": optim.Adam, "lr": 1e-4, "weight_decay": 0.01, @@ -26,7 +27,7 @@ "amsgrad": False, }, "adamw": { - "name": "adamw", + "optimizer_name": "adamw", "opt_cls": optim.AdamW, "lr": 1e-4, "weight_decay": 0.01, @@ -35,7 +36,7 @@ "amsgrad": False, }, "sgd": { - "name": "sgd", + "optimizer_name": "sgd", "opt_cls": optim.SGD, "lr": 1e-4, "momentum": 0.9, @@ -47,16 +48,40 @@ @pytest.fixture -def ref_model(): - return QEFFAutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") +def dummy_model(): + return nn.Sequential( + nn.Linear(10, 5), + nn.ReLU(), + nn.Linear(5, 1), + ) @pytest.mark.parametrize("opt_name", OPTIMIZER_CONFIGS.keys()) -def test_optimizers(opt_name, ref_model): +def test_optimizers(opt_name, dummy_model): """Test that all optimizers can be created with their configs.""" - # Create optimizer using the factory + # Register optimizer class config = OPTIMIZER_CONFIGS[opt_name] - opt_inst = ComponentFactory.create_optimizer(**config, model_params=ref_model.model.parameters()) - assert opt_inst is not None + register_optimizer(config["optimizer_name"], config["opt_cls"]) + optimizer_class = get_optimizer_cls(config["optimizer_name"]) + assert optimizer_class is not None + assert optimizer_class == config["opt_cls"] + valid_params = inspect.signature(optimizer_class).parameters + filtered_config = {k: v for k, v in config.items() if k in valid_params} + opt_inst = optimizer_class(dummy_model.parameters(), **filtered_config) assert isinstance(opt_inst, optim.Optimizer) assert len(list(opt_inst.param_groups)) == 1 + assert opt_inst.param_groups[0]["lr"] == config["lr"] + if "weight_decay" in config: + assert opt_inst.param_groups[0]["weight_decay"] == config["weight_decay"] + if "betas" in config: + assert opt_inst.param_groups[0]["betas"] == config["betas"] + if "eps" in config: + assert opt_inst.param_groups[0]["eps"] == config["eps"] + if "momentum" in config: + assert opt_inst.param_groups[0]["momentum"] == config["momentum"] + if "dampening" in config: + assert opt_inst.param_groups[0]["dampening"] == config["dampening"] + if "nesterov" in config: + assert opt_inst.param_groups[0]["nesterov"] == config["nesterov"] + if "amsgrad" in config: + assert opt_inst.param_groups[0]["amsgrad"] == config["amsgrad"] From 34f15c4f96aa97eaa392e6c6bbcc9e275bdcc881 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Fri, 28 Nov 2025 06:00:44 +0000 Subject: [PATCH 03/14] Adding optimizer registry and its test cases Signed-off-by: Tanisha Chawada --- QEfficient/finetune/experimental/core/optimizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index 64c110ec5..de28848ae 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -32,3 +32,11 @@ def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: if optimizer_cls is None: raise ValueError(f"Unknown optimizer: {optimizer_name}") return optimizer_cls + + +def get_optimizer(opt_config): + opt_name = opt_config.pop("optimizer_name") + opt_cls = get_optimizer_cls(opt_name) + opt_config["lr"] = float(opt_config["lr"]) + optimizer_cls_and_kwargs = (opt_cls, opt_config) + return optimizer_cls_and_kwargs From 1d7d4c2d9255cd39971059fdd35aa8b67dc8bf58 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 2 Dec 2025 07:19:58 +0000 Subject: [PATCH 04/14] [QEff. 
Finetuning]: Optimizer registry and test case inclusion Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/optimizer.py | 23 +++--- .../experimental/tests/test_optimizer.py | 78 ++++++++++--------- 2 files changed, 56 insertions(+), 45 deletions(-) diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index de28848ae..2f77ce285 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -16,18 +16,18 @@ from QEfficient.finetune.experimental.core.component_registry import registry - -def register_optimizer(optimizer_name: str, cls: Type[Optimizer]) -> None: - """Register a new optimizer class.""" - registry.optimizer(optimizer_name)(cls) - - -register_optimizer("adam", optim.Adam) -register_optimizer("adamw", optim.AdamW) -register_optimizer("sgd", optim.SGD) +registry.optimizer("Adam")(optim.Adam) +registry.optimizer("AdamW")(optim.AdamW) +registry.optimizer("SGD")(optim.SGD) def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: + """ + Get optimizer class from registry. + Args: optimizer_name: Name of the optimizer to retrieve. + Returns: Optimizer class. + Raises: ValueError: If optimizer name is not found in registry. + """ optimizer_cls = registry.get_optimizer(optimizer_name) if optimizer_cls is None: raise ValueError(f"Unknown optimizer: {optimizer_name}") @@ -35,6 +35,11 @@ def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: def get_optimizer(opt_config): + """ + Create optimizer from config. + Args: opt_config: Dictionary containing optimizer configuration. + Returns: Tuple of optimizer class and its arguments. + """ opt_name = opt_config.pop("optimizer_name") opt_cls = get_optimizer_cls(opt_name) opt_config["lr"] = float(opt_config["lr"]) diff --git a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py index d84a2a524..d9225f6de 100644 --- a/QEfficient/finetune/experimental/tests/test_optimizer.py +++ b/QEfficient/finetune/experimental/tests/test_optimizer.py @@ -5,20 +5,16 @@ # # ----------------------------------------------------------------------------- -import inspect -import sys -from pathlib import Path - import pytest import torch.nn as nn import torch.optim as optim -from QEfficient.finetune.experimental.core.optimizer import get_optimizer_cls, register_optimizer +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.optimizer import get_optimizer, get_optimizer_cls -sys.path.insert(0, str(Path(__file__).parent.parent)) OPTIMIZER_CONFIGS = { - "adam": { - "optimizer_name": "adam", + "Adam": { + "optimizer_name": "Adam", "opt_cls": optim.Adam, "lr": 1e-4, "weight_decay": 0.01, @@ -26,8 +22,8 @@ "eps": 1e-8, "amsgrad": False, }, - "adamw": { - "optimizer_name": "adamw", + "AdamW": { + "optimizer_name": "AdamW", "opt_cls": optim.AdamW, "lr": 1e-4, "weight_decay": 0.01, @@ -35,8 +31,8 @@ "eps": 1e-8, "amsgrad": False, }, - "sgd": { - "optimizer_name": "sgd", + "SGD": { + "optimizer_name": "SGD", "opt_cls": optim.SGD, "lr": 1e-4, "momentum": 0.9, @@ -44,6 +40,17 @@ "dampening": 0.0, "nesterov": False, }, + "RMSprop": { + "optimizer_name": "RMSprop", + "opt_cls": optim.RMSprop, + }, +} + +REGISTRY_CONFIG = { + "RMSprop": { + "optimizer_name": "RMSprop", + "opt_cls": optim.RMSprop, + }, } @@ -58,30 +65,29 @@ def dummy_model(): @pytest.mark.parametrize("opt_name", OPTIMIZER_CONFIGS.keys()) 
def test_optimizers(opt_name, dummy_model):
-    """Test that all optimizers can be created with their configs."""
-    # Register optimizer class
+    """Test that all registered optimizers can be created with their configs."""
     config = OPTIMIZER_CONFIGS[opt_name]
-    register_optimizer(config["optimizer_name"], config["opt_cls"])
-    optimizer_class = get_optimizer_cls(config["optimizer_name"])
-    assert optimizer_class is not None
-    assert optimizer_class == config["opt_cls"]
-    valid_params = inspect.signature(optimizer_class).parameters
-    filtered_config = {k: v for k, v in config.items() if k in valid_params}
-    opt_inst = optimizer_class(dummy_model.parameters(), **filtered_config)
+    config.pop("opt_cls")
+    try:
+        optimizer_class_and_kwargs = get_optimizer(config)
+        assert optimizer_class_and_kwargs is not None
+    except ValueError as e:
+        assert "Unknown optimizer" in str(e)
+        return
+    optimizer_class = optimizer_class_and_kwargs[0]
+    opt_inst = optimizer_class(dummy_model.parameters(), **optimizer_class_and_kwargs[1])
     assert isinstance(opt_inst, optim.Optimizer)
     assert len(list(opt_inst.param_groups)) == 1
-    assert opt_inst.param_groups[0]["lr"] == config["lr"]
-    if "weight_decay" in config:
-        assert opt_inst.param_groups[0]["weight_decay"] == config["weight_decay"]
-    if "betas" in config:
-        assert opt_inst.param_groups[0]["betas"] == config["betas"]
-    if "eps" in config:
-        assert opt_inst.param_groups[0]["eps"] == config["eps"]
-    if "momentum" in config:
-        assert opt_inst.param_groups[0]["momentum"] == config["momentum"]
-    if "dampening" in config:
-        assert opt_inst.param_groups[0]["dampening"] == config["dampening"]
-    if "nesterov" in config:
-        assert opt_inst.param_groups[0]["nesterov"] == config["nesterov"]
-    if "amsgrad" in config:
-        assert opt_inst.param_groups[0]["amsgrad"] == config["amsgrad"]
+
+    for key in ["lr", "weight_decay", "betas", "eps", "momentum", "dampening", "nesterov", "amsgrad"]:
+        if key in config:
+            assert opt_inst.param_groups[0][key] == config[key], f"{key} mismatch"
+
+
+@pytest.mark.parametrize("opt_name, opt_config", REGISTRY_CONFIG.items())
+def test_registered_optimizer(opt_name, opt_config):
+    """Test that an optimizer is registered correctly."""
+    registry.optimizer(opt_name)(opt_config["opt_cls"])
+    optimizer_class = get_optimizer_cls(opt_name)
+    assert optimizer_class is not None
+    assert optimizer_class == opt_config["opt_cls"]

From 926a49f7c217b5c5b6ff7034665cd0e6b8c5d01c Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Wed, 3 Dec 2025 10:01:06 +0000
Subject: [PATCH 05/14] [QEff_Finetuning] Adding callback and its test cases.
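
This change registers the stock Hugging Face callbacks (early stopping,
printer, default flow, tensorboard) with the component registry and adds
QAIC-specific profiler and op-by-op verifier callbacks. As a rough usage
sketch (the callback name and class below are illustrative, not part of a
stable API), a custom callback can be registered and instantiated through
the same registry:

    from transformers.trainer_callback import TrainerCallback

    from QEfficient.finetune.experimental.core.callbacks import create_callbacks
    from QEfficient.finetune.experimental.core.component_registry import registry

    @registry.callback("loss_printer")  # hypothetical callback name
    class LossPrinterCallback(TrainerCallback):
        def on_log(self, args, state, control, logs=None, **kwargs):
            # Print only the loss from each log event.
            if logs and "loss" in logs:
                print(f"step {state.global_step}: loss={logs['loss']}")

    cb = create_callbacks("loss_printer")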
Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/callbacks.py | 182 ++++++++++++++++++ .../experimental/core/utils/profiler_utils.py | 88 +++++++++ .../experimental/tests/test_callback.py | 79 ++++++++ 3 files changed, 349 insertions(+) create mode 100644 QEfficient/finetune/experimental/tests/test_callback.py diff --git a/QEfficient/finetune/experimental/core/callbacks.py b/QEfficient/finetune/experimental/core/callbacks.py index d647b73a6..3267fb2c7 100644 --- a/QEfficient/finetune/experimental/core/callbacks.py +++ b/QEfficient/finetune/experimental/core/callbacks.py @@ -4,3 +4,185 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import json +import os +from typing import Any, Dict, Optional + +from transformers import ( + DefaultFlowCallback, + EarlyStoppingCallback, + PrinterCallback, + ProgressCallback, + TrainingArguments, +) +from transformers.integrations.integration_utils import TensorBoardCallback +from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState + +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.utils.profiler_utils import ( + get_op_verifier_ctx, + init_qaic_profiling, + stop_qaic_profiling, +) + +registry.callback("early_stopping")(EarlyStoppingCallback) +registry.callback("printer")(PrinterCallback) +registry.callback("default_flow")(DefaultFlowCallback) +registry.callback("tensorboard")(TensorBoardCallback) + + +@registry.callback("enhanced_progressbar") +class EnhancedProgressCallback(ProgressCallback): + """ + A [`TrainerCallback`] that displays the progress of training or evaluation. + You can modify `max_str_len` to control how long strings are truncated when logging. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the callback with optional max_str_len parameter to control string truncation length. + + Args: + max_str_len (`int`): + Maximum length of strings to display in logs. + Longer strings will be truncated with a message. + """ + super().__init__(*args, **kwargs) + + def on_train_begin(self, args, state, control, **kwargs): + super().on_train_begin(args, state, control, **kwargs) + if self.training_bar is not None: + self.training_bar.set_description("Training Progress") + + def on_log(self, args, state, control, logs=None, **kwargs): + if state.is_world_process_zero and self.training_bar is not None: + # make a shallow copy of logs so we can mutate the fields copied + # but avoid doing any value pickling. + shallow_logs = {} + for k, v in logs.items(): + if isinstance(v, str) and len(v) > self.max_str_len: + shallow_logs[k] = ( + f"[String too long to display, length: {len(v)} > {self.max_str_len}. " + "Consider increasing `max_str_len` if needed.]" + ) + else: + shallow_logs[k] = v + _ = shallow_logs.pop("total_flos", None) + # round numbers so that it looks better in console + if "epoch" in shallow_logs: + shallow_logs["epoch"] = round(shallow_logs["epoch"], 2) + + updated_dict = {} + if "epoch" in shallow_logs: + updated_dict["epoch"] = shallow_logs["epoch"] + if "loss" in shallow_logs: + updated_dict["loss"] = shallow_logs["loss"] + if "learning_rate" in shallow_logs: + updated_dict["lr"] = shallow_logs["learning_rate"] + self.training_bar.set_postfix(updated_dict) + + +@registry.callback("json_logger") +class JSONLoggerCallback(TrainerCallback): + """ + A [`TrainerCallback`] that logs training and evaluation metrics to a JSON file. 
+ """ + + def __init__(self, log_path=None, *args, **kwargs): + """ + Initialize the callback with the path to the JSON log file. + + Args: + log_path (`str`): + Path to the jsonl file where logs will be saved. + """ + super().__init__(*args, **kwargs) + if log_path is None: + log_path = os.path.join(os.environ.get("OUTPUT_DIR", "./"), "training_logs.jsonl") + self.log_path = log_path + # Ensure the log file is created and empty + with open(self.log_path, "w") as _: + pass + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Optional[Dict] = None, + **kwargs, + ): + if logs is None: + return + logs.pop("entropy") + logs.pop("mean_token_accuracy") + if state.global_step: + logs["global_step"] = state.global_step + if logs is not None: + with open(self.log_path, "a") as f: + json_line = json.dumps(logs, separators=(",", ":")) + f.write(json_line + "\n") + + +@registry.callback("qaic_profiler_callback") +class QAICProfilerCallback(TrainerCallback): + def __init__(self, *args, **kwargs): + self.start_step = kwargs.get("start_step", -1) + self.end_step = kwargs.get("end_step", -1) + self.device_ids = kwargs.get("device_ids", [0]) + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + if state.global_step == self.start_step: + for device_id in self.device_ids: + init_qaic_profiling(True, f"qaic:{device_id}") + elif state.global_step == self.end_step: + for device_id in self.device_ids: + stop_qaic_profiling(True, f"qaic:{device_id}") + + +@registry.callback("qaic_op_by_op_verifier_callback") +class QAICOpByOpVerifierCallback(TrainerCallback): + def __init__(self, *args, **kwargs): + self.start_step = kwargs.get("start_step", -1) + self.end_step = kwargs.get("end_step", -1) + self.trace_dir = kwargs.get("trace_dir", "qaic_op_by_op_traces") + self.atol = kwargs.get("atol", 1e-1) + self.rtol = kwargs.get("rtol", 1e-5) + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + if self.start_step <= state.global_step < self.end_step: + self.op_verifier_ctx_step = get_op_verifier_ctx( + use_op_by_op_verifier=True, + device_type="qaic", + dump_dir=self.trace_dir, + step=state.global_step, + atol=self.atol, + rtol=self.rtol, + ) + self.op_verifier_ctx_step.__enter__() + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + if self.start_step <= state.global_step < self.end_step: + if self.op_verifier_ctx_step is not None: + self.op_verifier_ctx_step.__exit__(None, None, None) + + +def create_callbacks(name: str, **kwargs) -> Any: + """Create a callback instance.""" + callback_class = registry.get_callback(name) + if callback_class is None: + raise ValueError(f"Unknown callback: {name}. 
Available: {registry.list_callbacks()}")
+    return callback_class(**kwargs)
diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
index d647b73a6..e24508e83 100644
--- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
@@ -4,3 +4,91 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+
+from contextlib import nullcontext
+from typing import ContextManager
+
+import torch
+
+
+def get_op_verifier_ctx(
+    use_op_by_op_verifier: bool,
+    device_type: str,
+    dump_dir: str,
+    step: int,
+    ref_device: str = "cpu",
+    ref_dtype: torch.dtype = torch.float32,
+    atol: float = 1e-1,
+    rtol: float = 1e-5,
+    use_ref_output_on_mismatch: bool = True,
+) -> ContextManager:
+    """Get the op-by-op verifier context manager when op-by-op verification is
+    enabled. It helps in debugging operator-related issues by comparing the
+    operator execution on qaic against cpu. This is meant only for the qaic backend.
+
+    Args:
+        use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier.
+        device_type (str): Device on which the model is being executed.
+        dump_dir (str): Directory to dump the op-by-op verification results.
+        step (int): Step number for which the op-by-op verification is to be performed.
+        ref_device (str, optional): Device to use as reference for verification.
+            Defaults to "cpu".
+        ref_dtype (torch.dtype, optional): Data type to use as reference
+            datatype for verification. Defaults to torch.float32.
+        atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1.
+        rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5.
+        use_ref_output_on_mismatch (bool, optional): If an operator has a
+            mismatch with respect to the reference device, use the reference
+            device outputs and continue the rest of the verification. Defaults to True.
+
+    Returns:
+        ContextManager: Instance of context manager used to verify the operators.
+    """
+    if (not use_op_by_op_verifier) or ("qaic" not in device_type):
+        return nullcontext()
+
+    # Lazily import qaic_debug only when it is actually needed.
+    import torch_qaic.debug as qaic_debug
+
+    filter_config = qaic_debug.DispatchFilterConfig.default(device_type)
+    dump_dir = dump_dir + "/mismatches/step_" + str(step)
+    return qaic_debug.OpByOpVerifierMode(
+        ref_device=ref_device,
+        ref_dtype=ref_dtype,
+        atol=atol,
+        rtol=rtol,
+        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
+        filter_config=filter_config,
+        dump_root_dir=dump_dir,
+    )
+
+
+def init_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Initialize the qaic profiling tool. Note: The profiler only works
+    for the qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+    """
+    if (use_profiler) and ("qaic" in device_type):
+        # Lazily import qaic_profile only when it is actually needed.
+        import torch_qaic.profile as qaic_profile
+
+        qaic_profile.start_profiling(device_type, 1)
+
+
+def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Stop the qaic profiling tool. Note: The profiler only works
+    for the qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+ """ + if (use_profiler) and ("qaic" in device_type): + # Lazily imported qaic's qaic_profile when it is actually needed. + import torch_qaic.profile as qaic_profile + + qaic_profile.stop_profiling(device_type) diff --git a/QEfficient/finetune/experimental/tests/test_callback.py b/QEfficient/finetune/experimental/tests/test_callback.py new file mode 100644 index 000000000..6abdb87df --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_callback.py @@ -0,0 +1,79 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import pytest +from transformers import TrainerCallback + +from QEfficient.finetune.experimental.core.callbacks import create_callbacks +from QEfficient.finetune.experimental.core.component_registry import registry + + +class ModelSummaryCallback(TrainerCallback): + def __init__(self, max_depth=1): + self.max_depth = max_depth + + def on_train_begin(self, args, state, control, **kwargs): + model = kwargs.get("model") + if model is not None: + print("\n=== Model Summary ===") + print(model.__class__.__name__) + # Print layers up to max_depth + depth = 0 + for name, module in model.named_children(): + print(f" {name}: {module.__class__.__name__}") + depth += 1 + if depth >= self.max_depth: + break + print("======================\n") + + +# Setup test data +CALLBACK_CONFIGS = { + "early_stopping": { + "name": "early_stopping", + "early_stopping_patience": 3, + "early_stopping_threshold": 0.001, + }, + "tensorboard": {"name": "tensorboard", "tb_writer": "SummaryWriter"}, + "model_summary": { + "name": "model_summary", + "max_depth": 1, + }, +} + +REGISTRY_CALLBACK_CONFIGS = { + "model_summary": { + "name": "model_summary", + "max_depth": 1, + "callback_class": ModelSummaryCallback, + }, +} + + +@pytest.mark.parametrize("callback_name", CALLBACK_CONFIGS.keys()) +def test_callbacks(callback_name): + """Test that registered callbacks that can be created with their configs.""" + # Create callbacks using the factory + config = CALLBACK_CONFIGS[callback_name] + try: + callback_inst = create_callbacks(**config) + except ValueError as e: + assert "Unknown callback" in str(e) + return + if hasattr(callback_inst, "callback"): + assert callback_inst.callback is not None + else: + assert callback_inst is not None + + +@pytest.mark.parametrize("callback_name,callback_class", REGISTRY_CALLBACK_CONFIGS.items()) +def test_callbacks_registery(callback_name, callback_class): + """Test that a callback registered correctly.""" + registry.callback(callback_name)(callback_class) + callback = registry.get_callback(callback_name) + assert callback is not None + assert callback == callback_class From 000da81e3575c78186d03033865050cf77fc8b61 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Fri, 5 Dec 2025 09:05:57 +0000 Subject: [PATCH 06/14] [QEff_Finetuning] Adding callback and its test cases. 
Signed-off-by: Tanisha Chawada
---
 .../finetune/experimental/core/callbacks.py   | 21 +++++++++++++++++--
 .../experimental/tests/test_callback.py       | 19 +++--------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/QEfficient/finetune/experimental/core/callbacks.py b/QEfficient/finetune/experimental/core/callbacks.py
index 3267fb2c7..30659e3bb 100644
--- a/QEfficient/finetune/experimental/core/callbacks.py
+++ b/QEfficient/finetune/experimental/core/callbacks.py
@@ -51,11 +51,16 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

     def on_train_begin(self, args, state, control, **kwargs):
+        """Set progress bar description at the start of training."""
         super().on_train_begin(args, state, control, **kwargs)
         if self.training_bar is not None:
             self.training_bar.set_description("Training Progress")

     def on_log(self, args, state, control, logs=None, **kwargs):
+        """
+        Override the default `on_log` behavior during training to display
+        the current epoch number, loss, and learning rate in the logs.
+        """
         if state.is_world_process_zero and self.training_bar is not None:
             # make a shallow copy of logs so we can mutate the fields copied
             # but avoid doing any value pickling.
@@ -113,10 +118,11 @@ def on_log(
         logs: Optional[Dict] = None,
         **kwargs,
     ):
+        """Append sanitized log metrics (including global_step) to a JSONL file."""
         if logs is None:
             return
-        logs.pop("entropy")
-        logs.pop("mean_token_accuracy")
+        logs.pop("entropy", None)
+        logs.pop("mean_token_accuracy", None)
         if state.global_step:
             logs["global_step"] = state.global_step
         if logs is not None:
@@ -127,7 +133,13 @@ def on_log(

 @registry.callback("qaic_profiler_callback")
 class QAICProfilerCallback(TrainerCallback):
+    """Callback to profile QAIC devices over a specified training step range."""
+
     def __init__(self, *args, **kwargs):
+        """
+        Initialize QAIC profiler settings (start/end steps and target device IDs).
+        """
+
         self.start_step = kwargs.get("start_step", -1)
         self.end_step = kwargs.get("end_step", -1)
         self.device_ids = kwargs.get("device_ids", [0])
@@ -147,7 +159,12 @@ def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: T

 @registry.callback("qaic_op_by_op_verifier_callback")
 class QAICOpByOpVerifierCallback(TrainerCallback):
+    """Callback to verify QAIC operations step-by-step during a specified training range."""
+
     def __init__(self, *args, **kwargs):
+        """
+ """ self.start_step = kwargs.get("start_step", -1) self.end_step = kwargs.get("end_step", -1) self.trace_dir = kwargs.get("trace_dir", "qaic_op_by_op_traces") diff --git a/QEfficient/finetune/experimental/tests/test_callback.py b/QEfficient/finetune/experimental/tests/test_callback.py index 6abdb87df..18ec3978d 100644 --- a/QEfficient/finetune/experimental/tests/test_callback.py +++ b/QEfficient/finetune/experimental/tests/test_callback.py @@ -13,22 +13,8 @@ class ModelSummaryCallback(TrainerCallback): - def __init__(self, max_depth=1): - self.max_depth = max_depth - - def on_train_begin(self, args, state, control, **kwargs): - model = kwargs.get("model") - if model is not None: - print("\n=== Model Summary ===") - print(model.__class__.__name__) - # Print layers up to max_depth - depth = 0 - for name, module in model.named_children(): - print(f" {name}: {module.__class__.__name__}") - depth += 1 - if depth >= self.max_depth: - break - print("======================\n") + def __init__(self): + pass # Setup test data @@ -68,6 +54,7 @@ def test_callbacks(callback_name): assert callback_inst.callback is not None else: assert callback_inst is not None + assert isinstance(callback_inst, TrainerCallback) @pytest.mark.parametrize("callback_name,callback_class", REGISTRY_CALLBACK_CONFIGS.items()) From 94eab127d635ebef3c88260138c17203f55f0372 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Tue, 9 Dec 2025 06:08:14 +0000 Subject: [PATCH 07/14] [QEff.finetuning] Adding config_manager and its test cases. Signed-off-by: Tanisha Chawada --- .../experimental/core/config_manager.py | 648 ++++++++++++++++++ .../experimental/tests/test_config.yaml | 117 ++++ .../experimental/tests/test_config_manager.py | 50 ++ 3 files changed, 815 insertions(+) create mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml create mode 100644 QEfficient/finetune/experimental/tests/test_config_manager.py diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index d647b73a6..60ed4d4b6 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -4,3 +4,651 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +""" +Configuration manager for handling all training configurations. +Provides centralized configuration loading, validation, and management. 
+""" + +import json +import os +import sys +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import yaml +from transformers.hf_argparser import HfArgumentParser + +from QEfficient.finetune.experimental.core.component_registry import registry + + +@dataclass +class OptimizerConfig: + """Configuration for optimizers.""" + + optimizer_name: str = field( + default="adamw", + metadata={"help": "The name of the optimizer to use."}, + ) + lr: float = field( + default=5e-5, + metadata={"help": "The initial learning rate for the optimizer."}, + ) + weight_decay: float = field( + default=0.01, + metadata={"help": "The weight decay to apply (if any)."}, + ) + + +@dataclass +class SchedulerConfig: + """Configuration for learning rate schedulers.""" + + scheduler_name: str = field( + default="cosine", + metadata={"help": "The name of the scheduler to use (e.g., 'linear', 'cosine')."}, + ) + warmup_steps: int = field( + default=100, + metadata={ + "help": "Number of steps for the warmup phase. If provided " + "value is within [0-1) range then it will be interpreted as " + "ratio of total training steps for the warmup phase." + }, + ) + + +@dataclass +class DatasetConfig: + """Configuration for datasets.""" + + tokenizer_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the tokenizer to use."}, + ) + dataset_type: str = field( + default="seq_completion", + metadata={"help": "The type of dataset (e.g., 'seq_completion')."}, + ) + dataset_name: str = field( + default="knkarthick/samsum", + metadata={"help": "The name or path of the dataset."}, + ) + dataset_subset: str = field( + default="default", + metadata={"help": "The subset of the dataset to use, if applicable."}, + ) + train_split: str = field( + default="train", + metadata={"help": "The name of the training split."}, + ) + test_split: str = field( + default="test", + metadata={"help": "The name of the test/validation split."}, + ) + max_seq_length: int = field( + default=512, + metadata={"help": "The maximum sequence length for tokenization."}, + ) + split_ratio: float = field( + default=0.8, + metadata={"help": "Ratio for train/test split, used when only train_split is provided."}, + ) + input_columns: list[str] = field( + default_factory=lambda: ["text"], + metadata={"help": "List of column names containing input text."}, + ) + target_column: Optional[str] = field( + default=None, + metadata={"help": "Name of the column containing target labels (if applicable)."}, + ) + train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + num_workers: int = field( + default=4, + metadata={"help": "Number of workers for dataset processing."}, + ) + collate_fn: str = field( + default="dynamic_padding", + metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."}, + ) + group_by_length: bool = field( + default=True, + metadata={"help": "Whether to group samples by length to minimize padding."}, + ) + length_column_name: str = field( + default="input_ids", + metadata={"help": "The column name containing the length of the input sequences."}, + ) + dataloader_pin_memory: bool = field( + default=True, + metadata={"help": "Whether to pin GPU memory for dataloaders."}, + ) + dataloader_persistent_workers: bool = field( + default=True, + metadata={"help": 
"Whether to keep dataloader workers alive across epochs."}, + ) + dataloader_prefetch_factor: int = field( + default=1, + metadata={"help": "Number of samples loaded in advance by each worker."}, + ) + dataloader_drop_last: bool = field( + default=False, + metadata={"help": "Whether to drop the last incomplete batch."}, + ) + dataloader_num_workers: int = field( + default=1, + metadata={"help": "Number of workers for the DataLoader."}, + ) + + +@dataclass +class PeftConfig: + """Configuration for PEFT (Parameter-Efficient Fine-Tuning) methods.""" + + lora_r: int = field( + default=8, + metadata={"help": "Lora attention dimension."}, + ) + lora_alpha: int = field( + default=16, + metadata={"help": "Lora alpha."}, + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout probability for Lora layers."}, + ) + target_modules: list[str] = field( + default_factory=lambda: ["q_proj", "v_proj"], + metadata={"help": "The modules to apply Lora to."}, + ) + bias: str = field( + default="none", + metadata={"help": "Bias type for Lora ('none', 'all', 'lora_only')."}, + ) + task_type: str = field( + default="CAUSAL_LM", + metadata={"help": "The task type for PEFT (e.g., 'CAUSAL_LM', 'SEQ_2_SEQ_LM')."}, + ) + peft_type: str = field( + default="LORA", + metadata={"help": "The PEFT method to use (e.g., 'LORA', 'IA3')."}, + ) + + +@dataclass +class ModelConfig: + """Configuration for models.""" + + model_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the pretrained model."}, + ) + model_type: str = field( + default="hf", + metadata={"help": "The type of model ('hf' for Hugging Face, 'custom' for custom models)."}, + ) + auto_class_name: str = field( + default="AutoModelForCausalLM", + metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."}, + ) + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to load the model in 4-bit quantization."}, + ) + use_peft: bool = field( + default=True, + metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."}, + ) + peft_config: Optional[PeftConfig] = field( + default_factory=PeftConfig, + metadata={"help": "Configuration for PEFT."}, + ) + use_cache: bool = field( + default=False, + metadata={"help": "Whether to use the past key/values in the model for faster decoding."}, + ) + attn_implementation: str = field( + default="sdpa", + metadata={"help": "The attention implementation to use (e.g., 'sdpa', 'eager')."}, + ) + device_map: Optional[str] = field( + default=None, + metadata={"help": "The device map to use for model distribution (e.g., 'auto')."}, + ) + + +@dataclass +class CallbackConfig: + """Configuration for callbacks.""" + + callbacks: Dict[str, Dict[str, Any]] = field( + default_factory=dict, + metadata={"help": "Dictionary of callback configurations, keyed by callback name."}, + ) + + +@dataclass +class GradientCheckpointingKwargs: + """Arguments for gradient checkpointing.""" + + preserve_rng_state: bool = field( + default=True, + metadata={"help": "Whether to preserve the RNG state when checkpointing."}, + ) + use_reenrant: bool = field( + default=False, + metadata={"help": "Whether to use reentrant gradient checkpointing."}, + ) + + +@dataclass +class DdpConfig: + """Arguments for Distributed Data Parallel (DDP) training.""" + + ddp_backend: str = field( + default="qccl", + metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."}, + ) + ddp_find_unused_parameters: bool = field( + 
default=True, + metadata={"help": "Whether to find unused parameters in DDP."}, + ) + ddp_bucket_cap_mb: Optional[int] = field( + default=25, + metadata={"help": "The bucket size in MB for DDP communication."}, + ) + ddp_broadcast_buffers: bool = field( + default=True, + metadata={"help": "Whether to broadcast buffers in DDP."}, + ) + ddp_timeout: int = field( + default=1800, + metadata={"help": "Timeout for DDP operations in seconds."}, + ) + + +@dataclass +class TrainingConfig: + """Configuration for training.""" + + type: str = field( + default="sft", + metadata={"help": "The type of training (e.g., 'sft' for Supervised Fine-Tuning)."}, + ) + output_dir: str = field( + default="./training_results", + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) + overwrite_output_dir: bool = field( + default=False, + metadata={"help": "Whether to overwrite the output directory."}, + ) + seed: int = field( + default=42, + metadata={"help": "Random seed for reproducibility."}, + ) + + do_eval: bool = field( + default=True, + metadata={"help": "Whether to run evaluation during training."}, + ) + eval_strategy: str = field( + default="epoch", + metadata={"help": "The evaluation strategy to use ('no', 'steps', 'epoch')."}, + ) + eval_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two evaluations."}, + ) + + per_device_train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + per_device_eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + num_train_epochs: int = field( + default=1, + metadata={"help": "Total number of training epochs to perform."}, + ) + max_steps: int = field( + default=-1, + metadata={"help": "If > 0: set total number of training steps to perform."}, + ) + + log_level: str = field( + default="info", + metadata={"help": "Set the verbosity level of the logs ('debug', 'info', 'warning', 'error')."}, + ) + log_on_each_node: bool = field( + default=True, + metadata={"help": "Whether to log on each node in a distributed setup."}, + ) + logging_strategy: str = field( + default="steps", + metadata={"help": "The logging strategy to use ('no', 'steps', 'epoch')."}, + ) + logging_steps: int = field( + default=10, + metadata={"help": "Number of update steps between two loggings."}, + ) + + save_strategy: str = field( + default="epoch", + metadata={"help": "The checkpoint save strategy to use ('no', 'steps', 'epoch')."}, + ) + save_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two checkpoints (if save_strategy is 'steps')."}, + ) + save_total_limit: int = field( + default=5, + metadata={"help": "Limit the total amount of checkpoints. 
Deletes older checkpoints to stay within limit."}, + ) + metric_for_best_model: str = field( + default="eval_loss", + metadata={"help": "The metric to use to compare two models ('eval_loss', etc.)."}, + ) + + dtype: str = field( + default="fp16", + metadata={"help": "The data type to use for training (e.g., 'fp16', 'bf16')."}, + ) + + gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Whether to use gradient checkpointing."}, + ) + gradient_checkpointing_kwargs: Optional[GradientCheckpointingKwargs] = field( + default_factory=GradientCheckpointingKwargs, + metadata={"help": "Arguments for gradient checkpointing."}, + ) + + torch_compile: bool = field( + default=True, + metadata={"help": "Whether to compile the model with `torch.compile`."}, + ) + include_tokens_per_second: bool = field( + default=True, + metadata={"help": "Whether to include tokens per second in logs."}, + ) + include_num_input_tokens_seen: bool = field( + default=True, + metadata={"help": "Whether to include the number of input tokens seen in logs."}, + ) + average_tokens_across_devices: bool = field( + default=True, + metadata={"help": "Whether to average tokens across devices in distributed training."}, + ) + + disable_tqdm: Optional[bool] = field( + default=None, + metadata={"help": "Whether to disable the tqdm progress bar."}, + ) + fsdp_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "FSDP configuration dictionary."}, + ) + deepspeed_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "DeepSpeed configuration dictionary."}, + ) + accelerator_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "Accelerate configuration dictionary."}, + ) + ddp_config: Optional[DdpConfig] = field( + default_factory=DdpConfig, + metadata={"help": "DDP configuration dictionary."}, + ) + use_cpu: Optional[bool] = field( + default=None, + metadata={"help": "Whether to explicitly run training on CPU."}, + ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Path to a checkpoint to resume training from."}, + ) + restore_callback_states_from_checkpoint: Optional[bool] = field( + default=None, + metadata={"help": "Whether to restore callback states from checkpoint."}, + ) + + +@dataclass +class MasterConfig: + """Main training configuration.""" + + model: ModelConfig = field(default_factory=ModelConfig, metadata={"help": "Configuration for the model."}) + + dataset: DatasetConfig = field(default_factory=DatasetConfig, metadata={"help": "Configuration for the dataset."}) + + optimizers: OptimizerConfig = field( + default_factory=OptimizerConfig, metadata={"help": "Configuration for optimizers."} + ) + + scheduler: SchedulerConfig = field( + default_factory=SchedulerConfig, metadata={"help": "Configuration for the learning rate scheduler."} + ) + + callbacks: CallbackConfig = field(default_factory=CallbackConfig, metadata={"help": "Configuration for callbacks."}) + + training: TrainingConfig = field( + default_factory=TrainingConfig, metadata={"help": "Configuration for training parameters."} + ) + + extra_params: Dict[str, Any] = field( + default_factory=dict, metadata={"help": "Additional top-level parameters not explicitly defined."} + ) + + +def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: + """Create argument parser for the new finetuning interface.""" + parser = HfArgumentParser(MasterConfig) + + if config_path: + config_path = os.path.abspath(config_path) + if not 
os.path.exists(config_path): + raise FileNotFoundError(f"Config file not found: {config_path}") + if not (config_path.endswith(".yaml") or config_path.endswith(".yml")): + raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}") + + try: + (master_config,) = parser.parse_yaml_file(yaml_file=config_path) + return master_config + except Exception as e: + raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") + + if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + master_config = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0] + else: + master_config = parser.parse_args_into_dataclasses() + + return master_config + + +class ConfigManager: + """Manages configuration loading, validation, and updates.""" + + def __init__(self, config: MasterConfig): + """ + Initialize ConfigManager with either: + - Path to config file (str or Path) + - Configuration dictionary + - None (creates empty config) + """ + self.config = config + + def load_config(self, config_path: Union[str, Path]) -> None: + """Load configuration from file.""" + config_path = Path(config_path) + + if not config_path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + if config_path.suffix.lower() in [".yaml", ".yml"]: + with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + elif config_path.suffix.lower() == ".json": + with open(config_path, "r") as f: + config_dict = json.load(f) + else: + raise ValueError(f"Unsupported configuration file format: {config_path.suffix}") + + self.update_config(config_dict) + + def update_config(self, config_dict: Dict[str, Any]) -> None: + """Update configuration with dictionary values.""" + for key, value in config_dict.items(): + if hasattr(self.config, key): + if isinstance(value, dict) and hasattr(getattr(self.config, key), "__dataclass_fields__"): + # Special handling for callbacks + if key in ["callbacks", "optimizers", "loss_functions"]: + nested_config = getattr(self.config, key) + for component_name, component_dict in value.items(): + if isinstance(component_dict, dict): + getattr(nested_config, key)[component_name] = component_dict + else: + getattr(nested_config, "extra_params")[component_name] = nested_config.extra_params[ + component_name + ] = component_dict + else: + # Update nested dataclass + nested_config = getattr(self.config, key) + for nested_key, nested_value in value.items(): + if hasattr(nested_config, nested_key): + setattr(getattr(self.config, key), nested_key, nested_value) + elif hasattr(nested_config, "extra_params"): + getattr(getattr(self.config, key), "extra_params")[nested_key] = nested_value + else: + setattr(self.config, key, value) + else: + # Store unknown parameters in extra_params + self.config.extra_params[key] = value + + def save_config(self, output_path: Union[str, Path]) -> None: + """Save current configuration to file.""" + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + config_dict = self.config + + if output_path.suffix.lower() in [".yaml", ".yml"]: + with open(output_path, "w") as f: + yaml.dump(config_dict, f, default_flow_style=False, indent=2) + elif output_path.suffix.lower() == ".json": + with open(output_path, "w") as f: + json.dump(config_dict, f, indent=2) + else: + raise ValueError(f"Unsupported output file format: {output_path.suffix}") + + def validate_config(self) -> None: + 
"""Validate configuration parameters.""" + errors = [] + + # Validate model configuration + if not self.config.model.model_name: + errors.append("Model name is required") + + # Validate dataset configuration + if not self.config.dataset.dataset_name: + errors.append("Dataset name is required") + + # Validate training parameters + if self.config.dataset.train_batch_size <= 0: + errors.append("Train batch size must be positive") + + if self.config.dataset.eval_batch_size <= 0: + errors.append("Validation batch size must be positive") + + if self.config.training.num_train_epochs <= 0: + errors.append("Number of epochs must be positive") + + if self.config.training.gradient_accumulation_steps <= 0: + errors.append("Gradient accumulation steps must be positive") + + # Validate device configuration + valid_devices = ["cpu", "cuda", "qaic"] + if self.config.training.device not in valid_devices: + errors.append(f"Device must be one of {valid_devices}") + + if errors: + raise ValueError("Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors)) + + def get_callback_config(self) -> Dict[str, Any]: + """Get callback configuration as dictionary.""" + return self.config.callbacks + + def get_optimizer_config(self) -> Dict[str, Any]: + """Get optimizer configuration as dictionary.""" + return self.config.optimizers + + def get_training_config(self) -> Dict[str, Any]: + """Get training configuration as dictionary.""" + return self.config.training + + def get_scheduler_config(self) -> Dict[str, Any]: + """Get scheduler configuration as dictionary.""" + return self.config.scheduler + + def get_dataset_config(self) -> Dict[str, Any]: + """Get dataset configuration as dictionary.""" + return self.config.dataset + + def get_model_config(self) -> Dict[str, Any]: + """Get model configuration as dictionary.""" + return self.config.model + + def to_dict(self) -> Dict[str, Any]: + """Convert configuration to dictionary.""" + return asdict(self.config) + + def __getattr__(self, name: str) -> Any: + """Allow direct access to config attributes.""" + if hasattr(self.config, name): + return getattr(self.config, name) + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + +def create_trainer_config(name: str, **dependencies) -> tuple: + """ + Create trainer configuration based on registered trainer modules. 
+ + Args: + name: Name of the trainer type + **dependencies: Any dependencies needed to configure the trainer + + Returns: + tuple: (trainer_class, args_class, additional_kwargs) + """ + config = registry.get_trainer_module(name) + + # Process required kwargs based on available dependencies + additional_kwargs = {} + for kwarg, default in config["required_kwargs"].items(): + if kwarg in dependencies: + additional_kwargs[kwarg] = dependencies[kwarg] + elif default != "REQUIRED": + additional_kwargs[kwarg] = default + + # Check for missing required arguments + for kwarg, default in config["required_kwargs"].items(): + if kwarg not in additional_kwargs and default == "REQUIRED": + raise ValueError(f"Required argument '{kwarg}' not provided for trainer '{name}'") + + return config["trainer_cls"], config["args_cls"], additional_kwargs diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml new file mode 100644 index 000000000..59d388bd3 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_config.yaml @@ -0,0 +1,117 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + load_in_4bit: false + use_peft: true + peft_config: + lora_r: 8 + lora_alpha: 16 + lora_dropout: 0.1 + target_modules: ["q_proj", "v_proj"] + bias: "none" # Options: none, all, lora_only + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + tokenizer_name: "HuggingFaceTB/SmolLM-135M" + dataset_type: "seq_completion" + # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M" + dataset_name: "knkarthick/samsum" + train_split: "train" + max_seq_length: 512 + split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided + test_split: "test" + group_by_length: True + num_workers: 4 + pin_memory: True + persistent_workers: True + prefetch_factor: 1 + drop_last: False + +# Training configuration +training: + type: "sft" + output_dir: "./training_results" + overwrite_output_dir: False + seed: 42 + + do_eval: True + eval_strategy: "epoch" + eval_steps: 100 + + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 1 + num_train_epochs: 1 + max_steps: -1 + + log_level: "info" + log_on_each_node: True + logging_strategy: "steps" + logging_steps: 10 + + save_strategy: "epoch" + save_steps: 100 # If 'save_strategy' is 'steps' then it will be used. 
+  save_total_limit: 5
+  metric_for_best_model: "eval_loss"
+
+  dtype: "fp16"
+  completion_only_loss: True
+  report_to: "trackio"
+
+  ddp_config:
+    ddp_backend: "qccl"
+    ddp_find_unused_parameters: False
+    ddp_bucket_cap_mb: 25
+    ddp_broadcast_buffers: null
+    ddp_timeout: 1800
+
+  # Uncomment below to explicitly run on CPU
+  use_cpu: False
+
+  gradient_checkpointing: False
+  gradient_checkpointing_kwargs:
+    preserve_rng_state: True
+    use_reentrant: False
+
+  torch_compile: True
+  include_tokens_per_second: True
+  include_num_input_tokens_seen: True
+  average_tokens_across_devices: True
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"
+  lr: 5e-5
+  weight_decay: 0.01
+
+
+# “linear” → transformers.get_linear_schedule_with_warmup
+# “cosine” → transformers.get_cosine_schedule_with_warmup
+# “cosine_with_restarts” → transformers.get_cosine_with_hard_restarts_schedule_with_warmup
+# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup
+# “constant” → transformers.get_constant_schedule
+# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup
+# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule
+
+scheduler:
+  scheduler_name: "cosine"
+  warmup_steps: 100 # warmup_steps or warmup_ratio
+  warmup_ratio: 0.1
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3
+    early_stopping_threshold: 0.001
+  tensorboard:
+
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
new file mode 100644
index 000000000..10105a33e
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -0,0 +1,50 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+from pathlib import Path
+
+import pytest
+
+from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments
+
+
+@pytest.fixture
+def config_path() -> Path:
+    here = Path(__file__).resolve().parent
+    return (here / "test_config.yaml").resolve()
+
+
+# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases."
+
+
+def test_config(config_path):
+    # parse the yaml file
+    master_config = parse_arguments(config_path)
+    config_manager = ConfigManager(master_config)
+    # Test that the config manager is initialized correctly
+    assert isinstance(config_manager, ConfigManager)
+
+    # Test that all required fields are present
+    missing = [
+        a
+        for a in ("model", "dataset", "optimizers", "scheduler", "callbacks", "training")
+        if not hasattr(config_manager, a)
+    ]
+    assert not missing, f"Missing attributes: {missing}"
+    trainer_config = config_manager.get_training_config()
+    assert all(attr in trainer_config for attr in ("output_dir", "per_device_train_batch_size", "num_train_epochs"))
+    dataset_config = config_manager.get_dataset_config()
+    assert all(attr in dataset_config for attr in ("dataset_type", "dataset_name", "tokenizer_name"))
+    model_config = config_manager.get_model_config()
+    assert all(attr in model_config for attr in ("model_type", "model_name", "use_peft"))
+    scheduler_config = config_manager.get_scheduler_config()
+    assert all(attr in scheduler_config for attr in ("scheduler_name",))
+    callback_config = config_manager.get_callback_config()
+    assert all(attr in callback_config for attr in ("early_stopping",))
+    optimizer_config = config_manager.get_optimizer_config()
+    assert all(attr in optimizer_config for attr in ("optimizer_name", "lr"))

From 28ec40b624ff80bd2d3041683caa44835989e189 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Tue, 9 Dec 2025 07:31:48 +0000
Subject: [PATCH 08/14] [QEff.finetuning] Adding config_manager and its test cases.

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/tests/test_config_manager.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
index 10105a33e..b3b9b0b24 100644
--- a/QEfficient/finetune/experimental/tests/test_config_manager.py
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -19,9 +19,6 @@ def config_path() -> Path:
     return (here / "test_config.yaml").resolve()
 
 
-# git commit -s -m "[QEff.finetuning] Adding config_manager and its test cases."
-
-
 def test_config(config_path):
     # parse the yaml file
     master_config = parse_arguments(config_path)

From 1f0a4df2ba4f0204847ec4b72ee3a486bb352c57 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Thu, 11 Dec 2025 07:26:49 +0000
Subject: [PATCH 09/14] [QEff.finetuning] Adding config_manager and its test_cases.
Signed-off-by: Tanisha Chawada --- .../experimental/core/config_manager.py | 233 +++++++++++++----- .../experimental/core/utils/profiler_utils.py | 88 ------- .../experimental/tests/test_config.yaml | 33 +-- .../experimental/tests/test_config_manager.py | 25 +- 4 files changed, 196 insertions(+), 183 deletions(-) diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 60ed4d4b6..b28c2e1e3 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -11,10 +11,9 @@ import json import os -import sys -from dataclasses import asdict, dataclass, field +from dataclasses import asdict, dataclass, field, fields, is_dataclass from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union import yaml from transformers.hf_argparser import HfArgumentParser @@ -257,7 +256,7 @@ class DdpConfig: metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."}, ) ddp_find_unused_parameters: bool = field( - default=True, + default=False, metadata={"help": "Whether to find unused parameters in DDP."}, ) ddp_bucket_cap_mb: Optional[int] = field( @@ -294,7 +293,10 @@ class TrainingConfig: default=42, metadata={"help": "Random seed for reproducibility."}, ) - + device: str = field( + default="qaic", + metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."}, + ) do_eval: bool = field( default=True, metadata={"help": "Whether to run evaluation during training."}, @@ -307,7 +309,6 @@ class TrainingConfig: default=100, metadata={"help": "Number of update steps between two evaluations."}, ) - per_device_train_batch_size: int = field( default=1, metadata={"help": "Batch size per device during training."}, @@ -381,10 +382,6 @@ class TrainingConfig: default=True, metadata={"help": "Whether to compile the model with `torch.compile`."}, ) - include_tokens_per_second: bool = field( - default=True, - metadata={"help": "Whether to include tokens per second in logs."}, - ) include_num_input_tokens_seen: bool = field( default=True, metadata={"help": "Whether to include the number of input tokens seen in logs."}, @@ -426,6 +423,14 @@ class TrainingConfig: default=None, metadata={"help": "Whether to restore callback states from checkpoint."}, ) + report_to: Optional[List[str]] = field( + default=None, + metadata={"help": "The list of integrations to report the results and logs to."}, + ) + completion_only_loss: Optional[bool] = field( + default=False, + metadata={"help": "Whether to compute loss only on completion tokens."}, + ) @dataclass @@ -455,7 +460,7 @@ class MasterConfig: ) -def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: +def parse_arguments(config_path: Optional[str] = None, args: Optional[List[str]] = None) -> MasterConfig: """Create argument parser for the new finetuning interface.""" parser = HfArgumentParser(MasterConfig) @@ -472,12 +477,15 @@ def parse_arguments(config_path: Optional[str] = None) -> MasterConfig: except Exception as e: raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") - if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- master_config = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1]))[0] + args = [] if args is None else args + # If a single positional YAML file was passed via args, parse it as YAML + if len(args) == 1 and (args[0].endswith(".yaml") or args[0].endswith(".yml")): + yaml_path = os.path.abspath(args[0]) + (master_config,) = parser.parse_yaml_file(yaml_file=yaml_path) else: - master_config = parser.parse_args_into_dataclasses() + (master_config,) = parser.parse_args_into_dataclasses(args=args) + master_config = asdict(master_config) + master_config = MasterConfig(**master_config) return master_config @@ -512,34 +520,58 @@ def load_config(self, config_path: Union[str, Path]) -> None: self.update_config(config_dict) + def _ensure_extra_params(self, obj) -> Dict[str, Any]: + """Ensure obj.extra_params exists and is a dict; return it.""" + ep = getattr(obj, "extra_params", None) + if ep is None: + setattr(obj, "extra_params", {}) + ep = obj.extra_params + if not isinstance(ep, dict): + raise TypeError("extra_params must be a dict.") + return ep + + def _stash_top_level_extra(self, section: str, nested_key: str, value: Any) -> None: + """Store unknown nested values under MasterConfig.extra_params['section.nested_key'].""" + ep = self._ensure_extra_params(self.config) + ep[f"{section}.{nested_key}"] = value + def update_config(self, config_dict: Dict[str, Any]) -> None: """Update configuration with dictionary values.""" + + SPECIAL_KEYS = {"callbacks"} + for key, value in config_dict.items(): if hasattr(self.config, key): - if isinstance(value, dict) and hasattr(getattr(self.config, key), "__dataclass_fields__"): - # Special handling for callbacks - if key in ["callbacks", "optimizers", "loss_functions"]: - nested_config = getattr(self.config, key) - for component_name, component_dict in value.items(): - if isinstance(component_dict, dict): - getattr(nested_config, key)[component_name] = component_dict - else: - getattr(nested_config, "extra_params")[component_name] = nested_config.extra_params[ - component_name - ] = component_dict + target = getattr(self.config, key) + + # Special handling for callbacks (dict inside CallbackConfig) + if key in SPECIAL_KEYS and isinstance(value, dict): + if is_dataclass(target) and hasattr(target, "callbacks") and isinstance(target.callbacks, dict): + for component_name, component_cfg in value.items(): + target.callbacks[component_name] = component_cfg + elif isinstance(target, dict): + target.update(value) else: - # Update nested dataclass - nested_config = getattr(self.config, key) - for nested_key, nested_value in value.items(): - if hasattr(nested_config, nested_key): - setattr(getattr(self.config, key), nested_key, nested_value) - elif hasattr(nested_config, "extra_params"): - getattr(getattr(self.config, key), "extra_params")[nested_key] = nested_value - else: - setattr(self.config, key, value) + self._stash_top_level_extra(key, "__all__", value) + continue + + if isinstance(value, dict) and is_dataclass(target): + known = {f.name for f in fields(target)} + for nested_key, nested_value in value.items(): + if nested_key in known: + setattr(target, nested_key, nested_value) + else: + self._stash_top_level_extra(key, nested_key, nested_value) + continue + + if isinstance(value, dict) and isinstance(target, dict): + target.update(value) + continue + setattr(self.config, key, value) + else: - # Store unknown parameters in extra_params - self.config.extra_params[key] = value + ep = self._ensure_extra_params(self.config) + ep[key] = value def 
save_config(self, output_path: Union[str, Path]) -> None: """Save current configuration to file.""" @@ -557,38 +589,105 @@ def save_config(self, output_path: Union[str, Path]) -> None: else: raise ValueError(f"Unsupported output file format: {output_path.suffix}") - def validate_config(self) -> None: - """Validate configuration parameters.""" - errors = [] - - # Validate model configuration - if not self.config.model.model_name: - errors.append("Model name is required") - - # Validate dataset configuration - if not self.config.dataset.dataset_name: - errors.append("Dataset name is required") - - # Validate training parameters - if self.config.dataset.train_batch_size <= 0: - errors.append("Train batch size must be positive") - - if self.config.dataset.eval_batch_size <= 0: - errors.append("Validation batch size must be positive") + def _push(self, errs: List[str], cond: bool, msg: str) -> None: + """Append msg to errs if cond is True.""" + if cond: + errs.append(msg) - if self.config.training.num_train_epochs <= 0: - errors.append("Number of epochs must be positive") - - if self.config.training.gradient_accumulation_steps <= 0: - errors.append("Gradient accumulation steps must be positive") - - # Validate device configuration + def validate_config(self) -> None: + """ + Validate configuration parameters for MasterConfig. + """ + errors: List[str] = [] + + cfg = self.config + model = getattr(cfg, "model", {}) + dataset = getattr(cfg, "dataset", {}) + training = getattr(cfg, "training", {}) + + # ---------- Model ---------- + self._push(errors, not model.get("model_name"), "model.model_name is required.") + + # PEFT validation + if model.get("use_peft"): + pc = model.get("peft_config", {}) + self._push(errors, not isinstance(pc, dict), "model.peft_config must be a dict when use_peft=True.") + if isinstance(pc, dict): + self._push( + errors, + not isinstance(pc.get("lora_r", 0), int) or pc.get("lora_r", 0) <= 0, + "model.peft_config.lora_r must be a positive integer.", + ) + self._push( + errors, + not isinstance(pc.get("lora_alpha", 0), int) or pc.get("lora_alpha", 0) <= 0, + "model.peft_config.lora_alpha must be a positive integer.", + ) + self._push( + errors, + not (0.0 <= float(pc.get("lora_dropout", 0.0)) < 1.0), + "model.peft_config.lora_dropout must be in [0,1).", + ) + + # ---------- Dataset ---------- + self._push(errors, not dataset.get("dataset_name"), "dataset.dataset_name is required.") + self._push(errors, not dataset.get("tokenizer_name"), "dataset.tokenizer_name is required.") + self._push(errors, dataset.get("max_seq_length", 0) <= 0, "dataset.max_seq_length must be positive.") + + # ---------- Training ---------- + # Batch sizes + self._push( + errors, + training.get("per_device_train_batch_size", 0) <= 0, + "training.per_device_train_batch_size must be positive.", + ) + self._push( + errors, + training.get("per_device_eval_batch_size", 0) <= 0, + "training.per_device_eval_batch_size must be positive.", + ) + + # Epochs / steps + n_epochs = training.get("num_train_epochs", 0) + max_steps = training.get("max_steps", -1) + self._push( + errors, + n_epochs <= 0 and max_steps <= 0, + "Either training.num_train_epochs > 0 or training.max_steps > 0 must be set.", + ) + + # Gradient accumulation + self._push( + errors, + training.get("gradient_accumulation_steps", 0) <= 0, + "training.gradient_accumulation_steps must be positive.", + ) + + # Logging / saving configs + self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.") + 
self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.") + + # Device valid_devices = ["cpu", "cuda", "qaic"] - if self.config.training.device not in valid_devices: - errors.append(f"Device must be one of {valid_devices}") - + training_device = training.get("device", None) + if training_device not in valid_devices: + self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.") + + # DDP config + ddp = training.get("ddp_config", {}) + if isinstance(ddp, dict): + backend = ddp.get("ddp_backend") + # Accept qccl for Qualcomm, nccl for CUDA, gloo for CPU + self._push( + errors, + backend not in {"qccl", "nccl", "gloo", None}, + "training.ddp_config.ddp_backend must be one of {'qccl','nccl','gloo'} or omitted.", + ) + + # ---------- Final ---------- if errors: - raise ValueError("Configuration validation failed:\n" + "\n".join(f"- {error}" for error in errors)) + # Join messages with bullet points for readability + raise ValueError("Configuration validation failed:\n- " + "\n- ".join(errors)) def get_callback_config(self) -> Dict[str, Any]: """Get callback configuration as dictionary.""" diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py index e24508e83..d647b73a6 100644 --- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py +++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py @@ -4,91 +4,3 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - - -from contextlib import nullcontext -from typing import ContextManager - -import torch - - -def get_op_verifier_ctx( - use_op_by_op_verifier: bool, - device_type: str, - dump_dir: str, - step: int, - ref_device: str = "cpu", - ref_dtype: torch.dtype = torch.float32, - atol: float = 1e-1, - rtol: float = 1e-5, - use_ref_output_on_mismatch: bool = True, -) -> ContextManager: - """Get the op-by-op verifier context manager when op-by-op verification is - enabled. It helps in debuging operator related issues by matching the - operator execution on qaic v/s cpu. This is meant only for qaic backend. - - Args: - use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier. - device_type (str): Device on which the model is being executed. - dump_dir (str): Directory to dump the op-by-op verification results. - step (int): Step number for which the op-by-op verification is to be performed. - ref_device (str, optional): Device to use as reference for verification. - Defaults to "cpu". - ref_dtype (torch.dtype, optional): Data type to use as reference - datatype for verification. Defaults to torch.float32. - atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1. - rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5. - use_ref_output_on_mismatch (bool, optional): If an operator has a - mismatch with respect to the reference device, use the reference - device outputs and continue rest of the verification. Defaults to True. - - Returns: - ContextManager: Instance of context manager used to verify the operators. - """ - if (not use_op_by_op_verifier) or ("qaic" in device_type): - return nullcontext() - - # Lazily imported qaic_debug when it is actually needed. 
- import torch_qaic.debug as qaic_debug - - filter_config = qaic_debug.DispatchFilterConfig.default(device_type) - dump_dir = dump_dir + "/mismatches/step_" + str(step) - return qaic_debug.OpByOpVerifierMode( - ref_device=ref_device, - ref_dtype=ref_dtype, - atol=atol, - rtol=rtol, - use_ref_output_on_mismatch=use_ref_output_on_mismatch, - filter_config=filter_config, - dump_root_dir=dump_dir, - ) - - -def init_qaic_profiling(use_profiler: bool, device_type: str) -> None: - """Initialize the qaic profiling tool. Note: The profiler is only works - for qaic backend. - - Args: - use_profiler (bool): Boolean flag to enable profiler. - device_type (str): Device on which the model is being executed. - """ - if (use_profiler) and ("qaic" in device_type): - # Lazily imported qaic's qaic_profile when it is actually needed. - import torch_qaic.profile as qaic_profile - - qaic_profile.start_profiling(device_type, 1) - - -def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None: - """Stop the qaic profiling tool. Note: The profiler is only works - for qaic backend. - - Args: - use_profiler (bool): Boolean flag to enable profiler. - device_type (str): Device on which the model is being executed. - """ - if (use_profiler) and ("qaic" in device_type): - # Lazily imported qaic's qaic_profile when it is actually needed. - import torch_qaic.profile as qaic_profile - - qaic_profile.stop_profiling(device_type) diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml index 59d388bd3..e97e99d58 100644 --- a/QEfficient/finetune/experimental/tests/test_config.yaml +++ b/QEfficient/finetune/experimental/tests/test_config.yaml @@ -5,9 +5,9 @@ # # ----------------------------------------------------------------------------- -# Model configuration +# model configuration model: - model_type: "hf" # Hugging Face model + model_type: "hf" auto_class_name: "AutoModelForCausalLM" model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name load_in_4bit: false @@ -17,9 +17,9 @@ model: lora_alpha: 16 lora_dropout: 0.1 target_modules: ["q_proj", "v_proj"] - bias: "none" # Options: none, all, lora_only - task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. - peft_type: "LORA" # Options: LORA, IA3, etc. + bias: "none" + task_type: "CAUSAL_LM" + peft_type: "LORA" # Dataset configuration dataset: @@ -33,10 +33,10 @@ dataset: test_split: "test" group_by_length: True num_workers: 4 - pin_memory: True - persistent_workers: True - prefetch_factor: 1 - drop_last: False + dataloader_pin_memory: True + dataloader_persistent_workers: True + dataloader_prefetch_factor: 1 + dataloader_drop_last: False # Training configuration training: @@ -44,7 +44,7 @@ training: output_dir: "./training_results" overwrite_output_dir: False seed: 42 - + device: "qaic" do_eval: True eval_strategy: "epoch" eval_steps: 100 @@ -61,7 +61,6 @@ training: logging_steps: 10 save_strategy: "epoch" - save_steps: 100 # If 'save_strategy' is 'steps' then it will be used. 
   save_total_limit: 5
   metric_for_best_model: "eval_loss"
 
@@ -76,7 +75,6 @@
     ddp_broadcast_buffers: null
     ddp_timeout: 1800
 
-  # Set to True to explicitly run on CPU
   use_cpu: False
 
   gradient_checkpointing: False
@@ -85,7 +83,6 @@
     use_reentrant: False
 
   torch_compile: True
-  include_tokens_per_second: True
   include_num_input_tokens_seen: True
   average_tokens_across_devices: True
 
@@ -95,19 +92,9 @@ optimizers:
   lr: 5e-5
   weight_decay: 0.01
 
-
-# “linear” → transformers.get_linear_schedule_with_warmup
-# “cosine” → transformers.get_cosine_schedule_with_warmup
-# “cosine_with_restarts” → transformers.get_cosine_with_hard_restarts_schedule_with_warmup
-# “polynomial” → transformers.get_polynomial_decay_schedule_with_warmup
-# “constant” → transformers.get_constant_schedule
-# “constant_with_warmup” → transformers.get_constant_schedule_with_warmup
-# “inverse_sqrt” → transformers.get_inverse_sqrt_schedule
-
 scheduler:
   scheduler_name: "cosine"
   warmup_steps: 100 # warmup_steps or warmup_ratio
-  warmup_ratio: 0.1
 
 callbacks:
   early_stopping:
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
index b3b9b0b24..fd2abfd48 100644
--- a/QEfficient/finetune/experimental/tests/test_config_manager.py
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -20,11 +20,14 @@ def config_path() -> Path:
 
 
 def test_config(config_path):
-    # parse the yaml file
-    master_config = parse_arguments(config_path)
+    master_config = parse_arguments(args=[])
     config_manager = ConfigManager(master_config)
-    # Test that the config manager is initialized correctly
     assert isinstance(config_manager, ConfigManager)
+    config_manager.load_config(config_path)
+    try:
+        config_manager.validate_config()
+    except Exception as e:
+        pytest.fail(f"Config validation failed with error: {e}")
 
     # Test that all required fields are present
     missing = [
@@ -34,14 +37,26 @@ def test_config(config_path):
     ]
     assert not missing, f"Missing attributes: {missing}"
     trainer_config = config_manager.get_training_config()
-    assert all(attr in trainer_config for attr in ("output_dir", "per_device_train_batch_size", "num_train_epochs"))
+    assert trainer_config is not None
+    assert isinstance(trainer_config, dict)
+    assert all(attr in trainer_config for attr in ("output_dir", "per_device_train_batch_size", "num_train_epochs", "ddp_config"))
     dataset_config = config_manager.get_dataset_config()
+    assert dataset_config is not None
+    assert isinstance(dataset_config, dict)
     assert all(attr in dataset_config for attr in ("dataset_type", "dataset_name", "tokenizer_name"))
     model_config = config_manager.get_model_config()
-    assert all(attr in model_config for attr in ("model_type", "model_name", "use_peft"))
+    assert model_config is not None
+    assert isinstance(model_config, dict)
+    assert all(attr in model_config for attr in ("model_type", "model_name", "use_peft", "peft_config"))
     scheduler_config = config_manager.get_scheduler_config()
+    assert scheduler_config is not None
+    assert isinstance(scheduler_config, dict)
     assert all(attr in scheduler_config for attr in ("scheduler_name",))
     callback_config = config_manager.get_callback_config()
+    assert callback_config is not None
+    assert isinstance(callback_config, dict)
     assert all(attr in callback_config for attr in ("early_stopping",))
     optimizer_config = config_manager.get_optimizer_config()
+    assert optimizer_config is not None
+    assert isinstance(optimizer_config, dict)
     assert all(attr in optimizer_config for attr in
("optimizer_name", "lr")) From 2cd53dbfc4c181949e5a0267f76072b44a24e04f Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 11 Dec 2025 13:01:15 +0530 Subject: [PATCH 10/14] Delete QEfficient/finetune/experimental/tests/test_optimizer.py Signed-off-by: Tanisha Chawada --- .../experimental/tests/test_optimizer.py | 93 ------------------- 1 file changed, 93 deletions(-) delete mode 100644 QEfficient/finetune/experimental/tests/test_optimizer.py diff --git a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py deleted file mode 100644 index d9225f6de..000000000 --- a/QEfficient/finetune/experimental/tests/test_optimizer.py +++ /dev/null @@ -1,93 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import pytest -import torch.nn as nn -import torch.optim as optim - -from QEfficient.finetune.experimental.core.component_registry import registry -from QEfficient.finetune.experimental.core.optimizer import get_optimizer, get_optimizer_cls - -OPTIMIZER_CONFIGS = { - "Adam": { - "optimizer_name": "Adam", - "opt_cls": optim.Adam, - "lr": 1e-4, - "weight_decay": 0.01, - "betas": (0.9, 0.999), - "eps": 1e-8, - "amsgrad": False, - }, - "AdamW": { - "optimizer_name": "AdamW", - "opt_cls": optim.AdamW, - "lr": 1e-4, - "weight_decay": 0.01, - "betas": (0.9, 0.999), - "eps": 1e-8, - "amsgrad": False, - }, - "SGD": { - "optimizer_name": "SGD", - "opt_cls": optim.SGD, - "lr": 1e-4, - "momentum": 0.9, - "weight_decay": 0.01, - "dampening": 0.0, - "nesterov": False, - }, - "RMSprop": { - "optimizer_name": "RMSprop", - "opt_cls": optim.RMSprop, - }, -} - -REGISTRY_CONFIG = { - "RMSprop": { - "optimizer_name": "RMSprop", - "opt_cls": optim.RMSprop, - }, -} - - -@pytest.fixture -def dummy_model(): - return nn.Sequential( - nn.Linear(10, 5), - nn.ReLU(), - nn.Linear(5, 1), - ) - - -@pytest.mark.parametrize("opt_name", OPTIMIZER_CONFIGS.keys()) -def test_optimizers(opt_name, dummy_model): - """Test that all registered optimizers can be created with their configs.""" - config = OPTIMIZER_CONFIGS[opt_name] - config.pop("opt_cls") - try: - optimizer_class_and_kwargs = get_optimizer(config) - assert optimizer_class_and_kwargs is not None - except ValueError as e: - assert "Unknown optimizer" in str(e) - return - optimizer_class = optimizer_class_and_kwargs[0] - opt_inst = optimizer_class(dummy_model.parameters(), **optimizer_class_and_kwargs[1]) - assert isinstance(opt_inst, optim.Optimizer) - assert len(list(opt_inst.param_groups)) == 1 - - for key in ["lr", "weight_decay", "betas", "eps", "momentum", "dampening", "nesterov", "amsgrad"]: - if key in config: - assert opt_inst.param_groups[0][key] == config[key], f"{key} mismatch" - - -@pytest.mark.parametrize("opt_name, opt_cls", REGISTRY_CONFIG.items()) -def test_registered_optimizer(opt_name, opt_cls): - """Test that the optimizer registerd correctly.""" - registry.optimizer(opt_name)(opt_cls) - optimizer_class = get_optimizer_cls(opt_name) - assert optimizer_class is not None - assert optimizer_class == opt_cls From c53d7b370c1762ce32323de00d67e5667535c4e8 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 11 Dec 2025 13:01:39 +0530 Subject: [PATCH 11/14] Delete QEfficient/finetune/experimental/tests/test_callback.py Signed-off-by: Tanisha Chawada --- 
.../experimental/tests/test_callback.py | 66 ------------------- 1 file changed, 66 deletions(-) delete mode 100644 QEfficient/finetune/experimental/tests/test_callback.py diff --git a/QEfficient/finetune/experimental/tests/test_callback.py b/QEfficient/finetune/experimental/tests/test_callback.py deleted file mode 100644 index 18ec3978d..000000000 --- a/QEfficient/finetune/experimental/tests/test_callback.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import pytest -from transformers import TrainerCallback - -from QEfficient.finetune.experimental.core.callbacks import create_callbacks -from QEfficient.finetune.experimental.core.component_registry import registry - - -class ModelSummaryCallback(TrainerCallback): - def __init__(self): - pass - - -# Setup test data -CALLBACK_CONFIGS = { - "early_stopping": { - "name": "early_stopping", - "early_stopping_patience": 3, - "early_stopping_threshold": 0.001, - }, - "tensorboard": {"name": "tensorboard", "tb_writer": "SummaryWriter"}, - "model_summary": { - "name": "model_summary", - "max_depth": 1, - }, -} - -REGISTRY_CALLBACK_CONFIGS = { - "model_summary": { - "name": "model_summary", - "max_depth": 1, - "callback_class": ModelSummaryCallback, - }, -} - - -@pytest.mark.parametrize("callback_name", CALLBACK_CONFIGS.keys()) -def test_callbacks(callback_name): - """Test that registered callbacks that can be created with their configs.""" - # Create callbacks using the factory - config = CALLBACK_CONFIGS[callback_name] - try: - callback_inst = create_callbacks(**config) - except ValueError as e: - assert "Unknown callback" in str(e) - return - if hasattr(callback_inst, "callback"): - assert callback_inst.callback is not None - else: - assert callback_inst is not None - assert isinstance(callback_inst, TrainerCallback) - - -@pytest.mark.parametrize("callback_name,callback_class", REGISTRY_CALLBACK_CONFIGS.items()) -def test_callbacks_registery(callback_name, callback_class): - """Test that a callback registered correctly.""" - registry.callback(callback_name)(callback_class) - callback = registry.get_callback(callback_name) - assert callback is not None - assert callback == callback_class From b75e8b78e5551835f10df9dd48c52a2363346b02 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 11 Dec 2025 13:02:18 +0530 Subject: [PATCH 12/14] Update optimizer.py Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/optimizer.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index 2f77ce285..d647b73a6 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -4,44 +4,3 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - -""" -Optimizer components for the training system. 
-""" - -from typing import Type - -import torch.optim as optim -from torch.optim import Optimizer - -from QEfficient.finetune.experimental.core.component_registry import registry - -registry.optimizer("Adam")(optim.Adam) -registry.optimizer("AdamW")(optim.AdamW) -registry.optimizer("SGD")(optim.SGD) - - -def get_optimizer_cls(optimizer_name: str) -> Type[Optimizer]: - """ - Get optimizer class from registry. - Args: optimizer_name: Name of the optimizer to retrieve. - Returns: Optimizer class. - Raises: ValueError: If optimizer name is not found in registry. - """ - optimizer_cls = registry.get_optimizer(optimizer_name) - if optimizer_cls is None: - raise ValueError(f"Unknown optimizer: {optimizer_name}") - return optimizer_cls - - -def get_optimizer(opt_config): - """ - Create optimizer from config. - Args: opt_config: Dictionary containing optimizer configuration. - Returns: Tuple of optimizer class and its arguments. - """ - opt_name = opt_config.pop("optimizer_name") - opt_cls = get_optimizer_cls(opt_name) - opt_config["lr"] = float(opt_config["lr"]) - optimizer_cls_and_kwargs = (opt_cls, opt_config) - return optimizer_cls_and_kwargs From 13e327434c4243310b3cca38298138ac5a929d55 Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 11 Dec 2025 13:03:17 +0530 Subject: [PATCH 13/14] Update callbacks.py Signed-off-by: Tanisha Chawada --- .../finetune/experimental/core/callbacks.py | 199 ------------------ 1 file changed, 199 deletions(-) diff --git a/QEfficient/finetune/experimental/core/callbacks.py b/QEfficient/finetune/experimental/core/callbacks.py index 30659e3bb..d647b73a6 100644 --- a/QEfficient/finetune/experimental/core/callbacks.py +++ b/QEfficient/finetune/experimental/core/callbacks.py @@ -4,202 +4,3 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- - -import json -import os -from typing import Any, Dict, Optional - -from transformers import ( - DefaultFlowCallback, - EarlyStoppingCallback, - PrinterCallback, - ProgressCallback, - TrainingArguments, -) -from transformers.integrations.integration_utils import TensorBoardCallback -from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState - -from QEfficient.finetune.experimental.core.component_registry import registry -from QEfficient.finetune.experimental.core.utils.profiler_utils import ( - get_op_verifier_ctx, - init_qaic_profiling, - stop_qaic_profiling, -) - -registry.callback("early_stopping")(EarlyStoppingCallback) -registry.callback("printer")(PrinterCallback) -registry.callback("default_flow")(DefaultFlowCallback) -registry.callback("tensorboard")(TensorBoardCallback) - - -@registry.callback("enhanced_progressbar") -class EnhancedProgressCallback(ProgressCallback): - """ - A [`TrainerCallback`] that displays the progress of training or evaluation. - You can modify `max_str_len` to control how long strings are truncated when logging. - """ - - def __init__(self, *args, **kwargs): - """ - Initialize the callback with optional max_str_len parameter to control string truncation length. - - Args: - max_str_len (`int`): - Maximum length of strings to display in logs. - Longer strings will be truncated with a message. 
- """ - super().__init__(*args, **kwargs) - - def on_train_begin(self, args, state, control, **kwargs): - """Set progress bar description at the start of training.""" - super().on_train_begin(args, state, control, **kwargs) - if self.training_bar is not None: - self.training_bar.set_description("Training Progress") - - def on_log(self, args, state, control, logs=None, **kwargs): - """ - Override the default `on_log` behavior during training to display - the current epoch number, loss, and learning rate in the logs. - """ - if state.is_world_process_zero and self.training_bar is not None: - # make a shallow copy of logs so we can mutate the fields copied - # but avoid doing any value pickling. - shallow_logs = {} - for k, v in logs.items(): - if isinstance(v, str) and len(v) > self.max_str_len: - shallow_logs[k] = ( - f"[String too long to display, length: {len(v)} > {self.max_str_len}. " - "Consider increasing `max_str_len` if needed.]" - ) - else: - shallow_logs[k] = v - _ = shallow_logs.pop("total_flos", None) - # round numbers so that it looks better in console - if "epoch" in shallow_logs: - shallow_logs["epoch"] = round(shallow_logs["epoch"], 2) - - updated_dict = {} - if "epoch" in shallow_logs: - updated_dict["epoch"] = shallow_logs["epoch"] - if "loss" in shallow_logs: - updated_dict["loss"] = shallow_logs["loss"] - if "learning_rate" in shallow_logs: - updated_dict["lr"] = shallow_logs["learning_rate"] - self.training_bar.set_postfix(updated_dict) - - -@registry.callback("json_logger") -class JSONLoggerCallback(TrainerCallback): - """ - A [`TrainerCallback`] that logs training and evaluation metrics to a JSON file. - """ - - def __init__(self, log_path=None, *args, **kwargs): - """ - Initialize the callback with the path to the JSON log file. - - Args: - log_path (`str`): - Path to the jsonl file where logs will be saved. - """ - super().__init__(*args, **kwargs) - if log_path is None: - log_path = os.path.join(os.environ.get("OUTPUT_DIR", "./"), "training_logs.jsonl") - self.log_path = log_path - # Ensure the log file is created and empty - with open(self.log_path, "w") as _: - pass - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs: Optional[Dict] = None, - **kwargs, - ): - """Append sanitized log metrics (including global_step) to a JSONL file.""" - if logs is None: - return - logs.pop("entropy", None) - logs.pop("mean_token_accuracy", None) - if state.global_step: - logs["global_step"] = state.global_step - if logs is not None: - with open(self.log_path, "a") as f: - json_line = json.dumps(logs, separators=(",", ":")) - f.write(json_line + "\n") - - -@registry.callback("qaic_profiler_callback") -class QAICProfilerCallback(TrainerCallback): - """Callback to profile QAIC devices over a specified training step range.""" - - def __init__(self, *args, **kwargs): - """ - Initialize QAIC profiler settings (start/end steps and target device IDs). - """ - - self.start_step = kwargs.get("start_step", -1) - self.end_step = kwargs.get("end_step", -1) - self.device_ids = kwargs.get("device_ids", [0]) - - def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - """ - Event called at the beginning of a training step. If using gradient accumulation, one training step might take - several inputs. 
-        several inputs.
-        """
-        if state.global_step == self.start_step:
-            for device_id in self.device_ids:
-                init_qaic_profiling(True, f"qaic:{device_id}")
-        elif state.global_step == self.end_step:
-            for device_id in self.device_ids:
-                stop_qaic_profiling(True, f"qaic:{device_id}")
-
-
-@registry.callback("qaic_op_by_op_verifier_callback")
-class QAICOpByOpVerifierCallback(TrainerCallback):
-    """Callback to verify QAIC operations step-by-step during a specified training range."""
-
-    def __init__(self, *args, **kwargs):
-        """ "
-        Initialize QAIC Op-by-Op verifier callback with profiling and tolerance settings.
-        """
-        self.start_step = kwargs.get("start_step", -1)
-        self.end_step = kwargs.get("end_step", -1)
-        self.trace_dir = kwargs.get("trace_dir", "qaic_op_by_op_traces")
-        self.atol = kwargs.get("atol", 1e-1)
-        self.rtol = kwargs.get("rtol", 1e-5)
-
-    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
-        """
-        Event called at the beginning of a training step. If using gradient accumulation, one training step might take
-        several inputs.
-        """
-        if self.start_step <= state.global_step < self.end_step:
-            self.op_verifier_ctx_step = get_op_verifier_ctx(
-                use_op_by_op_verifier=True,
-                device_type="qaic",
-                dump_dir=self.trace_dir,
-                step=state.global_step,
-                atol=self.atol,
-                rtol=self.rtol,
-            )
-            self.op_verifier_ctx_step.__enter__()
-
-    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
-        """
-        Event called at the end of a training step. If using gradient accumulation, one training step might take
-        several inputs.
-        """
-        if self.start_step <= state.global_step < self.end_step:
-            if self.op_verifier_ctx_step is not None:
-                self.op_verifier_ctx_step.__exit__(None, None, None)
-
-
-def create_callbacks(name: str, **kwargs) -> Any:
-    """Create a callback instance."""
-    callback_class = registry.get_callback(name)
-    if callback_class is None:
-        raise ValueError(f"Unknown callback: {name}. Available: {registry.list_callbacks()}")
-    return callback_class(**kwargs)

From 54c3bbaf32cb5e71cc0f7740713eaff2036d7daf Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Thu, 11 Dec 2025 14:44:24 +0530
Subject: [PATCH 14/14] Update config_manager.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/core/config_manager.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index b28c2e1e3..244967f39 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -602,6 +602,7 @@ def validate_config(self) -> None:
 
         cfg = self.config
         model = getattr(cfg, "model", {})
+        optimizers = getattr(cfg, "optimizers", {})
         dataset = getattr(cfg, "dataset", {})
         training = getattr(cfg, "training", {})
 
@@ -683,7 +684,8 @@ def validate_config(self) -> None:
                 backend not in {"qccl", "nccl", "gloo", None},
                 "training.ddp_config.ddp_backend must be one of {'qccl','nccl','gloo'} or omitted.",
             )
-
+        # ---------- Optimizers ----------
+        self._push(errors, float(optimizers.get("lr", 0)) <= 0, "optimizers.lr must be positive.")
         # ---------- Final ----------
         if errors:
             # Join messages with bullet points for readability
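
Taken together, the config patches in this series are driven roughly as follows. This is
a minimal sketch rather than part of the patches themselves: the YAML path is
hypothetical, and any file shaped like tests/test_config.yaml should work.

    from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments

    # Build a MasterConfig from the dataclass defaults (no CLI args), then overlay YAML values.
    master_config = parse_arguments(args=[])
    config_manager = ConfigManager(master_config)
    config_manager.load_config("path/to/my_config.yaml")  # hypothetical path

    # validate_config() accumulates every failed check and raises a single ValueError
    # listing them as "- <message>" bullets; it returns nothing when the config is valid.
    config_manager.validate_config()

    # Section accessors return plain dicts (MasterConfig is round-tripped through
    # asdict() inside parse_arguments), so presence checks use key membership.
    training_config = config_manager.get_training_config()
    optimizer_config = config_manager.get_optimizer_config()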