From f44e5d0fd5bd2f2498de390e16f862f0c7ca59ed Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Wed, 18 Mar 2026 13:22:37 +0530 Subject: [PATCH 01/23] [QEff. Finetune]: Added logger and its test cases. (#644) (#868) - Added a logger which will log onto console and file. This code is similar to existing QEff. Finetuning logger code. - Also added dist_utils which serves as utility code when dealing with distributed training. - Added logger test cases for sanity checks. --------- Signed-off-by: Meet Patel Signed-off-by: Sharvari Medhe Signed-off-by: Ann Kuruvilla --- .../finetune/experimental/core/logger.py | 170 +++++++++++++ .../experimental/core/utils/dist_utils.py | 33 +++ .../experimental/tests/test_logger.py | 233 ++++++++++++++++++ 3 files changed, 436 insertions(+) create mode 100644 QEfficient/finetune/experimental/core/logger.py create mode 100644 QEfficient/finetune/experimental/tests/test_logger.py diff --git a/QEfficient/finetune/experimental/core/logger.py b/QEfficient/finetune/experimental/core/logger.py new file mode 100644 index 0000000000..a1b9c771f6 --- /dev/null +++ b/QEfficient/finetune/experimental/core/logger.py @@ -0,0 +1,170 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import logging +import sys +from pathlib import Path +from typing import Optional + +from transformers.utils.logging import get_logger as hf_get_logger + +from QEfficient.finetune.experimental.core.utils.dist_utils import get_local_rank + +# ----------------------------------------------------------------------------- +# Logger usage: +# Initialize logger: +# logger = Logger("my_logger", log_file="logs/output.log", level=logging.DEBUG) +# Log messages: +# logger.info("This is an info message") +# logger.error("This is an error message") +# logger.log_rank_zero("This message is logged only on rank 0") +# logger.log_exception("An error occurred", exception, raise_exception=False) +# Attach file handler later if needed: +# logger.prepare_for_logs(output_dir="logs", log_level="DEBUG") +# ----------------------------------------------------------------------------- + + +class Logger: + """Custom logger with console and file logging capabilities.""" + + def __init__( + self, + name: str = "transformers", # We are using "transformers" as default to align with HF logs + log_file: Optional[str] = None, + level: int = logging.INFO, + ): + """ + Initialize the logger. 
+ + Args: + name: Logger name + log_file: Path to log file (if None, log only to console) + level: Logging level + """ + self.logger = hf_get_logger(name) + self.logger.setLevel(level) + + # Clear any existing handlers + self.logger.handlers.clear() + + # Create formatter + self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(level) + console_handler.setFormatter(self.formatter) + self.logger.addHandler(console_handler) + + # File handler (if log_file is provided) + if log_file: + # Create directory if it doesn't exist + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(level) + file_handler.setFormatter(self.formatter) + self.logger.addHandler(file_handler) + + def debug(self, message: str) -> None: + """Log debug message.""" + self.logger.debug(message) + + def info(self, message: str) -> None: + """Log info message.""" + self.logger.info(message) + + def warning(self, message: str) -> None: + """Log warning message.""" + self.logger.warning(message) + + def error(self, message: str) -> None: + """Log error message.""" + self.logger.error(message) + + def critical(self, message: str) -> None: + """Log critical message.""" + self.logger.critical(message) + + def log_rank_zero(self, message: str, level: int = logging.INFO) -> None: + """ + Log message only on rank 0 process. + + Args: + message: Message to log + level: Logging level + """ + if get_local_rank() == 0: + self.logger.log(level, message) + + def log_exception(self, message: str, exception: Exception, raise_exception: bool = True) -> None: + """ + Log exception message and optionally raise the exception. + + Args: + message: Custom message to log + exception: Exception to log + raise_exception: Whether to raise the exception after logging + """ + error_message = f"{message}: {str(exception)}" + self.logger.error(error_message) + + if raise_exception: + raise exception + + def prepare_for_logs(self, output_dir: Optional[str] = None, log_level: str = "INFO") -> None: + """ + Prepare existing logger to log to both console and file with specified + output directory and log level. + + Args: + output_dir: Output directory for logs + log_level: Logging level as string + """ + # Convert string log level to logging constant + level = getattr(logging, log_level.upper(), logging.INFO) + self.logger.setLevel(level) + + # Update existing handlers' levels + for handler in self.logger.handlers: + handler.setLevel(level) + + # Add file handler if saving metrics + if output_dir: + log_file = Path(output_dir) / "training.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + + # Check if file handler already exists + file_handler_exists = any(isinstance(handler, logging.FileHandler) for handler in self.logger.handlers) + + if not file_handler_exists: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(level) + file_handler.setFormatter(self.formatter) + self.logger.addHandler(file_handler) + + +# Global logger instance +_logger: Optional[Logger] = None + + +def get_logger(log_file: Optional[str] = None) -> Logger: + """ + Get or create a logger instance. 
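+    The logger is created once; subsequent calls return the cached instance, so the
+    log_file argument is only honored on the first call.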
+ + Args: + log_file: Path to log file (if None, log only to console) + + Returns: + Logger instance + """ + global _logger + if _logger is None: + _logger = Logger(log_file=log_file) + return _logger diff --git a/QEfficient/finetune/experimental/core/utils/dist_utils.py b/QEfficient/finetune/experimental/core/utils/dist_utils.py index d647b73a65..aed88862d8 100644 --- a/QEfficient/finetune/experimental/core/utils/dist_utils.py +++ b/QEfficient/finetune/experimental/core/utils/dist_utils.py @@ -4,3 +4,36 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import torch.distributed as dist + + +def is_dist_available_and_initialized() -> bool: + """Check if distributed training is available and initialized.""" + return dist.is_available() and dist.is_initialized() + + +def get_rank() -> int: + """Return the global rank of the current process, else 0.""" + if not is_dist_available_and_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """Return the local rank of the current process on its node, else 0.""" + if not is_dist_available_and_initialized(): + return 0 + return dist.get_node_local_rank() + + +def get_world_size() -> int: + """Get the total number of processes in distributed training.""" + if not is_dist_available_and_initialized(): + return 1 + return dist.get_world_size() + + +def is_main_process() -> bool: + """Check if the current process is the main process (rank 0).""" + return get_rank() == 0 diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py new file mode 100644 index 0000000000..0af0c8b512 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_logger.py @@ -0,0 +1,233 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import logging +from unittest.mock import patch + +import pytest + +from QEfficient.finetune.experimental.core.logger import Logger, get_logger + + +class TestLogger: + def setup_method(self): + """Reset the global logger before each test method""" + import QEfficient.finetune.experimental.core.logger as logger_module + + logger_module._logger = None + + def test_init_console_only(self): + """Test logger initialization with console-only output""" + logger = Logger("test_logger") + + # Check logger attributes + assert logger.logger.name == "test_logger" + assert logger.logger.level == logging.INFO + + # Check handlers - should have console handler only + assert len(logger.logger.handlers) == 1 # Only console handler + assert isinstance(logger.logger.handlers[0], logging.StreamHandler) + + def test_init_with_file(self, tmp_path): + """Test logger initialization with file output""" + log_file = tmp_path / "test.log" + logger = Logger("file_test_logger", str(log_file)) + + # Check handlers - should have both console and file handlers + assert len(logger.logger.handlers) == 2 # Console + file handler + assert isinstance(logger.logger.handlers[0], logging.StreamHandler) + assert isinstance(logger.logger.handlers[1], logging.FileHandler) + + # Check file creation + assert log_file.exists() + + def test_log_levels(self, caplog): + """Test all log levels work correctly""" + logger = Logger("level_test_logger", level=logging.DEBUG) + + with caplog.at_level(logging.DEBUG): + logger.debug("Debug message") + logger.info("Info message") + logger.warning("Warning message") + logger.error("Error message") + logger.critical("Critical message") + + # Check all messages were logged + assert "Debug message" in caplog.text + assert "Info message" in caplog.text + assert "Warning message" in caplog.text + assert "Error message" in caplog.text + assert "Critical message" in caplog.text + + @patch("QEfficient.finetune.experimental.core.logger.get_local_rank") + def test_log_rank_zero_positive_case(self, mock_get_local_rank, caplog): + """Test rank zero logging functionality""" + mock_get_local_rank.return_value = 0 + logger = Logger("rank_test_logger") + + with caplog.at_level(logging.INFO): + logger.log_rank_zero("Rank zero message") + + assert "Rank zero message" in caplog.text + + @patch("QEfficient.finetune.experimental.core.logger.get_local_rank") + def test_log_rank_zero_negative_case(self, mock_get_local_rank, caplog): + """Test to verify that only rank‑zero messages are logged""" + mock_get_local_rank.return_value = 1 + logger = Logger("rank_test_logger") + + with caplog.at_level(logging.INFO): + logger.log_rank_zero("Should not appear") + + assert "Should not appear" not in caplog.text + + def test_log_exception_raise(self, caplog): + """Test exception logging with raising""" + logger = Logger("exception_test_logger") + + with pytest.raises(ValueError), caplog.at_level(logging.ERROR): + logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=True) + + # The actual logged message is "Custom error: Test exception" + # But the exception itself contains just "Test exception" + assert "Custom error: Test exception" in caplog.text + + def test_log_exception_no_raise(self, caplog): + """Test exception logging without raising""" + logger = Logger("exception_test_logger") + + with caplog.at_level(logging.ERROR): + logger.log_exception("Custom error", 
ValueError("Test exception"), raise_exception=False) + + # Check that the formatted message was logged + assert "Custom error: Test exception" in caplog.text + + def test_prepare_for_logs(self, tmp_path): + """Test preparing logger for training logs""" + output_dir = tmp_path / "output" + logger = Logger("prepare_test_logger") + + # Prepare for logs + logger.prepare_for_logs(str(output_dir), log_level="DEBUG") + + # Check file handler was added + file_handlers = [h for h in logger.logger.handlers if isinstance(h, logging.FileHandler)] + assert len(file_handlers) == 1 + + # Check file exists + log_file = output_dir / "training.log" + assert log_file.exists() + + # Check log level was updated + assert logger.logger.level == logging.DEBUG + + def test_prepare_for_logs_no_file_handler(self): + """Test preparing logger without saving to file""" + logger = Logger("prepare_test_logger") + + # Prepare for logs without saving metrics + logger.prepare_for_logs(log_level="INFO") + + # Check no file handler was added + file_handlers = [h for h in logger.logger.handlers if isinstance(h, logging.FileHandler)] + assert len(file_handlers) == 0 + + def test_prepare_for_logs_already_has_file_handler(self, tmp_path): + """Test preparing logger when file handler already exists""" + output_dir = tmp_path / "output" + logger = Logger("prepare_test_logger") + + # Add a file handler manually first + log_file = output_dir / "manual.log" + log_file.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(str(log_file)) + logger.logger.addHandler(file_handler) + + # Prepare for logs again + logger.prepare_for_logs(str(output_dir), log_level="INFO") + + # Should still have only one file handler + file_handlers = [h for h in logger.logger.handlers if isinstance(h, logging.FileHandler)] + assert len(file_handlers) == 1 + + def test_get_logger_singleton(self): + """Test that get_logger returns the same instance""" + logger1 = get_logger() + logger2 = get_logger() + + assert logger1 is logger2 + + def test_get_logger_with_file(self, tmp_path): + """Test get_logger with file parameter""" + log_file = tmp_path / "get_logger_test.log" + logger = get_logger(str(log_file)) + + # Check that we have 2 handlers (console + file) + assert len(logger.logger.handlers) == 2 # Console + file + assert isinstance(logger.logger.handlers[1], logging.FileHandler) + + # Check file exists + assert log_file.exists() + + +class TestLoggerIntegration: + """Integration tests for logger functionality""" + + def setup_method(self): + """Reset the global logger before each test method""" + import QEfficient.finetune.experimental.core.logger as logger_module + + logger_module._logger = None + + def test_complete_workflow(self, tmp_path, caplog): + """Test complete logger workflow""" + # Setup + log_file = tmp_path / "workflow.log" + logger = Logger("workflow_test", str(log_file), logging.DEBUG) + + # Test all methods + logger.debug("Debug test") + logger.info("Info test") + logger.warning("Warning test") + logger.error("Error test") + logger.critical("Critical test") + + # Test exception handling + try: + raise ValueError("Test exception") + except ValueError as e: + logger.log_exception("Caught exception", e, raise_exception=False) + + # Test rank zero logging + with patch("QEfficient.finetune.experimental.core.logger.get_local_rank") as mock_rank: + mock_rank.return_value = 0 + logger.log_rank_zero("Rank zero test") + + # Verify all messages were logged + with caplog.at_level(logging.DEBUG): + assert "Debug test" in 
caplog.text + assert "Info test" in caplog.text + assert "Warning test" in caplog.text + assert "Error test" in caplog.text + assert "Critical test" in caplog.text + assert "Caught exception: Test exception" in caplog.text + assert "Rank zero test" in caplog.text + + # Check file was written to + assert log_file.exists() + content = log_file.read_text() + assert "Debug test" in content + assert "Info test" in content + assert "Warning test" in content + assert "Error test" in content + assert "Critical test" in content + assert "Caught exception: Test exception" in content + assert "Rank zero test" in content + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From afdab676e5a18b213ff1892b44a95924d548bafa Mon Sep 17 00:00:00 2001 From: smedhe Date: Wed, 18 Mar 2026 14:18:36 +0530 Subject: [PATCH 02/23] [QEff. Finetune_experimental] cherrypicking pr (#870) cherry picking PRs- 697,658,667,666,656,652,647,649,645 --------- Signed-off-by: Meet Patel Signed-off-by: Tanisha Chawada Signed-off-by: Dhiraj Kumar Sah Signed-off-by: Sharvari Medhe Signed-off-by: Ann Kuruvilla Signed-off-by: Swati Allabadi Signed-off-by: Ann Kuruvilla --- .../finetune/experimental/core/callbacks.py | 199 +++++ .../experimental/core/component_registry.py | 204 +++++ .../experimental/core/config_manager.py | 747 ++++++++++++++++++ .../finetune/experimental/core/dataset.py | 251 ++++++ .../finetune/experimental/core/model.py | 132 ++++ .../finetune/experimental/core/optimizer.py | 25 + .../experimental/core/trainer/base_trainer.py | 73 ++ .../experimental/core/trainer/sft_trainer.py | 9 + .../experimental/core/utils/dataset_utils.py | 25 + .../experimental/core/utils/profiler_utils.py | 88 +++ .../experimental/tests/test_callback.py | 63 ++ .../experimental/tests/test_config.yaml | 104 +++ .../experimental/tests/test_config_manager.py | 62 ++ .../experimental/tests/test_dataset.py | 528 +++++++++++++ .../finetune/experimental/tests/test_model.py | 136 ++++ .../experimental/tests/test_optimizer.py | 96 +++ .../experimental/tests/test_registry.py | 167 ++++ .../experimental/tests/test_trainer.py | 493 ++++++++++++ 18 files changed, 3402 insertions(+) create mode 100644 QEfficient/finetune/experimental/tests/test_callback.py create mode 100644 QEfficient/finetune/experimental/tests/test_config.yaml create mode 100644 QEfficient/finetune/experimental/tests/test_config_manager.py create mode 100644 QEfficient/finetune/experimental/tests/test_dataset.py create mode 100644 QEfficient/finetune/experimental/tests/test_model.py create mode 100644 QEfficient/finetune/experimental/tests/test_optimizer.py create mode 100644 QEfficient/finetune/experimental/tests/test_registry.py create mode 100644 QEfficient/finetune/experimental/tests/test_trainer.py diff --git a/QEfficient/finetune/experimental/core/callbacks.py b/QEfficient/finetune/experimental/core/callbacks.py index d647b73a65..30659e3bbd 100644 --- a/QEfficient/finetune/experimental/core/callbacks.py +++ b/QEfficient/finetune/experimental/core/callbacks.py @@ -4,3 +4,202 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import json +import os +from typing import Any, Dict, Optional + +from transformers import ( + DefaultFlowCallback, + EarlyStoppingCallback, + PrinterCallback, + ProgressCallback, + TrainingArguments, +) +from transformers.integrations.integration_utils import TensorBoardCallback +from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState + 
+from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.utils.profiler_utils import ( + get_op_verifier_ctx, + init_qaic_profiling, + stop_qaic_profiling, +) + +registry.callback("early_stopping")(EarlyStoppingCallback) +registry.callback("printer")(PrinterCallback) +registry.callback("default_flow")(DefaultFlowCallback) +registry.callback("tensorboard")(TensorBoardCallback) + + +@registry.callback("enhanced_progressbar") +class EnhancedProgressCallback(ProgressCallback): + """ + A [`TrainerCallback`] that displays the progress of training or evaluation. + You can modify `max_str_len` to control how long strings are truncated when logging. + """ + + def __init__(self, *args, **kwargs): + """ + Initialize the callback with optional max_str_len parameter to control string truncation length. + + Args: + max_str_len (`int`): + Maximum length of strings to display in logs. + Longer strings will be truncated with a message. + """ + super().__init__(*args, **kwargs) + + def on_train_begin(self, args, state, control, **kwargs): + """Set progress bar description at the start of training.""" + super().on_train_begin(args, state, control, **kwargs) + if self.training_bar is not None: + self.training_bar.set_description("Training Progress") + + def on_log(self, args, state, control, logs=None, **kwargs): + """ + Override the default `on_log` behavior during training to display + the current epoch number, loss, and learning rate in the logs. + """ + if state.is_world_process_zero and self.training_bar is not None: + # make a shallow copy of logs so we can mutate the fields copied + # but avoid doing any value pickling. + shallow_logs = {} + for k, v in logs.items(): + if isinstance(v, str) and len(v) > self.max_str_len: + shallow_logs[k] = ( + f"[String too long to display, length: {len(v)} > {self.max_str_len}. " + "Consider increasing `max_str_len` if needed.]" + ) + else: + shallow_logs[k] = v + _ = shallow_logs.pop("total_flos", None) + # round numbers so that it looks better in console + if "epoch" in shallow_logs: + shallow_logs["epoch"] = round(shallow_logs["epoch"], 2) + + updated_dict = {} + if "epoch" in shallow_logs: + updated_dict["epoch"] = shallow_logs["epoch"] + if "loss" in shallow_logs: + updated_dict["loss"] = shallow_logs["loss"] + if "learning_rate" in shallow_logs: + updated_dict["lr"] = shallow_logs["learning_rate"] + self.training_bar.set_postfix(updated_dict) + + +@registry.callback("json_logger") +class JSONLoggerCallback(TrainerCallback): + """ + A [`TrainerCallback`] that logs training and evaluation metrics to a JSON file. + """ + + def __init__(self, log_path=None, *args, **kwargs): + """ + Initialize the callback with the path to the JSON log file. + + Args: + log_path (`str`): + Path to the jsonl file where logs will be saved. 
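+                If None, defaults to "training_logs.jsonl" under the OUTPUT_DIR
+                environment variable (or the current directory).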
+        """
+        super().__init__(*args, **kwargs)
+        if log_path is None:
+            log_path = os.path.join(os.environ.get("OUTPUT_DIR", "./"), "training_logs.jsonl")
+        self.log_path = log_path
+        # Ensure the log file is created and empty
+        with open(self.log_path, "w") as _:
+            pass
+
+    def on_log(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        logs: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """Append sanitized log metrics (including global_step) to a JSONL file."""
+        if logs is None:
+            return
+        logs.pop("entropy", None)
+        logs.pop("mean_token_accuracy", None)
+        if state.global_step:
+            logs["global_step"] = state.global_step
+        with open(self.log_path, "a") as f:
+            json_line = json.dumps(logs, separators=(",", ":"))
+            f.write(json_line + "\n")
+
+
+@registry.callback("qaic_profiler_callback")
+class QAICProfilerCallback(TrainerCallback):
+    """Callback to profile QAIC devices over a specified training step range."""
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize QAIC profiler settings (start/end steps and target device IDs).
+        """
+        self.start_step = kwargs.get("start_step", -1)
+        self.end_step = kwargs.get("end_step", -1)
+        self.device_ids = kwargs.get("device_ids", [0])
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """
+        Event called at the beginning of a training step. If using gradient accumulation, one training step might take
+        several inputs.
+        """
+        if state.global_step == self.start_step:
+            for device_id in self.device_ids:
+                init_qaic_profiling(True, f"qaic:{device_id}")
+        elif state.global_step == self.end_step:
+            for device_id in self.device_ids:
+                stop_qaic_profiling(True, f"qaic:{device_id}")
+
+
+@registry.callback("qaic_op_by_op_verifier_callback")
+class QAICOpByOpVerifierCallback(TrainerCallback):
+    """Callback to verify QAIC operations step-by-step during a specified training range."""
+
+    def __init__(self, *args, **kwargs):
+        """
+        Initialize QAIC Op-by-Op verifier callback with profiling and tolerance settings.
+        """
+        self.start_step = kwargs.get("start_step", -1)
+        self.end_step = kwargs.get("end_step", -1)
+        self.trace_dir = kwargs.get("trace_dir", "qaic_op_by_op_traces")
+        self.atol = kwargs.get("atol", 1e-1)
+        self.rtol = kwargs.get("rtol", 1e-5)
+        # Context manager handle; set in on_step_begin and cleared in on_step_end.
+        self.op_verifier_ctx_step = None
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """
+        Event called at the beginning of a training step. If using gradient accumulation, one training step might take
+        several inputs.
+        """
+        if self.start_step <= state.global_step < self.end_step:
+            self.op_verifier_ctx_step = get_op_verifier_ctx(
+                use_op_by_op_verifier=True,
+                device_type="qaic",
+                dump_dir=self.trace_dir,
+                step=state.global_step,
+                atol=self.atol,
+                rtol=self.rtol,
+            )
+            self.op_verifier_ctx_step.__enter__()
+
+    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        """
+        Event called at the end of a training step. If using gradient accumulation, one training step might take
+        several inputs.
+        """
+        if self.start_step <= state.global_step < self.end_step:
+            if self.op_verifier_ctx_step is not None:
+                self.op_verifier_ctx_step.__exit__(None, None, None)
+                self.op_verifier_ctx_step = None
+
+
+def create_callbacks(name: str, **kwargs) -> Any:
+    """Create a callback instance."""
+    callback_class = registry.get_callback(name)
+    if callback_class is None:
+        raise ValueError(f"Unknown callback: {name}. Available: {registry.list_callbacks()}")
+    return callback_class(**kwargs)
diff --git a/QEfficient/finetune/experimental/core/component_registry.py b/QEfficient/finetune/experimental/core/component_registry.py
index d647b73a65..d1f9480311 100644
--- a/QEfficient/finetune/experimental/core/component_registry.py
+++ b/QEfficient/finetune/experimental/core/component_registry.py
@@ -4,3 +4,207 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+import logging
+from typing import Any, Callable, Dict, Optional, Type
+
+# from QEfficient.finetune.experimental.core.logger import get_logger
+
+# logger = get_logger()
+logger = logging.getLogger(__name__)
+
+
+def get_object(obj_dict: Dict, name: str, object_type: str, list_fn: Callable) -> Optional[Type]:
+    """Utility to get object from a dictionary with error handling."""
+    obj = obj_dict.get(name)
+    if obj is None:
+        raise ValueError(f"Unknown {object_type}: {name}. Available: {list_fn()}")
+    return obj
+
+
+class ComponentRegistry:
+    """Registry for managing different training components."""
+
+    def __init__(self):
+        self._optimizers: Dict[str, Type] = {}
+        self._schedulers: Dict[str, Type] = {}
+        self._datasets: Dict[str, Type] = {}
+        self._models: Dict[str, Type] = {}
+        self._data_collators: Dict[str, Type] = {}
+        self._metrics: Dict[str, Type] = {}
+        self._loss_functions: Dict[str, Type] = {}
+        self._callbacks: Dict[str, Type] = {}
+        self._hooks: Dict[str, Type] = {}
+        self._trainer_modules: Dict[str, Type] = {}
+
+    def trainer_module(self, name: str, args_cls=None, required_kwargs=None):
+        """
+        Decorator to register a trainer module with its configuration.
+        Each trainer module is bound to its args class and required kwargs.
+
+        Args:
+            name: Name of the trainer type
+            args_cls: The arguments class for this trainer
+            required_kwargs: Dictionary of required keyword arguments and their default values
+        """
+        required_kwargs = required_kwargs or {}
+
+        def decorator(trainer_cls):
+            self._trainer_modules[name] = {
+                "trainer_cls": trainer_cls,
+                "args_cls": args_cls,
+                "required_kwargs": required_kwargs,
+            }
+            logger.info(f"Registered trainer module: {name}")
+            # Return the class itself so the decorated name still refers to the trainer class.
+            return trainer_cls
+
+        return decorator
+
+    def optimizer(self, name: str):
+        """Decorator to register an optimizer class."""
+
+        def decorator(cls: Type):
+            self._optimizers[name] = cls
+            logger.info(f"Registered optimizer: {name}")
+            return cls
+
+        return decorator
+
+    def scheduler(self, name: str):
+        """Decorator to register a scheduler class."""
+
+        def decorator(cls: Type):
+            self._schedulers[name] = cls
+            logger.info(f"Registered scheduler: {name}")
+            return cls
+
+        return decorator
+
+    def dataset(self, name: str):
+        """Decorator to register a dataset class."""
+
+        def decorator(cls: Type):
+            self._datasets[name] = cls
+            logger.info(f"Registered dataset: {name}")
+            return cls
+
+        return decorator
+
+    def model(self, name: str):
+        """Decorator to register a model class."""
+
+        def decorator(cls: Type):
+            self._models[name] = cls
+            logger.info(f"Registered model: {name}")
+            return cls
+
+        return decorator
+
+    def data_collator(self, name: str):
+        """Decorator to register a data collator class."""
+
+        def decorator(fn_pointer: Type):
+            self._data_collators[name] = fn_pointer
+            logger.info(f"Registered data collator: {name}")
+            return fn_pointer
+
+        return decorator
+
+    def loss_function(self, name: str):
+        """Decorator to register a loss function class."""
+
+        def decorator(cls: Type):
+            self._loss_functions[name] = cls
+            logger.info(f"Registered loss function: {name}")
+            return cls
+
+        return decorator
+
+    def callback(self, name: str):
+        """Decorator to register a callback class."""
+
+        def decorator(cls: Type):
+            self._callbacks[name] = cls
+            logger.info(f"Registered callback: {name}")
+            return cls
+
+        return decorator
+
+    def get_trainer_module(self, name: str) -> Optional[Type]:
+        """Get trainer module class by name."""
+        return get_object(self._trainer_modules, name, "trainer module", self.list_trainer_modules)
+
+    def get_optimizer(self, name: str) -> Optional[Type]:
+        """Get optimizer class by name."""
+        return get_object(self._optimizers, name, "optimizer", self.list_optimizers)
+
+    def get_scheduler(self, name: str) -> Optional[Type]:
+        """Get scheduler class by name."""
+        return get_object(self._schedulers, name, "scheduler", self.list_schedulers)
+
+    def get_dataset(self, name: str) -> Optional[Type]:
+        """Get dataset class by name."""
+        return get_object(self._datasets, name, "dataset", self.list_datasets)
+
+    def get_model(self, name: str) -> Optional[Type]:
+        """Get model class by name."""
+        return get_object(self._models, name, "model", self.list_models)
+
+    def get_data_collator(self, name: str) -> Optional[Type]:
+        """Get data collator class by name."""
+        return get_object(self._data_collators, name, "data collator", self.list_data_collators)
+
+    def get_loss_function(self, name: str) -> Optional[Type]:
+        """Get loss function class by name."""
+        return get_object(self._loss_functions, name, "loss function", self.list_loss_functions)
+
+    def get_callback(self, name: str) -> Optional[Type]:
+        """Get callback class by name."""
+        return get_object(self._callbacks, name, "callback", self.list_callbacks)
+
+    def list_trainer_modules(self) -> list[str]:
+        """List all registered trainer modules."""
+        return list(self._trainer_modules.keys())
+
+    def list_optimizers(self) -> list[str]:
+        """List all registered optimizers."""
+        return list(self._optimizers.keys())
+
+    def list_schedulers(self) -> list[str]:
+        """List all registered schedulers."""
+        return list(self._schedulers.keys())
+
+    def list_datasets(self) -> list[str]:
+        """List all registered datasets."""
+        return list(self._datasets.keys())
+
+    def list_models(self) -> list[str]:
+        """List all registered models."""
+        return list(self._models.keys())
+
+    def list_data_collators(self) -> list[str]:
+        """List all registered data collators."""
+        return list(self._data_collators.keys())
+
+    def list_loss_functions(self) -> list[str]:
+        """List all registered loss functions."""
+        return list(self._loss_functions.keys())
+
+    def list_callbacks(self) -> list[str]:
+        """List all registered callbacks."""
+        return list(self._callbacks.keys())
+
+
+# Global registry instance
+registry = ComponentRegistry()
+
+
+class ComponentFactory:
+    @staticmethod
+    def create_model(model_type: str, model_name: str, **kwargs) -> Any:
+        """Create a model instance."""
+        model_class = registry.get_model(model_type)
+        if model_class is None:
+            raise ValueError(f"Unknown model: {model_type}. Available: {registry.list_models()}")
+        model_instance = model_class.create(model_name, **kwargs)
+        return model_instance
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index d647b73a65..b28c2e1e33 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -4,3 +4,750 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+"""
+Configuration manager for handling all training configurations.
+Provides centralized configuration loading, validation, and management.
+"""
+
+import json
+import os
+from dataclasses import asdict, dataclass, field, fields, is_dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import yaml
+from transformers.hf_argparser import HfArgumentParser
+
+from QEfficient.finetune.experimental.core.component_registry import registry
+
+
+@dataclass
+class OptimizerConfig:
+    """Configuration for optimizers."""
+
+    optimizer_name: str = field(
+        default="adamw",
+        metadata={"help": "The name of the optimizer to use."},
+    )
+    lr: float = field(
+        default=5e-5,
+        metadata={"help": "The initial learning rate for the optimizer."},
+    )
+    weight_decay: float = field(
+        default=0.01,
+        metadata={"help": "The weight decay to apply (if any)."},
+    )
+
+
+@dataclass
+class SchedulerConfig:
+    """Configuration for learning rate schedulers."""
+
+    scheduler_name: str = field(
+        default="cosine",
+        metadata={"help": "The name of the scheduler to use (e.g., 'linear', 'cosine')."},
+    )
+    warmup_steps: int = field(
+        default=100,
+        metadata={
+            "help": "Number of steps for the warmup phase. If the provided "
+            "value is in the [0, 1) range, it is interpreted as the "
+            "ratio of total training steps for the warmup phase."
+ }, + ) + + +@dataclass +class DatasetConfig: + """Configuration for datasets.""" + + tokenizer_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the tokenizer to use."}, + ) + dataset_type: str = field( + default="seq_completion", + metadata={"help": "The type of dataset (e.g., 'seq_completion')."}, + ) + dataset_name: str = field( + default="knkarthick/samsum", + metadata={"help": "The name or path of the dataset."}, + ) + dataset_subset: str = field( + default="default", + metadata={"help": "The subset of the dataset to use, if applicable."}, + ) + train_split: str = field( + default="train", + metadata={"help": "The name of the training split."}, + ) + test_split: str = field( + default="test", + metadata={"help": "The name of the test/validation split."}, + ) + max_seq_length: int = field( + default=512, + metadata={"help": "The maximum sequence length for tokenization."}, + ) + split_ratio: float = field( + default=0.8, + metadata={"help": "Ratio for train/test split, used when only train_split is provided."}, + ) + input_columns: list[str] = field( + default_factory=lambda: ["text"], + metadata={"help": "List of column names containing input text."}, + ) + target_column: Optional[str] = field( + default=None, + metadata={"help": "Name of the column containing target labels (if applicable)."}, + ) + train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + num_workers: int = field( + default=4, + metadata={"help": "Number of workers for dataset processing."}, + ) + collate_fn: str = field( + default="dynamic_padding", + metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."}, + ) + group_by_length: bool = field( + default=True, + metadata={"help": "Whether to group samples by length to minimize padding."}, + ) + length_column_name: str = field( + default="input_ids", + metadata={"help": "The column name containing the length of the input sequences."}, + ) + dataloader_pin_memory: bool = field( + default=True, + metadata={"help": "Whether to pin GPU memory for dataloaders."}, + ) + dataloader_persistent_workers: bool = field( + default=True, + metadata={"help": "Whether to keep dataloader workers alive across epochs."}, + ) + dataloader_prefetch_factor: int = field( + default=1, + metadata={"help": "Number of samples loaded in advance by each worker."}, + ) + dataloader_drop_last: bool = field( + default=False, + metadata={"help": "Whether to drop the last incomplete batch."}, + ) + dataloader_num_workers: int = field( + default=1, + metadata={"help": "Number of workers for the DataLoader."}, + ) + + +@dataclass +class PeftConfig: + """Configuration for PEFT (Parameter-Efficient Fine-Tuning) methods.""" + + lora_r: int = field( + default=8, + metadata={"help": "Lora attention dimension."}, + ) + lora_alpha: int = field( + default=16, + metadata={"help": "Lora alpha."}, + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout probability for Lora layers."}, + ) + target_modules: list[str] = field( + default_factory=lambda: ["q_proj", "v_proj"], + metadata={"help": "The modules to apply Lora to."}, + ) + bias: str = field( + default="none", + metadata={"help": "Bias type for Lora ('none', 'all', 'lora_only')."}, + ) + task_type: str = field( + default="CAUSAL_LM", + metadata={"help": "The task type for PEFT (e.g., 
'CAUSAL_LM', 'SEQ_2_SEQ_LM')."}, + ) + peft_type: str = field( + default="LORA", + metadata={"help": "The PEFT method to use (e.g., 'LORA', 'IA3')."}, + ) + + +@dataclass +class ModelConfig: + """Configuration for models.""" + + model_name: str = field( + default="HuggingFaceTB/SmolLM-135M", + metadata={"help": "The name or path of the pretrained model."}, + ) + model_type: str = field( + default="hf", + metadata={"help": "The type of model ('hf' for Hugging Face, 'custom' for custom models)."}, + ) + auto_class_name: str = field( + default="AutoModelForCausalLM", + metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."}, + ) + load_in_4bit: bool = field( + default=False, + metadata={"help": "Whether to load the model in 4-bit quantization."}, + ) + use_peft: bool = field( + default=True, + metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."}, + ) + peft_config: Optional[PeftConfig] = field( + default_factory=PeftConfig, + metadata={"help": "Configuration for PEFT."}, + ) + use_cache: bool = field( + default=False, + metadata={"help": "Whether to use the past key/values in the model for faster decoding."}, + ) + attn_implementation: str = field( + default="sdpa", + metadata={"help": "The attention implementation to use (e.g., 'sdpa', 'eager')."}, + ) + device_map: Optional[str] = field( + default=None, + metadata={"help": "The device map to use for model distribution (e.g., 'auto')."}, + ) + + +@dataclass +class CallbackConfig: + """Configuration for callbacks.""" + + callbacks: Dict[str, Dict[str, Any]] = field( + default_factory=dict, + metadata={"help": "Dictionary of callback configurations, keyed by callback name."}, + ) + + +@dataclass +class GradientCheckpointingKwargs: + """Arguments for gradient checkpointing.""" + + preserve_rng_state: bool = field( + default=True, + metadata={"help": "Whether to preserve the RNG state when checkpointing."}, + ) + use_reenrant: bool = field( + default=False, + metadata={"help": "Whether to use reentrant gradient checkpointing."}, + ) + + +@dataclass +class DdpConfig: + """Arguments for Distributed Data Parallel (DDP) training.""" + + ddp_backend: str = field( + default="qccl", + metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."}, + ) + ddp_find_unused_parameters: bool = field( + default=False, + metadata={"help": "Whether to find unused parameters in DDP."}, + ) + ddp_bucket_cap_mb: Optional[int] = field( + default=25, + metadata={"help": "The bucket size in MB for DDP communication."}, + ) + ddp_broadcast_buffers: bool = field( + default=True, + metadata={"help": "Whether to broadcast buffers in DDP."}, + ) + ddp_timeout: int = field( + default=1800, + metadata={"help": "Timeout for DDP operations in seconds."}, + ) + + +@dataclass +class TrainingConfig: + """Configuration for training.""" + + type: str = field( + default="sft", + metadata={"help": "The type of training (e.g., 'sft' for Supervised Fine-Tuning)."}, + ) + output_dir: str = field( + default="./training_results", + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) + overwrite_output_dir: bool = field( + default=False, + metadata={"help": "Whether to overwrite the output directory."}, + ) + seed: int = field( + default=42, + metadata={"help": "Random seed for reproducibility."}, + ) + device: str = field( + default="qaic", + metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."}, + ) + do_eval: bool = field( + default=True, + 
metadata={"help": "Whether to run evaluation during training."}, + ) + eval_strategy: str = field( + default="epoch", + metadata={"help": "The evaluation strategy to use ('no', 'steps', 'epoch')."}, + ) + eval_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two evaluations."}, + ) + per_device_train_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during training."}, + ) + per_device_eval_batch_size: int = field( + default=1, + metadata={"help": "Batch size per device during evaluation."}, + ) + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + num_train_epochs: int = field( + default=1, + metadata={"help": "Total number of training epochs to perform."}, + ) + max_steps: int = field( + default=-1, + metadata={"help": "If > 0: set total number of training steps to perform."}, + ) + + log_level: str = field( + default="info", + metadata={"help": "Set the verbosity level of the logs ('debug', 'info', 'warning', 'error')."}, + ) + log_on_each_node: bool = field( + default=True, + metadata={"help": "Whether to log on each node in a distributed setup."}, + ) + logging_strategy: str = field( + default="steps", + metadata={"help": "The logging strategy to use ('no', 'steps', 'epoch')."}, + ) + logging_steps: int = field( + default=10, + metadata={"help": "Number of update steps between two loggings."}, + ) + + save_strategy: str = field( + default="epoch", + metadata={"help": "The checkpoint save strategy to use ('no', 'steps', 'epoch')."}, + ) + save_steps: int = field( + default=100, + metadata={"help": "Number of update steps between two checkpoints (if save_strategy is 'steps')."}, + ) + save_total_limit: int = field( + default=5, + metadata={"help": "Limit the total amount of checkpoints. 
Deletes older checkpoints to stay within limit."}, + ) + metric_for_best_model: str = field( + default="eval_loss", + metadata={"help": "The metric to use to compare two models ('eval_loss', etc.)."}, + ) + + dtype: str = field( + default="fp16", + metadata={"help": "The data type to use for training (e.g., 'fp16', 'bf16')."}, + ) + + gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Whether to use gradient checkpointing."}, + ) + gradient_checkpointing_kwargs: Optional[GradientCheckpointingKwargs] = field( + default_factory=GradientCheckpointingKwargs, + metadata={"help": "Arguments for gradient checkpointing."}, + ) + + torch_compile: bool = field( + default=True, + metadata={"help": "Whether to compile the model with `torch.compile`."}, + ) + include_num_input_tokens_seen: bool = field( + default=True, + metadata={"help": "Whether to include the number of input tokens seen in logs."}, + ) + average_tokens_across_devices: bool = field( + default=True, + metadata={"help": "Whether to average tokens across devices in distributed training."}, + ) + + disable_tqdm: Optional[bool] = field( + default=None, + metadata={"help": "Whether to disable the tqdm progress bar."}, + ) + fsdp_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "FSDP configuration dictionary."}, + ) + deepspeed_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "DeepSpeed configuration dictionary."}, + ) + accelerator_config: Optional[Dict[str, Any]] = field( + default=None, + metadata={"help": "Accelerate configuration dictionary."}, + ) + ddp_config: Optional[DdpConfig] = field( + default_factory=DdpConfig, + metadata={"help": "DDP configuration dictionary."}, + ) + use_cpu: Optional[bool] = field( + default=None, + metadata={"help": "Whether to explicitly run training on CPU."}, + ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Path to a checkpoint to resume training from."}, + ) + restore_callback_states_from_checkpoint: Optional[bool] = field( + default=None, + metadata={"help": "Whether to restore callback states from checkpoint."}, + ) + report_to: Optional[List[str]] = field( + default=None, + metadata={"help": "The list of integrations to report the results and logs to."}, + ) + completion_only_loss: Optional[bool] = field( + default=False, + metadata={"help": "Whether to compute loss only on completion tokens."}, + ) + + +@dataclass +class MasterConfig: + """Main training configuration.""" + + model: ModelConfig = field(default_factory=ModelConfig, metadata={"help": "Configuration for the model."}) + + dataset: DatasetConfig = field(default_factory=DatasetConfig, metadata={"help": "Configuration for the dataset."}) + + optimizers: OptimizerConfig = field( + default_factory=OptimizerConfig, metadata={"help": "Configuration for optimizers."} + ) + + scheduler: SchedulerConfig = field( + default_factory=SchedulerConfig, metadata={"help": "Configuration for the learning rate scheduler."} + ) + + callbacks: CallbackConfig = field(default_factory=CallbackConfig, metadata={"help": "Configuration for callbacks."}) + + training: TrainingConfig = field( + default_factory=TrainingConfig, metadata={"help": "Configuration for training parameters."} + ) + + extra_params: Dict[str, Any] = field( + default_factory=dict, metadata={"help": "Additional top-level parameters not explicitly defined."} + ) + + +def parse_arguments(config_path: Optional[str] = None, args: Optional[List[str]] = None) -> MasterConfig: + 
"""Create argument parser for the new finetuning interface.""" + parser = HfArgumentParser(MasterConfig) + + if config_path: + config_path = os.path.abspath(config_path) + if not os.path.exists(config_path): + raise FileNotFoundError(f"Config file not found: {config_path}") + if not (config_path.endswith(".yaml") or config_path.endswith(".yml")): + raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}") + + try: + (master_config,) = parser.parse_yaml_file(yaml_file=config_path) + return master_config + except Exception as e: + raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") + + args = [] if args is None else args + # If a single positional YAML file was passed via args, parse it as YAML + if len(args) == 1 and (args[0].endswith(".yaml") or args[0].endswith(".yml")): + yaml_path = os.path.abspath(args[0]) + (master_config,) = parser.parse_yaml_file(yaml_file=yaml_path) + else: + (master_config,) = parser.parse_args_into_dataclasses(args=args) + master_config = asdict(master_config) + master_config = MasterConfig(**master_config) + + return master_config + + +class ConfigManager: + """Manages configuration loading, validation, and updates.""" + + def __init__(self, config: MasterConfig): + """ + Initialize ConfigManager with either: + - Path to config file (str or Path) + - Configuration dictionary + - None (creates empty config) + """ + self.config = config + + def load_config(self, config_path: Union[str, Path]) -> None: + """Load configuration from file.""" + config_path = Path(config_path) + + if not config_path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + if config_path.suffix.lower() in [".yaml", ".yml"]: + with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + elif config_path.suffix.lower() == ".json": + with open(config_path, "r") as f: + config_dict = json.load(f) + else: + raise ValueError(f"Unsupported configuration file format: {config_path.suffix}") + + self.update_config(config_dict) + + def _ensure_extra_params(self, obj) -> Dict[str, Any]: + """Ensure obj.extra_params exists and is a dict; return it.""" + ep = getattr(obj, "extra_params", None) + if ep is None: + setattr(obj, "extra_params", {}) + ep = obj.extra_params + if not isinstance(ep, dict): + raise TypeError("extra_params must be a dict.") + return ep + + def _stash_top_level_extra(self, section: str, nested_key: str, value: Any) -> None: + """Store unknown nested values under MasterConfig.extra_params['section.nested_key'].""" + ep = self._ensure_extra_params(self.config) + ep[f"{section}.{nested_key}"] = value + + def update_config(self, config_dict: Dict[str, Any]) -> None: + """Update configuration with dictionary values.""" + + SPECIAL_KEYS = {"callbacks"} + + for key, value in config_dict.items(): + if hasattr(self.config, key): + target = getattr(self.config, key) + + # Special handling for callbacks (dict inside CallbackConfig) + if key in SPECIAL_KEYS and isinstance(value, dict): + if is_dataclass(target) and hasattr(target, "callbacks") and isinstance(target.callbacks, dict): + for component_name, component_cfg in value.items(): + target.callbacks[component_name] = component_cfg + elif isinstance(target, dict): + target.update(value) + else: + self._stash_top_level_extra(key, "__all__", value) + continue + + if isinstance(value, dict) and is_dataclass(target): + known = {f.name for f in fields(target)} + for nested_key, nested_value in value.items(): + if nested_key in known: + setattr(target, nested_key, 
nested_value) + else: + self._stash_top_level_extra(key, nested_key, nested_value) + continue + + if isinstance(value, dict) and isinstance(target, dict): + target.update(value) + continue + setattr(self.config, key, value) + + else: + ep = self._ensure_extra_params(self.config) + ep[key] = value + + def save_config(self, output_path: Union[str, Path]) -> None: + """Save current configuration to file.""" + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + config_dict = self.config + + if output_path.suffix.lower() in [".yaml", ".yml"]: + with open(output_path, "w") as f: + yaml.dump(config_dict, f, default_flow_style=False, indent=2) + elif output_path.suffix.lower() == ".json": + with open(output_path, "w") as f: + json.dump(config_dict, f, indent=2) + else: + raise ValueError(f"Unsupported output file format: {output_path.suffix}") + + def _push(self, errs: List[str], cond: bool, msg: str) -> None: + """Append msg to errs if cond is True.""" + if cond: + errs.append(msg) + + def validate_config(self) -> None: + """ + Validate configuration parameters for MasterConfig. + """ + errors: List[str] = [] + + cfg = self.config + model = getattr(cfg, "model", {}) + dataset = getattr(cfg, "dataset", {}) + training = getattr(cfg, "training", {}) + + # ---------- Model ---------- + self._push(errors, not model.get("model_name"), "model.model_name is required.") + + # PEFT validation + if model.get("use_peft"): + pc = model.get("peft_config", {}) + self._push(errors, not isinstance(pc, dict), "model.peft_config must be a dict when use_peft=True.") + if isinstance(pc, dict): + self._push( + errors, + not isinstance(pc.get("lora_r", 0), int) or pc.get("lora_r", 0) <= 0, + "model.peft_config.lora_r must be a positive integer.", + ) + self._push( + errors, + not isinstance(pc.get("lora_alpha", 0), int) or pc.get("lora_alpha", 0) <= 0, + "model.peft_config.lora_alpha must be a positive integer.", + ) + self._push( + errors, + not (0.0 <= float(pc.get("lora_dropout", 0.0)) < 1.0), + "model.peft_config.lora_dropout must be in [0,1).", + ) + + # ---------- Dataset ---------- + self._push(errors, not dataset.get("dataset_name"), "dataset.dataset_name is required.") + self._push(errors, not dataset.get("tokenizer_name"), "dataset.tokenizer_name is required.") + self._push(errors, dataset.get("max_seq_length", 0) <= 0, "dataset.max_seq_length must be positive.") + + # ---------- Training ---------- + # Batch sizes + self._push( + errors, + training.get("per_device_train_batch_size", 0) <= 0, + "training.per_device_train_batch_size must be positive.", + ) + self._push( + errors, + training.get("per_device_eval_batch_size", 0) <= 0, + "training.per_device_eval_batch_size must be positive.", + ) + + # Epochs / steps + n_epochs = training.get("num_train_epochs", 0) + max_steps = training.get("max_steps", -1) + self._push( + errors, + n_epochs <= 0 and max_steps <= 0, + "Either training.num_train_epochs > 0 or training.max_steps > 0 must be set.", + ) + + # Gradient accumulation + self._push( + errors, + training.get("gradient_accumulation_steps", 0) <= 0, + "training.gradient_accumulation_steps must be positive.", + ) + + # Logging / saving configs + self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.") + self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.") + + # Device + valid_devices = ["cpu", "cuda", "qaic"] + training_device = training.get("device", None) + if training_device 
not in valid_devices: + self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.") + + # DDP config + ddp = training.get("ddp_config", {}) + if isinstance(ddp, dict): + backend = ddp.get("ddp_backend") + # Accept qccl for Qualcomm, nccl for CUDA, gloo for CPU + self._push( + errors, + backend not in {"qccl", "nccl", "gloo", None}, + "training.ddp_config.ddp_backend must be one of {'qccl','nccl','gloo'} or omitted.", + ) + + # ---------- Final ---------- + if errors: + # Join messages with bullet points for readability + raise ValueError("Configuration validation failed:\n- " + "\n- ".join(errors)) + + def get_callback_config(self) -> Dict[str, Any]: + """Get callback configuration as dictionary.""" + return self.config.callbacks + + def get_optimizer_config(self) -> Dict[str, Any]: + """Get optimizer configuration as dictionary.""" + return self.config.optimizers + + def get_training_config(self) -> Dict[str, Any]: + """Get training configuration as dictionary.""" + return self.config.training + + def get_scheduler_config(self) -> Dict[str, Any]: + """Get scheduler configuration as dictionary.""" + return self.config.scheduler + + def get_dataset_config(self) -> Dict[str, Any]: + """Get dataset configuration as dictionary.""" + return self.config.dataset + + def get_model_config(self) -> Dict[str, Any]: + """Get model configuration as dictionary.""" + return self.config.model + + def to_dict(self) -> Dict[str, Any]: + """Convert configuration to dictionary.""" + return asdict(self.config) + + def __getattr__(self, name: str) -> Any: + """Allow direct access to config attributes.""" + if hasattr(self.config, name): + return getattr(self.config, name) + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + +def create_trainer_config(name: str, **dependencies) -> tuple: + """ + Create trainer configuration based on registered trainer modules. + + Args: + name: Name of the trainer type + **dependencies: Any dependencies needed to configure the trainer + + Returns: + tuple: (trainer_class, args_class, additional_kwargs) + """ + config = registry.get_trainer_module(name) + + # Process required kwargs based on available dependencies + additional_kwargs = {} + for kwarg, default in config["required_kwargs"].items(): + if kwarg in dependencies: + additional_kwargs[kwarg] = dependencies[kwarg] + elif default != "REQUIRED": + additional_kwargs[kwarg] = default + + # Check for missing required arguments + for kwarg, default in config["required_kwargs"].items(): + if kwarg not in additional_kwargs and default == "REQUIRED": + raise ValueError(f"Required argument '{kwarg}' not provided for trainer '{name}'") + + return config["trainer_cls"], config["args_cls"], additional_kwargs diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index d647b73a65..4a243c40b2 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -4,3 +4,254 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +""" +Dataset components for the training system. 
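+Defines the abstract BaseDataset interface and the registry-backed SFTDataset used for
+prompt/completion supervised fine-tuning.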
+""" + +import importlib +import os +import re +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict + +from datasets import load_dataset, load_dataset_builder +from torch.utils.data import Dataset + +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.utils.dataset_utils import ( + apply_train_test_split, +) + + +class BaseDataset(Dataset, ABC): + """Base class for all datasets to ensure consistent interface.""" + + def __init__(self, dataset_name: str, split: str, seed: int = 42, **kwargs): + self.dataset_name = dataset_name + self.split = split + self.seed = seed + self.kwargs = kwargs + self._initialize_dataset() + + @abstractmethod + def _initialize_dataset(self): + """Subclasses should implement this to load and prepare the dataset.""" + pass + + @abstractmethod + def __len__(self): + """Return the number of samples in the dataset.""" + pass + + @abstractmethod + def __getitem__(self, idx): + """Should return a dictionary with 'input_ids', 'attention_mask', and 'labels'.""" + pass + + +@registry.dataset("sft_dataset") +class SFTDataset(BaseDataset): + """ + A Supervised Fine-Tuning (SFT) dataset class for text data. + + This class handles loading data from Hugging Face datasets or custom JSON files, + filtering out invalid samples, and applying a prompt/completion templating for SFT tasks. + + Args: + dataset_name (str): The name of the dataset to load from Hugging Face datasets. + Ignored if json_file_path is provided. + split (str): The dataset split to use (e.g., "train", "validation", "test"). + split_ratio (float): Ratio for train/test split when only one split is available. + seed (int): Random seed for reproducibility. + json_file_path (str, optional): Path to a custom JSON file containing the dataset. + If provided, this takes precedence over dataset_name. + prompt_template (str): A string template for constructing the prompt. Variables in the + template should be enclosed in curly braces, e.g., "Answer the question: {question}". + completion_template (str): A string template for constructing the completion (target). + Variables should be enclosed in curly braces, e.g., "{answer}". + + Raises: + RuntimeError: If any variables specified in `prompt_template` or `completion_template` + are not found as columns in the loaded dataset. 
+ """ + + def __init__( + self, + dataset_name: str, + split: str, + split_ratio: float = 0.8, + seed: int = 42, + **kwargs, + ): + self.split_ratio = split_ratio + self.json_file_path = kwargs.get("json_file_path", None) + self.prompt_template = kwargs.get("prompt_template", None) + self.completion_template = kwargs.get("completion_template", None) + self.prompt_func_path = kwargs.get("prompt_func", None) + self.completion_func_path = kwargs.get("completion_func", None) + self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True) + + if self.json_file_path not in (None, ""): + if not os.path.isfile(self.json_file_path): + raise FileNotFoundError(f"JSON file not found or invalid: '{self.json_file_path}'") + if (self.prompt_template is None and self.prompt_func_path is None) or ( + self.prompt_template is not None and self.prompt_func_path is not None + ): + raise RuntimeError("Either provide prompt_template or prompt_func in the config.") + if (self.completion_template is None and self.completion_func_path is None) or ( + self.completion_template is not None and self.completion_func_path is not None + ): + raise RuntimeError("Either provide completion_template or completion_func in the config.") + + # Call parent class __init__ which will call _initialize_dataset + super().__init__(dataset_name, split, seed, **kwargs) + + def _initialize_dataset(self): + """ + Initialize the dataset from either HuggingFace or a custom JSON file. + + This method loads the dataset, applies splitting if necessary, and prepares + it for preprocessing with prompt/completion templates. + """ + if self.json_file_path: + # Load dataset from JSON file + self.dataset = load_dataset("json", data_files=self.json_file_path, split="train") + + # Apply train/test split if needed + if self.split in ["train", "test"]: + self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) + else: + # Load dataset from HuggingFace + db = load_dataset_builder(self.dataset_name) + available_splits = [] + if db.info.splits is not None: + available_splits = list(db.info.splits.keys()) + + if self.split not in available_splits: + raise ValueError(f"Split {self.split} is not available for dataset {self.dataset_name}.") + + # FIXME: Add streaming support for larger datasets. + self.dataset = load_dataset(self.dataset_name, split=self.split) + + if len(available_splits) == 1: + self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) + + self.dataset = self._setup_templates(self.dataset, self.dataset.column_names) + + def _setup_templates(self, dataset, dataset_columns): + """ + Set up prompt/completion templates or functions and apply preprocessing. + """ + if self.prompt_template: + self.prompt_func = None + # Extract variables from templates and check if they exist in dataset columns + prompt_variables = re.findall(r"\{(.*?)\}", self.prompt_template) + for var in prompt_variables: + if var not in dataset_columns: + raise RuntimeError( + f"Prompt template variable '{var}' not found in dataset columns: {dataset_columns}." 
+                    )
+        else:
+            prompt_variables = dataset_columns
+            self.prompt_func = self.import_func(self.prompt_func_path)
+
+        if self.completion_template:
+            self.completion_func = None
+            # Extract variables from templates and check if they exist in dataset columns
+            completion_variables = re.findall(r"\{(.*?)\}", self.completion_template)
+            for var in completion_variables:
+                if var not in dataset_columns:
+                    raise RuntimeError(
+                        f"Completion template variable '{var}' not found in dataset columns: {dataset_columns}."
+                    )
+        else:
+            completion_variables = dataset_columns
+            self.completion_func = self.import_func(self.completion_func_path)
+
+        # Filter out samples with None or empty strings in relevant columns
+        relevant_columns = list(set(prompt_variables + completion_variables))
+        if self.remove_samples_with_empty_columns:
+            dataset = dataset.filter(lambda example: self._filter_empty_or_none_samples(example, relevant_columns))
+        return dataset
+
+    def import_func(self, func_path: str) -> Callable:
+        if ":" not in func_path:
+            raise ValueError("func_path must be in the format 'module_file_path:function_name'.")
+        module_file_path, function_name = func_path.split(":")
+
+        try:
+            module = importlib.import_module(module_file_path)
+        except Exception as e:
+            raise RuntimeError(f"Unable to import module: {module_file_path}.") from e
+        if not hasattr(module, function_name):
+            raise ValueError(f"Function {function_name} not found in module {module_file_path}.")
+        return getattr(module, function_name)
+
+    def _filter_empty_or_none_samples(self, example: Dict[str, Any], relevant_columns: list) -> bool:
+        """
+        Filters out samples where any of the relevant columns are None or contain only whitespace.
+
+        Args:
+            example (Dict[str, Any]): A single sample from the dataset.
+            relevant_columns (list): List of column names to check for empty or None values.
+
+        Returns:
+            bool: True if the sample should be kept, False otherwise.
+        """
+        for column in relevant_columns:
+            value = example.get(column)
+            if value is None or (isinstance(value, str) and not value.strip()):
+                return False
+        return True
+
+    def _preprocess_sample(self, example: Dict[str, Any]) -> Dict[str, str]:
+        """
+        Applies the prompt and completion templates to a single example.
+
+        Args:
+            example (Dict[str, Any]): A single sample from the dataset.
+
+        Returns:
+            Dict[str, str]: A dictionary containing the 'prompt' and 'completion' strings.
+        """
+        prompt_text = (
+            self.prompt_func(example) if self.prompt_func is not None else self.prompt_template.format(**example)
+        )
+        completion_text = (
+            self.completion_func(example)
+            if self.completion_func is not None
+            else self.completion_template.format(**example)
+        )
+        return {
+            "prompt": prompt_text,
+            "completion": completion_text,
+        }
+
+    def __len__(self) -> int:
+        """
+        Returns the number of samples in the dataset.
+
+        Returns:
+            int: The total number of samples.
+        """
+        return self.dataset.num_rows
+
+    def __getitem__(self, idx: int) -> Dict[str, str]:
+        """
+        Retrieves a processed sample from the dataset at the given index.
+        This method doesn't tokenize the input items; tokenization is expected to be handled by the SFTTrainer.
+
+        Args:
+            idx (int): The index of the sample to retrieve.
+
+        Returns:
+            Dict[str, str]: A dictionary containing the processed 'prompt' and 'completion' for the sample.
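+
+        Example:
+            Illustrative, assuming "Q: {question}" / "A: {answer}" templates:
+
+                sample = dataset[0]
+                # {"prompt": "Q: What is AI?", "completion": "A: Artificial Intelligence"}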
+ """ + # Get the raw example using .select and access the first element + example = self.dataset.select(indices=[int(idx)])[0] + + # Apply preprocessing (templating) on the fly + processed_example = self._preprocess_sample(example) + + return processed_example diff --git a/QEfficient/finetune/experimental/core/model.py b/QEfficient/finetune/experimental/core/model.py index d647b73a65..0f087e6653 100644 --- a/QEfficient/finetune/experimental/core/model.py +++ b/QEfficient/finetune/experimental/core/model.py @@ -4,3 +4,135 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import warnings +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Type + +import torch.nn as nn +import transformers +from transformers import AutoTokenizer + +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.logger import Logger +from QEfficient.finetune.experimental.core.utils.dataset_utils import insert_pad_token + +logger = Logger(__name__) + + +class BaseModel(nn.Module, ABC): + """Shared skeleton for every finetunable model in the system.""" + + def __init__(self, model_name: str, **model_kwargs: Any) -> None: + super().__init__() + self.model_name = model_name + self.model_kwargs: Dict[str, Any] = model_kwargs + self._model: Optional[nn.Module] = None + self._tokenizer: Any = None # HF tokenizers are not nn.Modules. + + # Factory constructor: load model after __init__ finishes + @classmethod + def create(cls, model_name: str, **model_kwargs: Any) -> "BaseModel": + obj = cls(model_name, **model_kwargs) + # load model after __init__ finishes + module = obj.load_model() + if not isinstance(module, nn.Module): + raise TypeError(f"load_model() must return nn.Module, got {type(module)}") + obj._model = module + return obj + + @abstractmethod + def load_model(self) -> nn.Module: + """Load and return the underlying torch.nn.Module.""" + pass + + def load_tokenizer(self) -> Any: + """Override if the model exposes a tokenizer.""" + warnings.warn(f"{type(self).__name__} does not provide a tokenizer.", category=UserWarning) + return None + + # Lazy accessors + @property + def model(self) -> nn.Module: + if self._model is None: + raise RuntimeError("Model not loaded; use .create(...) 
to load.") + return self._model + + @property + def tokenizer(self) -> Any: + if self._tokenizer is None: + self._tokenizer = self.load_tokenizer() + return self._tokenizer + + # nn.Module API surface + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def to(self, *args, **kwargs): + self.model.to(*args, **kwargs) + return self + + def train(self, mode: bool = True): + self.model.train(mode) + return super().train(mode) + + def eval(self): + return self.train(False) + + +@registry.model("hf") +class HFModel(BaseModel): + """HuggingFace-backed model with optional quantization.""" + + def __init__( + self, + model_name: str, + auto_class_name: str = "AutoModelForCausalLM", + *, + tokenizer_name: Optional[str] = None, + **model_kwargs: Any, + ) -> None: + super().__init__(model_name, **model_kwargs) + self.tokenizer_name = tokenizer_name or model_name + self.auto_class: Type = self._resolve_auto_class(auto_class_name) + + @staticmethod + def _resolve_auto_class(auto_class_name: str) -> Type: + if not hasattr(transformers, auto_class_name): + candidates = sorted(name for name in dir(transformers) if name.startswith("AutoModel")) + raise ValueError( + f"Unsupported Auto class '{auto_class_name}'. Available candidates: {', '.join(candidates)}" + ) + return getattr(transformers, auto_class_name) + + # def _build_quant_config(self) -> Optional[BitsAndBytesConfig]: + # if not self.model_kwargs.get("load_in_4bit"): + # return None + # return BitsAndBytesConfig( + # load_in_4bit=True, + # bnb_4bit_quant_type=self.model_kwargs.get("bnb_4bit_quant_type", "nf4"), + # bnb_4bit_compute_dtype=self.model_kwargs.get("bnb_4bit_compute_dtype", torch.float16), + # bnb_4bit_use_double_quant=self.model_kwargs.get("bnb_4bit_use_double_quant", True), + # ) + + def configure_model_kwargs(self) -> Dict[str, Any]: + """Hook for subclasses to tweak HF `.from_pretrained` kwargs.""" + + extra = dict(self.model_kwargs) + # extra["quantization_config"] = self._build_quant_config() + return extra + + def load_model(self) -> nn.Module: + logger.log_rank_zero(f"Loading HuggingFace model '{self.model_name}' via {self.auto_class.__name__}") + + return self.auto_class.from_pretrained( + self.model_name, + **self.configure_model_kwargs(), + ) + + def load_tokenizer(self) -> AutoTokenizer: + """Load Hugging Face tokenizer.""" + logger.log_rank_zero(f"Loading tokenizer '{self.tokenizer_name}'") + tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name) + insert_pad_token(tokenizer) + return tokenizer diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py index d647b73a65..d4f82cbebb 100644 --- a/QEfficient/finetune/experimental/core/optimizer.py +++ b/QEfficient/finetune/experimental/core/optimizer.py @@ -4,3 +4,28 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +""" +Optimizer components for the training system. +""" + +import torch.optim as optim + +from QEfficient.finetune.experimental.core.component_registry import registry + +registry.optimizer("Adam")(optim.Adam) +registry.optimizer("AdamW")(optim.AdamW) +registry.optimizer("SGD")(optim.SGD) + + +def prepare_optimizer(opt_config): + """ + Create optimizer from config. + Args: opt_config: Dictionary containing optimizer configuration. + Returns: Tuple of optimizer class and its arguments. 
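+    Example (illustrative sketch; "AdamW" is one of the names registered above,
+    and a string "lr" such as one read from YAML is converted to float here):
+        opt_cls, opt_kwargs = prepare_optimizer({"optimizer_name": "AdamW", "lr": "5e-5"})
+        optimizer = opt_cls(model.parameters(), **opt_kwargs)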
+ """ + opt_name = opt_config.pop("optimizer_name") + opt_cls = registry.get_optimizer(opt_name) + opt_config["lr"] = float(opt_config["lr"]) + optimizer_cls_and_kwargs = (opt_cls, opt_config) + return optimizer_cls_and_kwargs diff --git a/QEfficient/finetune/experimental/core/trainer/base_trainer.py b/QEfficient/finetune/experimental/core/trainer/base_trainer.py index d647b73a65..0a3c50f7f1 100644 --- a/QEfficient/finetune/experimental/core/trainer/base_trainer.py +++ b/QEfficient/finetune/experimental/core/trainer/base_trainer.py @@ -4,3 +4,76 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +from typing import Optional + +from peft import get_peft_model +from transformers import Trainer, TrainingArguments + +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.config_manager import PeftConfig + + +@registry.trainer_module(name="base", args_cls=TrainingArguments, required_kwargs={"peft_config": PeftConfig}) +class BaseTrainer(Trainer): + """ + Extended Trainer class that supports PEFT (Parameter-Efficient Fine-Tuning). + + This trainer extends the standard HuggingFace Trainer to optionally apply + PEFT configurations to the model before training. + """ + + def __init__( + self, + model=None, + args=None, + data_collator=None, + train_dataset=None, + eval_dataset=None, + processing_class=None, + model_init=None, + compute_metrics=None, + callbacks=None, + optimizers=(None, None), + preprocess_logits_for_metrics=None, + peft_config: Optional[PeftConfig] = None, + **kwargs, + ): + """ + Initialize the BaseTrainer with optional PEFT support. + + Args: + model: The model to train + args: Training arguments + data_collator: Data collator for batching + train_dataset: Training dataset + eval_dataset: Evaluation dataset + processing_class: Tokenizer or processor + model_init: Function to initialize model + compute_metrics: Function to compute metrics + callbacks: List of callbacks + optimizers: Tuple of (optimizer, scheduler) + preprocess_logits_for_metrics: Function to preprocess logits + peft_config: Optional PEFT configuration. If provided, the model will be + wrapped with PEFT before training. 
+ **kwargs: Additional keyword arguments + """ + # Apply PEFT to model if peft_config is provided + if peft_config is not None and model is not None: + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + # Initialize the parent Trainer class + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + **kwargs, + ) diff --git a/QEfficient/finetune/experimental/core/trainer/sft_trainer.py b/QEfficient/finetune/experimental/core/trainer/sft_trainer.py index d647b73a65..3223c5966b 100644 --- a/QEfficient/finetune/experimental/core/trainer/sft_trainer.py +++ b/QEfficient/finetune/experimental/core/trainer/sft_trainer.py @@ -4,3 +4,12 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +from trl import SFTConfig, SFTTrainer + +from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.config_manager import PeftConfig + + +@registry.trainer_module(name="sft", args_cls=SFTConfig, required_kwargs={"peft_config": PeftConfig}) +class SFTTrainerModule(SFTTrainer): + pass # Just using the standard SFTTrainer diff --git a/QEfficient/finetune/experimental/core/utils/dataset_utils.py b/QEfficient/finetune/experimental/core/utils/dataset_utils.py index d647b73a65..11e2fecfc3 100644 --- a/QEfficient/finetune/experimental/core/utils/dataset_utils.py +++ b/QEfficient/finetune/experimental/core/utils/dataset_utils.py @@ -4,3 +4,28 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +def insert_pad_token(tokenizer): + # Add pad token if it doesn't exist + if tokenizer.pad_token is None: + # Try to use existing special token as pad token + if tokenizer.eos_token is not None: + tokenizer.pad_token = tokenizer.eos_token + elif tokenizer.bos_token is not None: + tokenizer.pad_token = tokenizer.bos_token + elif tokenizer.sep_token is not None: + tokenizer.pad_token = tokenizer.sep_token + else: + # Add a new pad token + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + +def apply_train_test_split(dataset, split_ratio, split, seed): + """ + Apply train/test split to the dataset based on split_ratio. 
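+
+    Example (illustrative; the same seed yields complementary train/test views):
+        train_ds = apply_train_test_split(ds, split_ratio=0.8, split="train", seed=42)
+        test_ds = apply_train_test_split(ds, split_ratio=0.8, split="test", seed=42)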
+    """
+    split_datasets = dataset.train_test_split(test_size=(1 - split_ratio), seed=seed)
+    if split == "test":
+        dataset = split_datasets["test"]
+    else:
+        dataset = split_datasets["train"]
+    return dataset
diff --git a/QEfficient/finetune/experimental/core/utils/profiler_utils.py b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
index d647b73a65..e24508e831 100644
--- a/QEfficient/finetune/experimental/core/utils/profiler_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/profiler_utils.py
@@ -4,3 +4,91 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+
+from contextlib import nullcontext
+from typing import ContextManager
+
+import torch
+
+
+def get_op_verifier_ctx(
+    use_op_by_op_verifier: bool,
+    device_type: str,
+    dump_dir: str,
+    step: int,
+    ref_device: str = "cpu",
+    ref_dtype: torch.dtype = torch.float32,
+    atol: float = 1e-1,
+    rtol: float = 1e-5,
+    use_ref_output_on_mismatch: bool = True,
+) -> ContextManager:
+    """Get the op-by-op verifier context manager when op-by-op verification is
+    enabled. It helps in debugging operator-related issues by matching the
+    operator execution on qaic against cpu. Meant only for the qaic backend.
+
+    Args:
+        use_op_by_op_verifier (bool): Boolean flag to enable op-by-op verifier.
+        device_type (str): Device on which the model is being executed.
+        dump_dir (str): Directory to dump the op-by-op verification results.
+        step (int): Step number for which the op-by-op verification is to be performed.
+        ref_device (str, optional): Device to use as reference for verification.
+            Defaults to "cpu".
+        ref_dtype (torch.dtype, optional): Data type to use as reference
+            datatype for verification. Defaults to torch.float32.
+        atol (float, optional): Absolute tolerance to match the results. Defaults to 1e-1.
+        rtol (float, optional): Relative tolerance to match the results. Defaults to 1e-5.
+        use_ref_output_on_mismatch (bool, optional): If an operator has a
+            mismatch with respect to the reference device, use the reference
+            device outputs and continue rest of the verification. Defaults to True.
+
+    Returns:
+        ContextManager: Instance of context manager used to verify the operators.
+    """
+    if (not use_op_by_op_verifier) or ("qaic" not in device_type):
+        return nullcontext()
+
+    # Lazily import qaic_debug only when it is actually needed.
+    import torch_qaic.debug as qaic_debug
+
+    filter_config = qaic_debug.DispatchFilterConfig.default(device_type)
+    dump_dir = dump_dir + "/mismatches/step_" + str(step)
+    return qaic_debug.OpByOpVerifierMode(
+        ref_device=ref_device,
+        ref_dtype=ref_dtype,
+        atol=atol,
+        rtol=rtol,
+        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
+        filter_config=filter_config,
+        dump_root_dir=dump_dir,
+    )
+
+
+def init_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Initialize the qaic profiling tool. Note: The profiler only works
+    for the qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+    """
+    if use_profiler and ("qaic" in device_type):
+        # Lazily import qaic_profile only when it is actually needed.
+        import torch_qaic.profile as qaic_profile
+
+        qaic_profile.start_profiling(device_type, 1)
+
+
+def stop_qaic_profiling(use_profiler: bool, device_type: str) -> None:
+    """Stop the qaic profiling tool. Note: The profiler only works
+    for the qaic backend.
+
+    Args:
+        use_profiler (bool): Boolean flag to enable profiler.
+        device_type (str): Device on which the model is being executed.
+    """
+    if use_profiler and ("qaic" in device_type):
+        # Lazily import qaic_profile only when it is actually needed.
+        import torch_qaic.profile as qaic_profile
+
+        qaic_profile.stop_profiling(device_type)
diff --git a/QEfficient/finetune/experimental/tests/test_callback.py b/QEfficient/finetune/experimental/tests/test_callback.py
new file mode 100644
index 0000000000..59ff4d1173
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_callback.py
@@ -0,0 +1,63 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import pytest
+from transformers import TrainerCallback
+
+from QEfficient.finetune.experimental.core.callbacks import create_callbacks
+from QEfficient.finetune.experimental.core.component_registry import registry
+
+
+class ModelSummaryCallback(TrainerCallback):
+    def __init__(self):
+        pass
+
+
+# Setup test data
+CALLBACK_CONFIGS = {
+    "early_stopping": {
+        "name": "early_stopping",
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.001,
+    },
+    "tensorboard": {"name": "tensorboard", "tb_writer": "SummaryWriter"},
+    "model_summary": {
+        "name": "model_summary",
+        "max_depth": 1,
+    },
+}
+
+REGISTRY_CALLBACK_CONFIGS = {
+    "model_summary": {
+        "name": "model_summary",
+        "max_depth": 1,
+        "callback_class": ModelSummaryCallback,
+    },
+}
+
+
+@pytest.mark.parametrize("callback_name", CALLBACK_CONFIGS.keys())
+def test_callbacks(callback_name):
+    """Test that registered callbacks can be created with their configs."""
+    # Create callbacks using the factory
+    config = CALLBACK_CONFIGS[callback_name]
+    try:
+        callback_inst = create_callbacks(**config)
+    except ValueError as e:
+        assert "Unknown callback" in str(e)
+        return
+    assert callback_inst is not None
+    assert isinstance(callback_inst, TrainerCallback)
+
+
+@pytest.mark.parametrize(
+    "callback_name,callback_class",
+    [(name, cfg["callback_class"]) for name, cfg in REGISTRY_CALLBACK_CONFIGS.items()],
+)
+def test_callbacks_registry(callback_name, callback_class):
+    """Test that a callback is registered and retrieved correctly."""
+    registry.callback(callback_name)(callback_class)
+    callback = registry.get_callback(callback_name)
+    assert callback is not None
+    assert callback == callback_class
diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml
new file mode 100644
index 0000000000..e97e99d583
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_config.yaml
@@ -0,0 +1,104 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+# model configuration
+model:
+  model_type: "hf"
+  auto_class_name: "AutoModelForCausalLM"
+  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
+  load_in_4bit: false
+  use_peft: true
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0.1
+    target_modules: ["q_proj", "v_proj"]
+    bias: "none"
+    task_type: "CAUSAL_LM"
+    peft_type: "LORA"
+
+# Dataset configuration
+dataset:
+  tokenizer_name: "HuggingFaceTB/SmolLM-135M"
+  dataset_type: "seq_completion"
+  # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M"
+  dataset_name: "knkarthick/samsum"
+  train_split: "train"
+  max_seq_length: 512
+  split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided
+  test_split: "test"
+  group_by_length: True
+  num_workers: 4
+  dataloader_pin_memory: True
+  dataloader_persistent_workers: True
+  dataloader_prefetch_factor: 1
+  dataloader_drop_last: False
+
+# Training configuration
+training:
+  type: "sft"
+  output_dir: "./training_results"
+  overwrite_output_dir: False
+  seed: 42
+  device: "qaic"
+  do_eval: True
+  eval_strategy: "epoch"
+  eval_steps: 100
+
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  gradient_accumulation_steps: 1
+  num_train_epochs: 1
+  max_steps: -1
+
+  log_level: "info"
+  log_on_each_node: True
+  logging_strategy: "steps"
+  logging_steps: 10
+
+  save_strategy: "epoch"
+  save_total_limit: 5
+  metric_for_best_model: "eval_loss"
+
+  dtype: "fp16"
+  completion_only_loss: True
+  report_to: "trackio"
+
+  ddp_config:
+    ddp_backend: "qccl"
+    ddp_find_unused_parameters: False
+    ddp_bucket_cap_mb: 25
+    ddp_broadcast_buffers: null
+    ddp_timeout: 1800
+
+  use_cpu: False
+
+  gradient_checkpointing: False
+  gradient_checkpointing_kwargs:
+    preserve_rng_state: True
+    use_reentrant: False
+
+  torch_compile: True
+  include_num_input_tokens_seen: True
+  average_tokens_across_devices: True
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "AdamW"
+  lr: 5e-5
+  weight_decay: 0.01
+
+scheduler:
+  scheduler_name: "cosine"
+  warmup_steps: 100 # warmup_steps or warmup_ratio
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3
+    early_stopping_threshold: 0.001
+  tensorboard:
+
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
new file mode 100644
index 0000000000..fd2abfd482
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -0,0 +1,62 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+from pathlib import Path
+
+import pytest
+
+from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments
+
+
+@pytest.fixture
+def config_path() -> Path:
+    here = Path(__file__).resolve().parent
+    return (here / "test_config.yaml").resolve()
+
+
+def test_config(config_path):
+    master_config = parse_arguments(args=[])
+    config_manager = ConfigManager(master_config)
+    assert isinstance(config_manager, ConfigManager)
+    config_manager.load_config(config_path)
+    try:
+        config_manager.validate_config()
+    except Exception as e:
+        pytest.fail(f"Config validation failed with error: {e}")
+
+    # Test that all required fields are present
+    missing = [
+        a
+        for a in ("model", "dataset", "optimizers", "scheduler", "callbacks", "training")
+        if not hasattr(config_manager, a)
+    ]
+    assert not missing, f"Missing attributes: {missing}"
+    trainer_config = config_manager.get_training_config()
+    assert trainer_config is not None
+    assert isinstance(trainer_config, dict)
+    assert all(key in trainer_config for key in ("output_dir", "per_device_train_batch_size", "num_train_epochs", "ddp_config"))
+    dataset_config = config_manager.get_dataset_config()
+    assert dataset_config is not None
+    assert isinstance(dataset_config, dict)
+    assert all(key in dataset_config for key in ("dataset_type", "dataset_name", "tokenizer_name"))
+    model_config = config_manager.get_model_config()
+    assert model_config is not None
+    assert isinstance(model_config, dict)
+    assert all(key in model_config for key in ("model_type", "model_name", "use_peft", "peft_config"))
+    scheduler_config = config_manager.get_scheduler_config()
+    assert scheduler_config is not None
+    assert isinstance(scheduler_config, dict)
+    assert all(key in scheduler_config for key in ("scheduler_name",))
+    callback_config = config_manager.get_callback_config()
+    assert callback_config is not None
+    assert isinstance(callback_config, dict)
+    assert all(key in callback_config for key in ("early_stopping",))
+    optimizer_config = config_manager.get_optimizer_config()
+    assert optimizer_config is not None
+    assert isinstance(optimizer_config, dict)
+    assert all(key in optimizer_config for key in ("optimizer_name", "lr"))
diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py
new file mode 100644
index 0000000000..ca2fc14505
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_dataset.py
@@ -0,0 +1,528 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+Tests for dataset components.
+""" + +import json +import os +import tempfile +import unittest +from unittest.mock import MagicMock, patch + +from QEfficient.finetune.experimental.core.dataset import BaseDataset, SFTDataset + +SEED = 42 +SPLIT_RATIO = 0.8 + + +class TestBaseDataset(unittest.TestCase): + """Tests for BaseDataset abstract class.""" + + def test_base_dataset_cannot_be_instantiated(self): + """Test that BaseDataset cannot be instantiated directly.""" + with self.assertRaises(TypeError): + BaseDataset(dataset_name="test", split="train") + + +class TestSFTDataset(unittest.TestCase): + """Tests for SFTDataset class.""" + + def setUp(self): + """Set up test fixtures.""" + # Create a temporary directory for test files + self.test_dir = tempfile.mkdtemp() + self.json_file_path = os.path.join(self.test_dir, "test_dataset.json") + + # Create a dummy JSON dataset + self.dummy_data = [ + {"question": "What is AI?", "answer": "Artificial Intelligence"}, + {"question": "What is ML?", "answer": "Machine Learning"}, + {"question": "What is DL?", "answer": "Deep Learning"}, + {"question": "What is NLP?", "answer": "Natural Language Processing"}, + {"question": "", "answer": "Empty question"}, # Empty question + {"question": "Valid question", "answer": ""}, # Empty answer + {"question": None, "answer": "None question"}, # None question + {"question": "Valid question 2", "answer": None}, # None answer + ] + + with open(self.json_file_path, "w") as f: + json.dump(self.dummy_data, f) + + def tearDown(self): + """Clean up test fixtures.""" + # Remove temporary files and directories + import shutil + + if os.path.exists(self.test_dir): + shutil.rmtree(self.test_dir) + + @patch("QEfficient.finetune.experimental.core.dataset.load_dataset") + @patch("QEfficient.finetune.experimental.core.dataset.load_dataset_builder") + def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder, mock_load): + """Test loading from HuggingFace dataset with templates using mocked data.""" + # Create mock dataset with dummy data + mock_dataset = MagicMock() + mock_dataset.column_names = ["text", "label"] + mock_dataset.num_rows = 3 + + # Mock the select method to return individual samples + def mock_select(indices): + sample_data = [ + {"text": "Sample text 1", "label": "Label 1"}, + {"text": "Sample text 2", "label": "Label 2"}, + {"text": "Sample text 3", "label": "Label 3"}, + ] + return [sample_data[indices[0]]] + + mock_dataset.select = mock_select + mock_dataset.filter = lambda func: mock_dataset # Return self for filtering + + # Mock train_test_split to return a dict with train/test splits + mock_split_result = {"train": mock_dataset, "test": mock_dataset} + mock_dataset.train_test_split = lambda test_size, seed: mock_split_result + + # Mock the dataset builder to indicate multiple splits are available + mock_info = MagicMock() + mock_info.splits = {"train": MagicMock(), "test": MagicMock()} + mock_builder.return_value.info = mock_info + + # Mock load_dataset to return our mock dataset + mock_load.return_value = mock_dataset + + # Create the dataset + dataset = SFTDataset( + dataset_name="dummy_hf_dataset", + split="train", + prompt_template="Text: {text}", + completion_template="Label: {label}", + ) + + self.assertIsNotNone(dataset) + self.assertEqual(len(dataset), 3) + + # Test __getitem__ + sample = dataset[0] + self.assertIn("prompt", sample) + self.assertIn("completion", sample) + self.assertTrue(sample["prompt"].startswith("Text:")) + self.assertTrue(sample["completion"].startswith("Label:")) + + def 
test_sft_dataset_with_json_file_and_templates(self): + """Test loading from JSON file with templates.""" + dataset = SFTDataset( + dataset_name="dummy", # Ignored when json_file_path is provided + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + ) + + self.assertIsNotNone(dataset) + # After filtering empty/None values and applying train split (default 0.8) + # we get a subset of the 4 valid samples + self.assertGreater(len(dataset), 0) + self.assertLessEqual(len(dataset), 4) + + # Test __getitem__ + sample = dataset[0] + self.assertIn("prompt", sample) + self.assertIn("completion", sample) + self.assertTrue(sample["prompt"].startswith("Q:")) + self.assertTrue(sample["completion"].startswith("A:")) + + def test_sft_dataset_json_file_without_filtering(self): + """Test loading from JSON file without filtering empty samples.""" + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + remove_samples_with_empty_columns=False, + ) + + # When filtering is disabled and split="train" is used, it still applies train/test split + # So we get ~80% of 8 samples = ~6 samples + self.assertGreater(len(dataset), 0) + self.assertLessEqual(len(dataset), 8) + + def test_sft_dataset_train_test_split_from_json(self): + """Test train/test split when loading from JSON file.""" + train_dataset = SFTDataset( + dataset_name="dummy", + split="train", + split_ratio=SPLIT_RATIO, + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + seed=SEED, + ) + + test_dataset = SFTDataset( + dataset_name="dummy", + split="test", + split_ratio=SPLIT_RATIO, + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + seed=SEED, + ) + + # After filtering, we have 4 valid samples + # With split ratio, train should have ~3 samples, test should have ~1 sample + self.assertGreater(len(train_dataset), 0) + self.assertGreater(len(test_dataset), 0) + # Total should equal the filtered dataset size + self.assertEqual(len(train_dataset) + len(test_dataset), 4) + + def test_sft_dataset_with_custom_prompt_function(self): + """Test loading with custom prompt function.""" + # Create a temporary module file with custom functions + func_file_path = os.path.join(self.test_dir, "custom_funcs.py") + with open(func_file_path, "w") as f: + f.write(""" +def custom_prompt(example): + return f"Custom prompt: {example['question']}" + +def custom_completion(example): + return f"Custom completion: {example['answer']}" +""") + + # Add the test directory to sys.path temporarily + import sys + + sys.path.insert(0, self.test_dir) + + try: + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_func="custom_funcs:custom_prompt", + completion_func="custom_funcs:custom_completion", + ) + + self.assertIsNotNone(dataset) + self.assertGreater(len(dataset), 0) + + # Test that custom functions are applied + sample = dataset[0] + self.assertTrue(sample["prompt"].startswith("Custom prompt:")) + self.assertTrue(sample["completion"].startswith("Custom completion:")) + finally: + # Clean up + sys.path.remove(self.test_dir) + if os.path.exists(func_file_path): + os.remove(func_file_path) + + def test_sft_dataset_missing_template_variable(self): + """Test error when template variable is not in dataset columns.""" + with 
self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {nonexistent_column}", + completion_template="A: {answer}", + ) + + self.assertIn("not found in dataset columns", str(context.exception)) + + def test_sft_dataset_missing_completion_template_variable(self): + """Test error when completion template variable is not in dataset columns.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {nonexistent_column}", + ) + + self.assertIn("not found in dataset columns", str(context.exception)) + + def test_sft_dataset_no_prompt_template_or_func(self): + """Test error when neither prompt_template nor prompt_func is provided.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + completion_template="A: {answer}", + ) + + self.assertIn("Either provide prompt_template or prompt_func", str(context.exception)) + + def test_sft_dataset_both_prompt_template_and_func(self): + """Test error when both prompt_template and prompt_func are provided.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + prompt_func="module:function", + completion_template="A: {answer}", + ) + + self.assertIn("Either provide prompt_template or prompt_func", str(context.exception)) + + def test_sft_dataset_no_completion_template_or_func(self): + """Test error when neither completion_template nor completion_func is provided.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + ) + + self.assertIn( + "Either provide completion_template or completion_func", + str(context.exception), + ) + + def test_sft_dataset_both_completion_template_and_func(self): + """Test error when both completion_template and completion_func are provided.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + completion_func="module:function", + ) + + self.assertIn( + "Either provide completion_template or completion_func", + str(context.exception), + ) + + def test_sft_dataset_invalid_func_path_format(self): + """Test error when func_path doesn't contain colon separator.""" + with self.assertRaises(ValueError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_func="invalid_format", + completion_template="A: {answer}", + ) + + self.assertIn("must be in the format", str(context.exception)) + + def test_sft_dataset_invalid_module_import(self): + """Test error when module cannot be imported.""" + with self.assertRaises(RuntimeError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_func="nonexistent_module:function", + completion_template="A: {answer}", + ) + + self.assertIn("Unable to import module", str(context.exception)) + + def test_sft_dataset_invalid_function_name(self): + """Test error when function doesn't exist in module.""" + # Create a temporary module file 
without the expected function + func_file_path = os.path.join(self.test_dir, "test_module.py") + with open(func_file_path, "w") as f: + f.write("def some_other_function():\n pass\n") + + import sys + + sys.path.insert(0, self.test_dir) + + try: + with self.assertRaises(ValueError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_func="test_module:nonexistent_function", + completion_template="A: {answer}", + ) + + self.assertIn("not found in module", str(context.exception)) + finally: + sys.path.remove(self.test_dir) + if os.path.exists(func_file_path): + os.remove(func_file_path) + + def test_sft_dataset_filter_empty_or_none_samples(self): + """Test filtering of samples with empty or None values.""" + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + remove_samples_with_empty_columns=True, + ) + + # Verify that all samples have valid (non-empty) questions and answers + for i in range(len(dataset)): + sample = dataset[i] + # Extract the actual question and answer from the formatted strings + question = sample["prompt"].replace("Q: ", "").strip() + answer = sample["completion"].replace("A: ", "").strip() + # Verify neither is empty + self.assertTrue(len(question) > 0, f"Question should not be empty: {sample['prompt']}") + self.assertTrue(len(answer) > 0, f"Answer should not be empty: {sample['completion']}") + + def test_sft_dataset_getitem_returns_correct_format(self): + """Test that __getitem__ returns the correct format.""" + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + ) + + sample = dataset[0] + + # Check that sample is a dictionary + self.assertIsInstance(sample, dict) + + # Check that it has the required keys + self.assertIn("prompt", sample) + self.assertIn("completion", sample) + + # Check that values are strings + self.assertIsInstance(sample["prompt"], str) + self.assertIsInstance(sample["completion"], str) + + def test_sft_dataset_len(self): + """Test __len__ method.""" + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + ) + + # Check that len returns an integer + self.assertIsInstance(len(dataset), int) + + # Check that len is positive + self.assertGreater(len(dataset), 0) + + # Check that we can iterate through all samples + for i in range(len(dataset)): + sample = dataset[i] + self.assertIsNotNone(sample) + + def test_sft_dataset_with_multiple_template_variables(self): + """Test templates with multiple variables.""" + # Create a more complex JSON dataset + complex_data = [ + {"context": "The sky", "question": "What color?", "answer": "Blue"}, + {"context": "Math", "question": "What is 2+2?", "answer": "4"}, + ] + + complex_json_path = os.path.join(self.test_dir, "complex_dataset.json") + with open(complex_json_path, "w") as f: + json.dump(complex_data, f) + + try: + dataset = SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=complex_json_path, + prompt_template="Context: {context}\nQuestion: {question}", + completion_template="Answer: {answer}", + ) + + # With split="train", it applies train/test split, so we get ~80% of 2 samples + self.assertGreater(len(dataset), 0) + self.assertLessEqual(len(dataset), 2) + + sample = 
dataset[0] + self.assertIn("Context:", sample["prompt"]) + self.assertIn("Question:", sample["prompt"]) + self.assertIn("Answer:", sample["completion"]) + finally: + if os.path.exists(complex_json_path): + os.remove(complex_json_path) + + def test_sft_dataset_seed_reproducibility(self): + """Test that using the same seed produces the same split.""" + dataset1 = SFTDataset( + dataset_name="dummy", + split="train", + split_ratio=SPLIT_RATIO, + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + seed=SEED, + ) + + dataset2 = SFTDataset( + dataset_name="dummy", + split="train", + split_ratio=SPLIT_RATIO, + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + seed=SEED, + ) + + # Both datasets should have the same length + self.assertEqual(len(dataset1), len(dataset2)) + + # Both datasets should have the same samples + for i in range(len(dataset1)): + sample1 = dataset1[i] + sample2 = dataset2[i] + self.assertEqual(sample1["prompt"], sample2["prompt"]) + self.assertEqual(sample1["completion"], sample2["completion"]) + + @patch("QEfficient.finetune.experimental.core.dataset.load_dataset") + @patch("QEfficient.finetune.experimental.core.dataset.load_dataset_builder") + def test_sft_dataset_invalid_split(self, mock_builder, mock_load): + """Test error when requesting an invalid split.""" + # Mock the dataset builder to return specific splits + mock_info = MagicMock() + mock_info.splits = {"train": MagicMock(), "validation": MagicMock()} + mock_builder.return_value.info = mock_info + + with self.assertRaises(ValueError) as context: + SFTDataset( + dataset_name="dummy_dataset", + split="nonexistent_split", + prompt_template="Q: {question}", + completion_template="A: {answer}", + ) + + self.assertIn("not available", str(context.exception)) + + def test_sft_dataset_invalid_json_path(self): + """Test error when an invalid JSON file path is provided.""" + invalid_path = "/path/to/nonexistent/file.json" + + with self.assertRaises(FileNotFoundError) as context: + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=invalid_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + ) + + self.assertIn("JSON file not found or invalid", str(context.exception)) + self.assertIn(invalid_path, str(context.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/QEfficient/finetune/experimental/tests/test_model.py b/QEfficient/finetune/experimental/tests/test_model.py new file mode 100644 index 0000000000..e83abf3898 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_model.py @@ -0,0 +1,136 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from unittest import mock + +import pytest +import torch +import torch.nn as nn + +from QEfficient.finetune.experimental.core import model +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory, registry +from QEfficient.finetune.experimental.core.model import BaseModel + + +class TestMockModel(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + + +@registry.model("testcustom") +class TestCustomModel(BaseModel): + def __init__(self, model_name): + super().__init__(model_name) + print("init of custom class") + + def load_model(self) -> nn.Module: + return TestMockModel() + + def load_tokenizer(self): + return "dummy-tokenizer" + + +# BaseModel tests +def test_model_property_errors_if_not_created(): + m = TestCustomModel("dummy") + with pytest.raises(RuntimeError): + _ = m.model # must call .create() + + +def test_create_builds_and_registers(): + m = ComponentFactory.create_model("testcustom", "dummy") + # inner model exists and registered + assert "_model" in m._modules + assert isinstance(m.model, TestMockModel) + # forward works + out = m(torch.zeros(1, 2)) + assert out.shape == (1, 2) + + +def test_tokenizer_lazy_loading(): + m = ComponentFactory.create_model("testcustom", "dummy") + assert m._tokenizer is None + tok = m.tokenizer + assert tok == "dummy-tokenizer" + assert m._tokenizer == tok + + +def test_to_moves_inner_and_returns_self(): + m = ComponentFactory.create_model("testcustom", "dummy") + with mock.patch.object(TestMockModel, "to", autospec=True) as mocked_to: + ret = m.to("cpu:0") + assert mocked_to.call_args[0][0] is m.model + assert mocked_to.call_args[0][1] == "cpu:0" + assert ret is m + + +def test_train_eval_sync_flags(): + m = ComponentFactory.create_model("testcustom", "dummy") + m.eval() + assert m.training is False + assert m.model.training is False + m.train() + assert m.training is True + assert m.model.training is True + + +def test_state_dict_contains_inner_params(): + m = ComponentFactory.create_model("testcustom", "dummy") + sd = m.state_dict() + # should contain params from TestMockModel.linear + assert any("linear.weight" in k for k in sd) + assert any("linear.bias" in k for k in sd) + + +# HFModel tests +def test_hfmodel_invalid_auto_class_raises(): + with pytest.raises(ValueError): + ComponentFactory.create_model("hf", "hf-name", auto_class_name="AutoDoesNotExist") + + +def test_hfmodel_loads_auto_and_tokenizer(monkeypatch): + # fake HF Auto class + class FakeAuto(nn.Module): + @classmethod + def from_pretrained(cls, name, **kwargs): + inst = cls() + inst.loaded = (name, kwargs) + return inst + + def forward(self, x): + return x + + fake_tok = mock.Mock() + + # Monkeypatch transformer classes used in HFModel + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.transformers.AutoModelForCausalLM", + FakeAuto, + raising=False, + ) + monkeypatch.setattr( + model, + "AutoTokenizer", + mock.Mock(from_pretrained=mock.Mock(return_value=fake_tok)), + ) + monkeypatch.setattr( + "QEfficient.finetune.experimental.core.model.insert_pad_token", + mock.Mock(), + raising=False, + ) + m = ComponentFactory.create_model("hf", "hf-name") + assert isinstance(m.model, FakeAuto) + + # load tokenizer + tok = m.load_tokenizer() + + assert hasattr(tok, "pad_token_id") + assert m.model.loaded[0] == "hf-name" diff --git 
a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py
new file mode 100644
index 0000000000..e105d5ddf9
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_optimizer.py
@@ -0,0 +1,96 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import copy
+
+import pytest
+import torch.nn as nn
+import torch.optim as optim
+
+from QEfficient.finetune.experimental.core.component_registry import registry
+from QEfficient.finetune.experimental.core.optimizer import prepare_optimizer
+
+OPTIMIZER_CONFIGS = {
+    "Adam": {
+        "optimizer_name": "Adam",
+        "opt_cls": optim.Adam,
+        "lr": 1e-4,
+        "weight_decay": 0.01,
+        "betas": (0.9, 0.999),
+        "eps": 1e-8,
+        "amsgrad": False,
+    },
+    "AdamW": {
+        "optimizer_name": "AdamW",
+        "opt_cls": optim.AdamW,
+        "lr": 1e-4,
+        "weight_decay": 0.01,
+        "betas": (0.9, 0.999),
+        "eps": 1e-8,
+        "amsgrad": False,
+    },
+    "SGD": {
+        "optimizer_name": "SGD",
+        "opt_cls": optim.SGD,
+        "lr": 1e-4,
+        "momentum": 0.9,
+        "weight_decay": 0.01,
+        "dampening": 0.0,
+        "nesterov": False,
+    },
+    "RMSprop": {
+        "optimizer_name": "RMSprop",
+        "opt_cls": optim.RMSprop,
+    },
+}
+
+REGISTRY_CONFIG = {
+    "RMSprop": {
+        "optimizer_name": "RMSprop",
+        "opt_cls": optim.RMSprop,
+    },
+}
+
+
+@pytest.fixture
+def dummy_model():
+    return nn.Sequential(
+        nn.Linear(10, 5),
+        nn.ReLU(),
+        nn.Linear(5, 1),
+    )
+
+
+@pytest.mark.parametrize("opt_name", OPTIMIZER_CONFIGS.keys())
+def test_optimizers(opt_name, dummy_model):
+    """Test that all registered optimizers can be created with their configs."""
+    config = copy.deepcopy(OPTIMIZER_CONFIGS[opt_name])
+
+    config.pop("opt_cls")
+    try:
+        optimizer_class_and_kwargs = prepare_optimizer(config)
+        assert optimizer_class_and_kwargs is not None
+    except ValueError as e:
+        assert "Unknown optimizer" in str(e)
+        return
+    optimizer_class = optimizer_class_and_kwargs[0]
+    opt_inst = optimizer_class(dummy_model.parameters(), **optimizer_class_and_kwargs[1])
+    assert isinstance(opt_inst, optim.Optimizer)
+    assert len(list(opt_inst.param_groups)) == 1
+
+    for key in ["lr", "weight_decay", "betas", "eps", "momentum", "dampening", "nesterov", "amsgrad"]:
+        if key in config:
+            assert opt_inst.param_groups[0][key] == config[key], f"{key} mismatch"
+
+
+@pytest.mark.parametrize(
+    "opt_name, opt_cls",
+    [(name, cfg["opt_cls"]) for name, cfg in REGISTRY_CONFIG.items()],
+)
+def test_registered_optimizer(opt_name, opt_cls):
+    """Test that the optimizer is registered correctly."""
+    registry.optimizer(opt_name)(opt_cls)
+    optimizer_class = registry.get_optimizer(opt_name)
+    assert optimizer_class is not None
+    assert optimizer_class == opt_cls
diff --git a/QEfficient/finetune/experimental/tests/test_registry.py b/QEfficient/finetune/experimental/tests/test_registry.py
new file mode 100644
index 0000000000..3e10aa8208
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/test_registry.py
@@ -0,0 +1,167 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import pytest + +from QEfficient.finetune.experimental.core.component_registry import ComponentRegistry, get_object, registry + + +class TestComponentRegistry: + @pytest.fixture(autouse=True) + def setUp(self): + """Set up test fixtures before each test method.""" + self.registry = ComponentRegistry() + + @pytest.mark.parametrize( + "register_method, get_method, object_name", + [ + ("trainer_module", "get_trainer_module", "test_trainer"), + ("optimizer", "get_optimizer", "test_optimizer"), + ("scheduler", "get_scheduler", "test_scheduler"), + ("dataset", "get_dataset", "test_dataset"), + ("model", "get_model", "test_model"), + ("data_collator", "get_data_collator", "test_collator"), + ("loss_function", "get_loss_function", "test_loss"), + ("callback", "get_callback", "test_callback"), + ], + ) + def test_object_success(self, register_method: str, get_method: str, object_name: str): + """Test object registration decorator.""" + + class MockObject: + pass + + # Register with decorator + getattr(self.registry, register_method)(object_name)(MockObject) + + # Verify registration + retrieved = getattr(self.registry, get_method)(object_name) + if register_method == "trainer_module": + retrieved = retrieved["trainer_cls"] + assert retrieved == MockObject + + @pytest.mark.parametrize( + "object_type, get_method", + [ + ("trainer module", "get_trainer_module"), + ("optimizer", "get_optimizer"), + ("scheduler", "get_scheduler"), + ("dataset", "get_dataset"), + ("model", "get_model"), + ("data collator", "get_data_collator"), + ("loss function", "get_loss_function"), + ("callback", "get_callback"), + ], + ) + def test_object_failure(self, object_type: str, get_method: str, object_name: str = "non_existent"): + """Test failure when retrieving non-existent object.""" + with pytest.raises(ValueError) as exc_info: + getattr(self.registry, get_method)(object_name) + + assert f"Unknown {object_type}" in str(exc_info.value) + + def test_init_empty_registries(self): + """Test that all registries are initialized as empty dictionaries.""" + assert len(self.registry._optimizers) == 0 + assert len(self.registry._schedulers) == 0 + assert len(self.registry._datasets) == 0 + assert len(self.registry._models) == 0 + assert len(self.registry._data_collators) == 0 + assert len(self.registry._metrics) == 0 + assert len(self.registry._loss_functions) == 0 + assert len(self.registry._callbacks) == 0 + assert len(self.registry._hooks) == 0 + assert len(self.registry._trainer_modules) == 0 + + def test_trainer_module_with_args_and_kwargs(self): + """Test trainer module registration with args class and required kwargs.""" + + class MockArgs: + pass + + class MockTrainer: + pass + + # Register with decorator including args class and required kwargs + self.registry.trainer_module( + "test_trainer_with_args", args_cls=MockArgs, required_kwargs={"param1": "default1", "param2": "default2"} + )(MockTrainer) + + # Verify registration details + module_info = self.registry.get_trainer_module("test_trainer_with_args") + assert module_info["trainer_cls"] == MockTrainer + assert module_info["args_cls"] == MockArgs + assert module_info["required_kwargs"] == {"param1": "default1", "param2": "default2"} + + def test_list_methods(self): + """Test all list methods return correct keys.""" + + # Register some dummy items + class DummyClass: + pass + + self.registry.optimizer("opt1")(DummyClass) + 
self.registry.scheduler("sched1")(DummyClass) + self.registry.dataset("ds1")(DummyClass) + self.registry.model("model1")(DummyClass) + self.registry.data_collator("coll1")(lambda x: x) + self.registry.loss_function("loss1")(DummyClass) + self.registry.callback("cb1")(DummyClass) + self.registry.trainer_module("tm1")(DummyClass) + + # Test lists + assert self.registry.list_optimizers() == ["opt1"] + assert self.registry.list_schedulers() == ["sched1"] + assert self.registry.list_datasets() == ["ds1"] + assert self.registry.list_models() == ["model1"] + assert self.registry.list_data_collators() == ["coll1"] + assert self.registry.list_loss_functions() == ["loss1"] + assert self.registry.list_callbacks() == ["cb1"] + assert self.registry.list_trainer_modules() == ["tm1"] + + def test_logging_on_registration(self, mocker): + """Test that registration logs messages.""" + mock_logger = mocker.patch("QEfficient.finetune.experimental.core.component_registry.logger") + + class MockClass: + pass + + # Test optimizer registration logging + self.registry.optimizer("test_opt")(MockClass) + mock_logger.info.assert_called_with("Registered optimizer: test_opt") + + # Reset mock + mock_logger.reset_mock() + + # Test trainer module registration logging + self.registry.trainer_module("test_tm")(MockClass) + mock_logger.info.assert_called_with("Registered trainer module: test_tm") + + +class TestGetObjectFunction: + def test_get_object_success(self): + """Test get_object function success case.""" + test_dict = {"key1": "value1", "key2": "value2"} + + result = get_object(test_dict, "key1", "test_type", lambda: ["key1", "key2"]) + assert result == "value1" + + def test_get_object_failure(self): + """Test get_object function failure case.""" + test_dict = {"key1": "value1"} + + with pytest.raises(ValueError) as exc_info: + get_object(test_dict, "nonexistent", "test_type", lambda: ["key1", "key2"]) + + assert "Unknown test_type: nonexistent" in str(exc_info.value) + assert "Available: ['key1', 'key2']" in str(exc_info.value) + + +class TestGlobalRegistry: + def test_global_registry_instance(self): + """Test that global registry instance exists and is of correct type.""" + assert isinstance(registry, ComponentRegistry) diff --git a/QEfficient/finetune/experimental/tests/test_trainer.py b/QEfficient/finetune/experimental/tests/test_trainer.py new file mode 100644 index 0000000000..20af61e36c --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_trainer.py @@ -0,0 +1,493 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import os +import shutil + +import pytest +import torch +from datasets import Dataset +from peft import LoraConfig, PeftModel +from transformers import Trainer, TrainingArguments +from trl import SFTConfig, SFTTrainer + +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory, registry +from QEfficient.finetune.experimental.core.model import HFModel # noqa: F401 - needed for registration +from QEfficient.finetune.experimental.core.trainer.base_trainer import BaseTrainer +from QEfficient.finetune.experimental.core.trainer.sft_trainer import ( + SFTTrainerModule, +) + +LORA_R = 8 +LORA_ALPHA = 16 +LORA_DROPOUT = 0.1 +MAX_LENGTH = 128 + + +class TestBaseTrainer: + """Test suite for BaseTrainer class.""" + + def test_base_trainer_registered(self): + """Test that BaseTrainer is registered in the registry.""" + trainer_list = registry.list_trainer_modules() + assert "base" in trainer_list + + def test_base_trainer_info_structure(self): + """Test that BaseTrainer registration has correct structure.""" + trainer_info = registry.get_trainer_module("base") + + assert isinstance(trainer_info, dict) + assert "trainer_cls" in trainer_info + assert "args_cls" in trainer_info + assert "required_kwargs" in trainer_info + + def test_base_trainer_class(self): + """Test that BaseTrainer class is correct.""" + + trainer_info = registry.get_trainer_module("base") + trainer_cls = trainer_info["trainer_cls"] + + # The decorator returns the dict, but BaseTrainer is the original class + assert trainer_cls.__name__ == "BaseTrainer" + assert issubclass(trainer_cls, Trainer) + assert trainer_info["args_cls"] == TrainingArguments + + def test_base_trainer_required_kwargs(self): + """Test that BaseTrainer has peft_config in required_kwargs.""" + trainer_info = registry.get_trainer_module("base") + + assert "peft_config" in trainer_info["required_kwargs"] + assert callable(trainer_info["required_kwargs"]["peft_config"]) + + +class TestSFTTrainerModule: + """Test suite for SFTTrainerModule class.""" + + def test_sft_trainer_registered(self): + """Test that SFTTrainerModule is registered in the registry.""" + trainer_list = registry.list_trainer_modules() + assert "sft" in trainer_list + + def test_sft_trainer_info_structure(self): + """Test that SFTTrainerModule registration has correct structure.""" + trainer_info = registry.get_trainer_module("sft") + + assert isinstance(trainer_info, dict) + assert "trainer_cls" in trainer_info + assert "args_cls" in trainer_info + assert "required_kwargs" in trainer_info + + def test_sft_trainer_class(self): + """Test that SFTTrainerModule class is correct.""" + + trainer_info = registry.get_trainer_module("sft") + trainer_cls = trainer_info["trainer_cls"] + + assert trainer_cls == SFTTrainerModule["trainer_cls"] + assert issubclass(trainer_cls, SFTTrainer) + assert trainer_info["args_cls"] == SFTConfig + + def test_sft_trainer_required_kwargs(self): + """Test that SFTTrainerModule has peft_config in required_kwargs.""" + trainer_info = registry.get_trainer_module("sft") + + assert "peft_config" in trainer_info["required_kwargs"] + assert callable(trainer_info["required_kwargs"]["peft_config"]) + + +class TestTrainerRegistry: + """Test suite for trainer registration in the component registry.""" + + def test_both_trainers_registered(self): + """Test that both base and sft trainers are registered.""" + trainer_list = 
registry.list_trainer_modules() + + assert "base" in trainer_list + assert "sft" in trainer_list + assert len(trainer_list) >= 2 + + def test_registry_returns_dict(self): + """Test that registry returns dict for trainer modules.""" + base_info = registry.get_trainer_module("base") + sft_info = registry.get_trainer_module("sft") + + assert isinstance(base_info, dict) + assert isinstance(sft_info, dict) + + def test_trainer_classes_correct(self): + """Test that trainer classes are correctly stored.""" + base_info = registry.get_trainer_module("base") + sft_info = registry.get_trainer_module("sft") + assert base_info["trainer_cls"] == BaseTrainer["trainer_cls"] + assert sft_info["trainer_cls"] == SFTTrainerModule["trainer_cls"] + + +class TestBaseTrainerWithModel: + """Test suite for BaseTrainer integration with model loading and PEFT.""" + + @pytest.fixture(autouse=True) + def cleanup_output_dirs(self): + """Fixture to clean up test output directories after each test.""" + # Setup: yield control to the test + yield + + # Teardown: clean up output directories + output_dirs = ["./test_output", "./test_output_peft", "./test_output_base", "./test_output_base_peft"] + for output_dir in output_dirs: + if os.path.exists(output_dir): + try: + shutil.rmtree(output_dir) + print(f"\nCleaned up: {output_dir}") + except Exception as e: + print(f"\nWarning: Failed to clean up {output_dir}: {e}") + + @pytest.fixture + def model_config(self): + """Fixture for basic model configuration.""" + return { + "model_name": "HuggingFaceTB/SmolLM-135M", + "auto_class_name": "AutoModelForCausalLM", + "use_cache": False, + "torch_dtype": "float16", + "attn_implementation": "eager", + "device_map": None, + "num_hidden_layers": 1, + } + + @pytest.fixture + def peft_model_config(self): + """Fixture for PEFT configuration.""" + return { + "r": LORA_R, + "lora_alpha": LORA_ALPHA, + "lora_dropout": LORA_DROPOUT, + "target_modules": ["q_proj", "v_proj"], + "bias": "none", + } + + @pytest.fixture + def dummy_dataset(self): + """Fixture for creating a dummy dataset.""" + data = { + "text": [ + "This is a test sentence for training.", + "Another example text for the model.", + "Third sample to ensure proper batching.", + ] + } + return Dataset.from_dict(data) + + def test_base_trainer_instantiation_with_model(self, model_config, dummy_dataset): + """Test that BaseTrainer can be instantiated with a loaded model.""" + # Load model and tokenizer + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + model = hf_model.model + tokenizer = hf_model.tokenizer + + # Create training config + training_args = TrainingArguments( + output_dir="./test_output_base", + per_device_train_batch_size=1, + num_train_epochs=1, + logging_steps=1, + save_strategy="no", + bf16=False, + fp16=True, + ) + + # Get BaseTrainer from registry + trainer_info = registry.get_trainer_module("base") + trainer_cls = trainer_info["trainer_cls"] + + # Instantiate trainer without PEFT + trainer = trainer_cls( + model=model, + args=training_args, + train_dataset=dummy_dataset, + processing_class=tokenizer, + ) + + assert trainer is not None + assert trainer.model is not None + assert trainer.processing_class is not None + + def test_base_trainer_with_peft_model(self, model_config, peft_model_config, dummy_dataset): + """Test that BaseTrainer works with PEFT-enabled models.""" + # Load model and tokenizer + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", 
model_name, **model_config) + model = hf_model.model + tokenizer = hf_model.tokenizer + + # Load PEFT Config + peft_config = LoraConfig(**peft_model_config) + + # Create training config + training_args = TrainingArguments( + output_dir="./test_output_base_peft", + per_device_train_batch_size=1, + num_train_epochs=1, + logging_steps=1, + save_strategy="no", + bf16=False, + fp16=True, + ) + + # Get BaseTrainer from registry + trainer_info = registry.get_trainer_module("base") + trainer_cls = trainer_info["trainer_cls"] + + # Instantiate trainer with PEFT config + trainer = trainer_cls( + model=model, + args=training_args, + train_dataset=dummy_dataset, + processing_class=tokenizer, + peft_config=peft_config, + ) + + assert trainer is not None + assert trainer.model is not None + + # Verify that the model is now a PEFT model + assert isinstance(trainer.model, PeftModel), "Model should be wrapped as a PeftModel" + + # Verify that the model has the expected PEFT config + assert hasattr(trainer.model, "peft_config"), "Model should have peft_config attribute" + assert trainer.model.peft_config is not None, "PEFT config should not be None" + + # Verify trainable parameters are reduced (PEFT should make only a subset trainable) + trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad) + total_params = sum(p.numel() for p in trainer.model.parameters()) + + assert trainable_params < total_params, "PEFT should reduce the number of trainable parameters" + print(f"\nTrainable params: {trainable_params:,} / Total params: {total_params:,}") + + def test_base_trainer_without_peft_config(self, model_config, dummy_dataset): + """Test that BaseTrainer works without PEFT config (standard training).""" + # Load model and tokenizer + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + model = hf_model.model + tokenizer = hf_model.tokenizer + + # Create training config + training_args = TrainingArguments( + output_dir="./test_output_base", + per_device_train_batch_size=1, + num_train_epochs=1, + logging_steps=1, + save_strategy="no", + bf16=False, + fp16=True, + ) + + # Get BaseTrainer from registry + trainer_info = registry.get_trainer_module("base") + trainer_cls = trainer_info["trainer_cls"] + + # Instantiate trainer without PEFT config + trainer = trainer_cls( + model=model, + args=training_args, + train_dataset=dummy_dataset, + processing_class=tokenizer, + peft_config=None, # Explicitly pass None + ) + + assert trainer is not None + assert trainer.model is not None + + # Verify that the model is NOT a PEFT model + assert not isinstance(trainer.model, PeftModel), ( + "Model should not be wrapped as a PeftModel when peft_config is None" + ) + + +class TestSFTTrainerWithModel: + """Test suite for SFTTrainer integration with model loading.""" + + @pytest.fixture(autouse=True) + def cleanup_output_dirs(self): + """Fixture to clean up test output directories after each test.""" + # Setup: yield control to the test + yield + + # Teardown: clean up output directories + output_dirs = ["./test_output", "./test_output_peft"] + for output_dir in output_dirs: + if os.path.exists(output_dir): + try: + shutil.rmtree(output_dir) + print(f"\nCleaned up: {output_dir}") + except Exception as e: + print(f"\nWarning: Failed to clean up {output_dir}: {e}") + + @pytest.fixture + def model_config(self): + """Fixture for basic model configuration.""" + return { + "model_name": "HuggingFaceTB/SmolLM-135M", + "auto_class_name": 
"AutoModelForCausalLM", + "use_cache": False, + "torch_dtype": "float16", + "attn_implementation": "eager", + "device_map": None, + "num_hidden_layers": 1, + } + + @pytest.fixture + def peft_model_config(self): + """Fixture for PEFT configuration.""" + return { + "lora_r": LORA_R, + "lora_alpha": LORA_ALPHA, + "lora_dropout": LORA_DROPOUT, + "target_modules": ["q_proj", "v_proj"], + "bias": "none", + } + + @pytest.fixture + def dummy_dataset(self): + """Fixture for creating a dummy dataset.""" + + data = { + "text": [ + "This is a test sentence for training.", + "Another example text for the model.", + "Third sample to ensure proper batching.", + ] + } + return Dataset.from_dict(data) + + def test_model_forward_pass(self, model_config): + """Test that the loaded model can perform a forward pass.""" + + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + loaded_model = hf_model.model + tokenizer = hf_model.tokenizer + + # Prepare input + text = "This is a test." + inputs = tokenizer(text, return_tensors="pt") + + # Perform forward pass + with torch.no_grad(): + outputs = loaded_model(**inputs) + + assert outputs is not None + assert hasattr(outputs, "logits") + assert outputs.logits.shape[0] == 1 # batch size + + def test_sft_trainer_instantiation_with_model(self, model_config, dummy_dataset): + """Test that SFTTrainer can be instantiated with a loaded model.""" + + # Load model and tokenizer + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + model = hf_model.model + tokenizer = hf_model.tokenizer + + # Create SFT config + sft_config = SFTConfig( + output_dir="./test_output", + max_length=MAX_LENGTH, + per_device_train_batch_size=1, + num_train_epochs=1, + logging_steps=1, + save_strategy="no", + bf16=False, + fp16=True, + ) + + # Get SFTTrainer from registry + trainer_info = registry.get_trainer_module("sft") + trainer_cls = trainer_info["trainer_cls"] + + # Instantiate trainer + trainer = trainer_cls( + model=model, + args=sft_config, + train_dataset=dummy_dataset, + processing_class=tokenizer, + ) + + assert trainer is not None + assert trainer.model is not None + assert trainer.tokenizer is not None + + def test_sft_trainer_with_peft_model(self, model_config, peft_model_config, dummy_dataset): + """Test that SFTTrainer works with PEFT-enabled models.""" + + # Load model and tokenizer + model_name = model_config.pop("model_name") + hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + model = hf_model.model + # Load PEFT Config + peft_config = LoraConfig(peft_model_config) + tokenizer = hf_model.tokenizer + + # Create SFT config + sft_config = SFTConfig( + output_dir="./test_output_peft", + max_length=MAX_LENGTH, + per_device_train_batch_size=1, + num_train_epochs=1, + logging_steps=1, + save_strategy="no", + bf16=False, + fp16=True, + ) + + # Get SFTTrainer from registry + trainer_info = registry.get_trainer_module("sft") + trainer_cls = trainer_info["trainer_cls"] + + # Instantiate trainer with PEFT config + trainer = trainer_cls( + model=model, + args=sft_config, + train_dataset=dummy_dataset, + processing_class=tokenizer, + peft_config=peft_config, + ) + + assert trainer is not None + assert trainer.model is not None + + def test_sft_trainer_train_dataset_required(self, model_config): + """Test that SFTTrainer requires a training dataset.""" + + # Load model and tokenizer + model_name = model_config.pop("model_name") + 
hf_model = ComponentFactory.create_model("hf", model_name, **model_config) + model = hf_model.model + tokenizer = hf_model.tokenizer + + # Create SFT config + sft_config = SFTConfig( + output_dir="./test_output", + max_length=MAX_LENGTH, + per_device_train_batch_size=1, + num_train_epochs=1, + bf16=False, + fp16=True, + ) + + # Get SFTTrainer from registry + trainer_info = registry.get_trainer_module("sft") + trainer_cls = trainer_info["trainer_cls"] + + # Attempt to instantiate without dataset should raise TypeError + with pytest.raises(TypeError, match="'NoneType' object is not iterable"): + trainer_cls( + model=model, + args=sft_config, + processing_class=tokenizer, + ) From 888fbb113231b0445ba5991a9fcdadef0052a5b9 Mon Sep 17 00:00:00 2001 From: smedhe Date: Wed, 25 Mar 2026 10:14:48 +0530 Subject: [PATCH 03/23] [QEff. finetune_experimnetal] cherry picking PR-787, 791,813,795 (#872) we are only cherry-picking PR-787, 791,813,795, skipping rebasing PR 785, cherry-picking experimental related branches from PR 692,747 --------- Signed-off-by: Swati Allabadi Signed-off-by: Sharvari Medhe Signed-off-by: Tanisha Chawada Signed-off-by: Ann Kuruvilla Co-authored-by: Ann Kuruvilla Signed-off-by: Ann Kuruvilla --- QEfficient/cloud/finetune_experimental.py | 309 +++++++++++++ .../experimental/configs/sample_config.yaml | 0 .../experimental/configs/sft_ddp_config.yaml | 55 +++ .../sft_single_device_alpaca_config.yaml | 48 ++ .../sft_single_device_gsm8k_config.yaml | 49 ++ .../finetune/experimental/core/callbacks.py | 44 +- .../experimental/core/component_registry.py | 74 ++- .../experimental/core/config_manager.py | 349 +++++++++----- .../finetune/experimental/core/dataset.py | 93 +++- .../finetune/experimental/core/logger.py | 41 +- .../finetune/experimental/core/optimizer.py | 6 +- .../experimental/core/trainer/base_trainer.py | 3 + .../experimental/core/trainer/sft_trainer.py | 7 +- .../experimental/core/utils/dataset_utils.py | 11 + .../core/utils/device_map_utils.py | 169 +++++++ .../experimental/core/utils/dist_utils.py | 17 + .../experimental/core/utils/peft_utils.py | 47 ++ .../core/utils/training_config_utils.py | 84 ++++ .../finetune/experimental/examples/ReadMe.md | 65 +++ .../experimental/examples/custom_dataset.py | 272 +++++++++++ .../experimental/examples/example_config.yaml | 60 +++ .../example_finetune.py} | 9 + .../experimental/preprocessing/alpaca_func.py | 24 + .../finetune/experimental/tests/constants.py | 109 +++++ .../experimental/tests/test_callback.py | 5 +- .../experimental/tests/test_config.yaml | 22 +- .../experimental/tests/test_config_manager.py | 142 +++++- .../experimental/tests/test_dataset.py | 117 +++-- .../experimental/tests/test_finetune.py | 425 ++++++++++++++++++ .../experimental/tests/test_integrated.py | 368 +++++++++++++++ .../experimental/tests/test_logger.py | 24 +- .../experimental/tests/test_optimizer.py | 18 +- .../experimental/tests/test_trainer.py | 11 +- QEfficient/utils/device_utils.py | 25 ++ docs/source/config.md | 268 +++++++++++ docs/source/hf_finetune.md | 332 ++++++++++++++ 36 files changed, 3443 insertions(+), 259 deletions(-) delete mode 100644 QEfficient/finetune/experimental/configs/sample_config.yaml create mode 100644 QEfficient/finetune/experimental/configs/sft_ddp_config.yaml create mode 100644 QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml create mode 100644 QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml create mode 100644 
QEfficient/finetune/experimental/core/utils/device_map_utils.py create mode 100644 QEfficient/finetune/experimental/core/utils/peft_utils.py create mode 100644 QEfficient/finetune/experimental/core/utils/training_config_utils.py create mode 100644 QEfficient/finetune/experimental/examples/custom_dataset.py create mode 100644 QEfficient/finetune/experimental/examples/example_config.yaml rename QEfficient/finetune/experimental/{extensions/preprocessing/__init__.py => examples/example_finetune.py} (53%) create mode 100644 QEfficient/finetune/experimental/preprocessing/alpaca_func.py create mode 100644 QEfficient/finetune/experimental/tests/constants.py create mode 100644 QEfficient/finetune/experimental/tests/test_finetune.py create mode 100644 QEfficient/finetune/experimental/tests/test_integrated.py create mode 100644 docs/source/config.md create mode 100644 docs/source/hf_finetune.md diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index d647b73a65..08ea8f5e5b 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -4,3 +4,312 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +""" +Main entry point for fine-tuning LLMs using the experimental finetune framework. +""" + +import logging +import os +from pathlib import Path +from typing import Any, Dict, List, Tuple + +from QEfficient.finetune.experimental.core.callbacks import replace_progress_callback +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory +from QEfficient.finetune.experimental.core.config_manager import ( + ConfigManager, +) +from QEfficient.finetune.experimental.core.dataset import SFTDataset # noqa: F401 +from QEfficient.finetune.experimental.core.logger import Logger +from QEfficient.finetune.experimental.core.model import HFModel # noqa: F401 +from QEfficient.finetune.experimental.core.optimizer import prepare_optimizer +from QEfficient.finetune.experimental.core.trainer import sft_trainer # noqa: F401 +from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map +from QEfficient.finetune.experimental.core.utils.peft_utils import convert_peft_config_to_lora_config +from QEfficient.finetune.experimental.core.utils.training_config_utils import prepare_training_config + +logger = Logger(__name__) + +# Try importing QAIC-specific module, proceed without it if it's unavailable +try: + import torch_qaic # noqa: F401 +except ImportError as e: + logger.log_rank_zero( + f"Unable to import 'torch_qaic' package due to exception: {e}. Moving ahead without the torch_qaic extension.", + level=logging.WARNING, + ) + + +class FineTuningPipeline: + """ + Main pipeline class for fine-tuning LLMs. + """ + + def __init__(self, config_manager: ConfigManager): + """ + Initialize the fine-tuning pipeline with configuration. 
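+
+        Illustrative usage (a sketch; the config path is just an example):
+
+            config_manager = ConfigManager(config_path="configs/sft_single_device_gsm8k_config.yaml")
+            pipeline = FineTuningPipeline(config_manager)
+            pipeline.run()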
+ + Args: + config_manager: ConfigManager instance with loaded and validated configuration + """ + self.config_manager = config_manager + self.config = self.config_manager.config + self.output_dir = Path(self.config.training["output_dir"]) + self._setup_environment() + + # Prepare training configuration + self.training_config = prepare_training_config(config_manager=self.config_manager) + + # Create datasets + logger.log_rank_zero("Creating datasets...") + self.train_dataset, self.eval_dataset = self._create_datasets() + + # Create model and tokenizer + logger.log_rank_zero("Loading model and tokenizer...") + model_instance = self._create_model() + self.model = model_instance.model + self.tokenizer = model_instance.tokenizer + + # Create optimizer + logger.log_rank_zero("Preparing optimizer...") + self.optimizer_cls_and_kwargs = self._create_optimizer() + + # Create callbacks + logger.log_rank_zero("Creating callbacks...") + self.callbacks = self._create_callbacks() + + # Create trainer + logger.log_rank_zero("Initializing trainer...") + self.trainer = self._create_trainer( + model=self.model, + tokenizer=self.tokenizer, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + optimizer_cls_and_kwargs=self.optimizer_cls_and_kwargs, + callbacks=self.callbacks, + training_config=self.training_config, + ) + + def get_model_and_tokenizer(self): + return self.model, self.tokenizer + + def get_trainer(self): + return self.trainer + + def _setup_environment(self) -> None: + """Set up environment variables for output directories.""" + os.environ["OUTPUT_DIR"] = str(self.output_dir) + os.environ["TRACKIO_DIR"] = str(self.output_dir / "trackio_logs") + os.environ["TENSORBOARD_LOGGING_DIR"] = str(self.output_dir) + + def _create_datasets(self) -> Tuple[Any, Any]: + """ + Create training and evaluation datasets. + + Returns: + Tuple of (train_dataset, eval_dataset) + """ + dataset_config = self.config_manager.get_dataset_config() + + dataset_type = dataset_config.get("dataset_type") + dataset_name = dataset_config.get("dataset_name") + train_split = dataset_config.get("train_split", "train") + test_split = dataset_config.get("test_split", "test") + seed = self.config.training["seed"] + + # Create a copy of dataset_config excluding keys that are passed explicitly + # to avoid duplicate keyword arguments when unpacking + excluded_keys = ("dataset_type", "dataset_name", "split", "seed", "train_split", "test_split") + dataset_config_copy = {k: v for k, v in dataset_config.items() if k not in excluded_keys} + + # Helper function to create a dataset for a specific split + def create_dataset_for_split(split_name: str) -> Any: + return ComponentFactory.create_dataset( + dataset_type=dataset_type, + dataset_name=dataset_name, + split=split_name, + seed=seed, + **dataset_config_copy, + ) + + # Create training and evaluation datasets using config values + train_dataset = create_dataset_for_split(train_split) + eval_dataset = create_dataset_for_split(test_split) + return train_dataset, eval_dataset + + def _create_model(self) -> Any: + """ + Create and load the model instance. 
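+
+        For example (illustrative), model_type="hf" with model_name="HuggingFaceTB/SmolLM-135M"
+        resolves the registered HFModel class; the returned instance exposes .model and .tokenizer.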
+
+        Returns:
+            Model instance with loaded model and tokenizer
+        """
+        # Get model config as dict
+        model_config = self.config_manager.get_model_config()
+
+        # Extract required fields
+        model_type = model_config.pop("model_type")
+        model_name = model_config.pop("model_name")
+
+        # Get training config for PP settings
+        training_config = self.config.training
+        pp_degree = training_config.get("pp_degree", 1)
+        device = training_config.get("device", "qaic")
+
+        # Generate device_map for pipeline parallelism if pp_degree > 1
+        if pp_degree > 1:
+            device_map = get_device_map(
+                model_name=model_name,
+                device=device,
+                pp_degree=pp_degree,
+            )
+            # Pass device_map via model_config kwargs for model loading
+            model_config["device_map"] = device_map
+            logger.log_rank_zero(f"Pipeline Parallelism enabled: Using device_map for {pp_degree} stages")
+
+        # Filter out PEFT-related fields; these shouldn't be passed to model creation
+        excluded_keys = {"use_peft", "peft_config"}
+        model_config_kwargs = {k: v for k, v in model_config.items() if k not in excluded_keys}
+
+        model_instance = ComponentFactory.create_model(model_type, model_name, **model_config_kwargs)
+        return model_instance
+
+    def _create_optimizer(self) -> Tuple[Any, Dict[str, Any]]:
+        """
+        Create optimizer configuration.
+
+        Returns:
+            Tuple of (optimizer_class, optimizer_kwargs)
+        """
+        optimizer_config = self.config_manager.get_optimizer_config()
+        return prepare_optimizer(optimizer_config)
+
+    def _create_callbacks(self) -> List[Any]:
+        """
+        Create callback instances from configuration.
+
+        Returns:
+            List of callback instances
+        """
+        callback_config = self.config_manager.get_callback_config()
+        callbacks = []
+
+        # callback_config.callbacks is a dictionary of callback configurations
+        for callback_name, callback_kwargs in callback_config["callbacks"].items():
+            if callback_kwargs is None:
+                callback_kwargs = {}
+            try:
+                callback_instance = ComponentFactory.create_callback(callback_name, **callback_kwargs)
+                callbacks.append(callback_instance)
+            except ValueError as e:
+                logger.log_rank_zero(f"Failed to create callback '{callback_name}': {e}", level=logging.WARNING)
+
+        return callbacks
+
+    def _create_trainer(
+        self,
+        model: Any,
+        tokenizer: Any,
+        train_dataset: Any,
+        eval_dataset: Any,
+        optimizer_cls_and_kwargs: Tuple[Any, Dict[str, Any]],
+        callbacks: List[Any],
+        training_config: Dict[str, Any],
+    ) -> Any:
+        """
+        Create and configure the trainer instance.
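+
+        Note: training_config["type"] (e.g. "sft") selects the registered trainer module,
+        which supplies the trainer class, its args class, and required kwargs such as peft_config.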
+ + Args: + model: The model to train + tokenizer: Tokenizer for processing + train_dataset: Training dataset + eval_dataset: Evaluation dataset + optimizer_cls_and_kwargs: Optimizer class and kwargs tuple + callbacks: List of callbacks + training_config: Training configuration dictionary + + Returns: + Trainer instance + """ + trainer_type = training_config.pop("type") + + # Get PEFT config if enabled + model_config_dict = self.config_manager.get_model_config() + peft_config = None + if model_config_dict.get("use_peft", False): + peft_config_dataclass = model_config_dict.get("peft_config") + if peft_config_dataclass is not None: + peft_config = convert_peft_config_to_lora_config(peft_config_dataclass) + + # Build dependencies for trainer configuration + dependencies = {} + if peft_config is not None: + dependencies["peft_config"] = peft_config + trainer_cls, args_cls, additional_kwargs = ComponentFactory.create_trainer_config(trainer_type, **dependencies) + + # Clean up training config: remove fields that shouldn't be passed to TrainingArguments + training_config.pop("device", None) + # Note: torch_dtype was already converted to fp16/bf16 flag in prepare_training_config + training_config.pop("deepspeed_config", None) + training_config.pop("torch_dtype", None) + # Remove PP-specific fields as they're handled via device_map in model loading + training_config.pop("pp_degree", None) + + # Create trainer arguments instance + args = args_cls(**training_config) + dataset_config_dict = self.config_manager.get_dataset_config() + split_ratio = dataset_config_dict.get("split_ratio", 0.8) + num_samples = dataset_config_dict.get("dataset_num_samples", -1) + train_dataset = train_dataset.dataset + eval_dataset = eval_dataset.dataset + if num_samples > 0: + # Truncating datasets to a smaller number of samples. + # If you want to use all data, set dataset_num_samples to -1 or remove it from config. + if (num_samples * split_ratio) / len(train_dataset) <= 0.05: + logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING) + subset_train_indices = list(range(0, int(num_samples * split_ratio))) + subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio))) + eval_dataset = eval_dataset.select(subset_eval_indices) + train_dataset = train_dataset.select(subset_train_indices) + trainer = trainer_cls( + model=model, + processing_class=tokenizer, + args=args, + compute_loss_func=None, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, + callbacks=callbacks, + **additional_kwargs, + ) + + replace_progress_callback(trainer, callbacks, logger) + + return trainer + + def run(self) -> None: + # Start training + logger.log_rank_zero("Starting training...") + self.trainer.train() + + +def main(): + """ + Main entry point for fine-tuning. + + Parses command-line arguments or config file and runs the fine-tuning pipeline. 
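+
+    Illustrative invocations (paths and flags are examples, not an exhaustive list):
+
+        python QEfficient/cloud/finetune_experimental.py configs/sft_single_device_gsm8k_config.yaml
+        python QEfficient/cloud/finetune_experimental.py --model_name HuggingFaceTB/SmolLM-135M --num_train_epochs 1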
+ """ + # ConfigManager now handles argument parsing internally via its __init__ + # It will automatically detect and parse: + # - Command-line args (if len(sys.argv) > 1) + # - Config file path (if sys.argv[1] ends with .yaml) + # - Or use defaults if no args provided + config_manager = ConfigManager() + + # Create and run pipeline - pass ConfigManager directly to avoid redundant wrapping + pipeline = FineTuningPipeline(config_manager) + pipeline.run() + + +if __name__ == "__main__": + main() diff --git a/QEfficient/finetune/experimental/configs/sample_config.yaml b/QEfficient/finetune/experimental/configs/sample_config.yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml new file mode 100644 index 0000000000..f7a0f6b1a9 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml @@ -0,0 +1,55 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 # LoRA rank + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc.. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub + prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields + completion_template: "{answer}" # Model will be trained on this part. + config_name: "main" # Config name for the dataset + + + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + torch_compile: False # Whether to use torch.compile + ddp_config: # DDP configuration + ddp_backend: "qccl" + ddp_find_unused_parameters: False + ddp_bucket_cap_mb: 25 + ddp_broadcast_buffers: True + ddp_timeout: 1800 + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 1e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml new file mode 100644 index 0000000000..dfc5bd09c3 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml @@ -0,0 +1,48 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 16 + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub + prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields + completion_template: "{output}" # Model will be trained on this part. + + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 2 # Number of steps to accumulate gradients + per_device_train_batch_size: 2 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 2e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml new file mode 100644 index 0000000000..f8627f6dad --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml @@ -0,0 +1,49 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 # LoRA rank + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub + prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields + completion_template: "{answer}" # Model will be trained on this part. 
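+  # Placeholders such as {question} and {answer} are filled from the matching dataset
+  # columns during preprocessing; a prompt_func import path may be used instead.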
+ config_name: "main" # Config name for the dataset + + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 1e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/core/callbacks.py b/QEfficient/finetune/experimental/core/callbacks.py index 30659e3bbd..bd1ce91c2e 100644 --- a/QEfficient/finetune/experimental/core/callbacks.py +++ b/QEfficient/finetune/experimental/core/callbacks.py @@ -19,7 +19,7 @@ from transformers.integrations.integration_utils import TensorBoardCallback from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState -from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory, registry from QEfficient.finetune.experimental.core.utils.profiler_utils import ( get_op_verifier_ctx, init_qaic_profiling, @@ -197,9 +197,39 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra self.op_verifier_ctx_step.__exit__(None, None, None) -def create_callbacks(name: str, **kwargs) -> Any: - """Create a callback instance.""" - callback_class = registry.get_callback(name) - if callback_class is None: - raise ValueError(f"Unknown callback: {name}. Available: {registry.list_callbacks()}") - return callback_class(**kwargs) +def replace_progress_callback(trainer: Any, callbacks: list[Any], logger: Any = None) -> None: + """ + Replace default ProgressCallback with EnhancedProgressCallback if not already present. + + Args: + trainer: Trainer instance + callbacks: List of callbacks already added + logger: Optional logger instance for warning messages + """ + # Check if EnhancedProgressCallback is already in callbacks + has_enhanced = any(callback.__class__.__name__ == "EnhancedProgressCallback" for callback in callbacks) + + if not has_enhanced: + try: + # Remove default ProgressCallback if present + trainer.remove_callback(ProgressCallback) + except (AttributeError, ValueError) as e: + # Callback not present or method doesn't exist, continue + if logger: + logger.log_rank_zero( + f"Debug: Could not remove default ProgressCallback: {e}. 
This is expected if callback is not present.",
+                    level=logging.DEBUG,
+                )
+
+        try:
+            # Add EnhancedProgressCallback
+            enhanced_callback = ComponentFactory.create_callback("enhanced_progressbar")
+            trainer.add_callback(enhanced_callback)
+        except Exception as e:
+            if logger:
+                logger.log_rank_zero(f"Could not add enhanced progress callback: {e}", level=logging.WARNING)
+            else:
+                import warnings
+
+                warnings.warn(f"Could not add enhanced progress callback: {e}")
diff --git a/QEfficient/finetune/experimental/core/component_registry.py b/QEfficient/finetune/experimental/core/component_registry.py
index d1f9480311..59bd3598dd 100644
--- a/QEfficient/finetune/experimental/core/component_registry.py
+++ b/QEfficient/finetune/experimental/core/component_registry.py
@@ -6,11 +6,8 @@
 # -----------------------------------------------------------------------------
 
 import logging
-from typing import Callable, Dict, Optional, Type
+from typing import Any, Callable, Dict, Optional, Type
 
-# from QEfficient.finetune.experimental.core.logger import get_logger
-
-# logger = get_logger()
 logger = logging.getLogger(__name__)
 
@@ -201,10 +198,77 @@ def list_callbacks(self) -> list[str]:
 
 class ComponentFactory:
     @staticmethod
-    def create_model(model_type: str, model_name: str, **kwargs) -> any:
+    def create_model(model_type: str, model_name: str, **kwargs) -> Any:
         """Create a model instance."""
         model_class = registry.get_model(model_type)
         if model_class is None:
             raise ValueError(f"Unknown model: {model_type}. Available: {registry.list_models()}")
         model_instance = model_class.create(model_name, **kwargs)
         return model_instance
+
+    @staticmethod
+    def create_trainer_config(name: str, **dependencies) -> tuple:
+        """
+        Create trainer configuration based on registered trainer modules.
+
+        Args:
+            name: Name of the trainer type
+            **dependencies: Any dependencies needed to configure the trainer
+
+        Returns:
+            tuple: (trainer_class, args_class, additional_kwargs)
+        """
+        config = registry.get_trainer_module(name)
+
+        # Process required kwargs based on available dependencies
+        additional_kwargs = {}
+        for kwarg, default in config["required_kwargs"].items():
+            if kwarg in dependencies:
+                additional_kwargs[kwarg] = dependencies[kwarg]
+            elif default != "REQUIRED":
+                additional_kwargs[kwarg] = default
+
+        # Check for missing required arguments
+        for kwarg, default in config["required_kwargs"].items():
+            if kwarg not in additional_kwargs and default == "REQUIRED":
+                raise ValueError(f"Required argument '{kwarg}' not provided for trainer '{name}'")
+
+        return config["trainer_cls"], config["args_cls"], additional_kwargs
+
+    @staticmethod
+    def create_dataset(dataset_type: str, dataset_name: str, split: str, seed: int = 42, **kwargs) -> Any:
+        """
+        Create a dataset instance.
+
+        Args:
+            dataset_type: Type of dataset to create (e.g., 'sft_dataset')
+            dataset_name: Name of the dataset to load
+            split: Dataset split ("train", "test", etc.)
+            seed: Random seed for reproducibility
+            **kwargs: Additional dataset configuration parameters
+
+        Returns:
+            Dataset instance
+        """
+        dataset_class = registry.get_dataset(dataset_type)
+        if dataset_class is None:
+            raise ValueError(f"Unknown dataset type: {dataset_type}. Available: {registry.list_datasets()}")
+        dataset_instance = dataset_class(dataset_name=dataset_name, split=split, seed=seed, **kwargs)
+        return dataset_instance
+
+    @staticmethod
+    def create_callback(name: str, **kwargs) -> Any:
+        """
+        Create a callback instance.
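+
+        Example (illustrative; "early_stopping" is registered and used by the sample configs):
+            cb = ComponentFactory.create_callback("early_stopping", early_stopping_patience=3)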
+
+        Args:
+            name: Name of the callback to create
+            **kwargs: Additional callback configuration parameters
+
+        Returns:
+            Callback instance
+        """
+        callback_class = registry.get_callback(name)
+        if callback_class is None:
+            raise ValueError(f"Unknown callback: {name}. Available: {registry.list_callbacks()}")
+        return callback_class(**kwargs)
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index b28c2e1e33..a3e0a3cd2f 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -10,15 +10,21 @@
 """
 
 import json
+import logging
 import os
+import sys
 from dataclasses import asdict, dataclass, field, fields, is_dataclass
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Mapping, Optional, Union
 
 import yaml
 from transformers.hf_argparser import HfArgumentParser
 
-from QEfficient.finetune.experimental.core.component_registry import registry
+from QEfficient.finetune.experimental.core.logger import Logger
+from QEfficient.finetune.experimental.core.utils.dist_utils import is_main_process
+from QEfficient.utils.device_utils import is_nsp_free
+
+logger = Logger(__name__)
 
 
 @dataclass
@@ -55,6 +61,10 @@ class SchedulerConfig:
             "ratio of total training steps for the warmup phase."
         },
     )
+    warmup_ratio: float = field(
+        default=0.1,
+        metadata={"help": "Ratio of total training steps for the warmup phase; value is within the [0, 1) range."},
+    )
 
 
 @dataclass
@@ -66,17 +76,21 @@ class DatasetConfig:
         metadata={"help": "The name or path of the tokenizer to use."},
     )
     dataset_type: str = field(
-        default="seq_completion",
+        default="sft_dataset",
         metadata={"help": "The type of dataset (e.g., 'seq_completion')."},
     )
     dataset_name: str = field(
-        default="knkarthick/samsum",
+        default="yahma/alpaca-cleaned",
         metadata={"help": "The name or path of the dataset."},
     )
     dataset_subset: str = field(
         default="default",
        metadata={"help": "The subset of the dataset to use, if applicable."},
     )
+    dataset_num_samples: int = field(
+        default=-1,
+        metadata={"help": "Number of samples to use from the dataset. -1 means all samples."},
+    )
     train_split: str = field(
         default="train",
         metadata={"help": "The name of the training split."},
@@ -93,7 +107,7 @@ class DatasetConfig:
         default=0.8,
         metadata={"help": "Ratio for train/test split, used when only train_split is provided."},
     )
-    input_columns: list[str] = field(
+    input_columns: List[str] = field(
         default_factory=lambda: ["text"],
         metadata={"help": "List of column names containing input text."},
     )
@@ -113,6 +127,22 @@ class DatasetConfig:
         default=4,
         metadata={"help": "Number of workers for dataset processing."},
     )
+    prompt_template: Optional[str] = field(
+        default=None,
+        metadata={"help": "Template for formatting prompts (e.g., 'User: {input} Assistant: ')."},
+    )
+    prompt_func: Optional[str] = field(
+        default=None,
+        metadata={"help": "Import path of a function that builds the prompt from a dataset row (e.g., 'package.module:function_name')."},
+    )
+    completion_template: Optional[str] = field(
+        default=None,
+        metadata={"help": "Template for formatting output completions (e.g., '{output}')."},
+    )
+    completion_func: Optional[str] = field(
+        default=None,
+        metadata={"help": "Import path of a function that builds the completion from a dataset row (e.g., 'package.module:function_name')."},
+    )
     collate_fn: str = field(
         default="dynamic_padding",
         metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
@@ -145,6 +175,15 @@ class DatasetConfig:
         default=1,
         metadata={"help": "Number of workers for the DataLoader."},
     )
+    remove_samples_with_empty_columns: bool = field(
+        default=True,
+        metadata={"help": "Whether to remove samples with empty columns."},
+    )
+    config_name: str = field(
+        default="default",
+        metadata={"help": "Name of the Hugging Face dataset config to load (e.g., 'main' for openai/gsm8k)."},
+    )
+    json_file_path: Optional[str] = field(default=None, metadata={"help": "Path to a JSON file containing data."})
 
 
 @dataclass
@@ -163,7 +202,7 @@ class PeftConfig:
         default=0.1,
         metadata={"help": "The dropout probability for Lora layers."},
     )
-    target_modules: list[str] = field(
+    target_modules: List[str] = field(
         default_factory=lambda: ["q_proj", "v_proj"],
         metadata={"help": "The modules to apply Lora to."},
     )
@@ -241,7 +280,7 @@ class GradientCheckpointingKwargs:
         default=True,
         metadata={"help": "Whether to preserve the RNG state when checkpointing."},
     )
-    use_reenrant: bool = field(
+    use_reentrant: bool = field(
         default=False,
         metadata={"help": "Whether to use reentrant gradient checkpointing."},
     )
@@ -252,7 +291,7 @@ class DdpConfig:
     """Arguments for Distributed Data Parallel (DDP) training."""
 
     ddp_backend: str = field(
-        default="qccl",
+        default=None,
         metadata={"help": "The DDP backend to use (e.g., 'nccl', 'gloo', 'qccl')."},
     )
     ddp_find_unused_parameters: bool = field(
@@ -293,10 +332,6 @@ class TrainingConfig:
         default=42,
         metadata={"help": "Random seed for reproducibility."},
     )
-    device: str = field(
-        default="qaic",
-        metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."},
-    )
     do_eval: bool = field(
         default=True,
         metadata={"help": "Whether to run evaluation during training."},
@@ -329,7 +364,6 @@ class TrainingConfig:
         default=-1,
         metadata={"help": "If > 0: set total number of training steps to perform."},
     )
-
     log_level: str = field(
         default="info",
         metadata={"help": "Set the verbosity level of the logs ('debug', 'info', 'warning', 'error')."},
@@ -363,12 +397,6 @@ class TrainingConfig:
         default="eval_loss",
         metadata={"help": "The metric to use to compare two models ('eval_loss', etc.)."},
     )
-
-    dtype: str = field(
-        default="fp16",
-        metadata={"help": "The data type to use for training (e.g., 'fp16', 'bf16')."},
-    )
-
     gradient_checkpointing: bool = field(
         default=False,
metadata={"help": "Whether to use gradient checkpointing."}, @@ -377,9 +405,16 @@ class TrainingConfig: default_factory=GradientCheckpointingKwargs, metadata={"help": "Arguments for gradient checkpointing."}, ) - + device: str = field( + default="qaic", + metadata={"help": "The device to use for training ('cuda', 'cpu', etc.)."}, + ) + torch_dtype: str = field( + default="fp16", + metadata={"help": "The torch data type to use for model weights (e.g., 'fp32', 'fp16', 'bf16')."}, + ) torch_compile: bool = field( - default=True, + default=False, metadata={"help": "Whether to compile the model with `torch.compile`."}, ) include_num_input_tokens_seen: bool = field( @@ -412,7 +447,7 @@ class TrainingConfig: metadata={"help": "DDP configuration dictionary."}, ) use_cpu: Optional[bool] = field( - default=None, + default=False, metadata={"help": "Whether to explicitly run training on CPU."}, ) resume_from_checkpoint: Optional[str] = field( @@ -424,13 +459,17 @@ class TrainingConfig: metadata={"help": "Whether to restore callback states from checkpoint."}, ) report_to: Optional[List[str]] = field( - default=None, + default="tensorboard", metadata={"help": "The list of integrations to report the results and logs to."}, ) completion_only_loss: Optional[bool] = field( default=False, metadata={"help": "Whether to compute loss only on completion tokens."}, ) + pp_degree: int = field( + default=1, + metadata={"help": "Pipeline parallelism degree (number of pipeline stages). Set > 1 to enable PP."}, + ) @dataclass @@ -460,47 +499,85 @@ class MasterConfig: ) -def parse_arguments(config_path: Optional[str] = None, args: Optional[List[str]] = None) -> MasterConfig: - """Create argument parser for the new finetuning interface.""" - parser = HfArgumentParser(MasterConfig) - - if config_path: - config_path = os.path.abspath(config_path) - if not os.path.exists(config_path): - raise FileNotFoundError(f"Config file not found: {config_path}") - if not (config_path.endswith(".yaml") or config_path.endswith(".yml")): - raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}") - - try: - (master_config,) = parser.parse_yaml_file(yaml_file=config_path) - return master_config - except Exception as e: - raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") - - args = [] if args is None else args - # If a single positional YAML file was passed via args, parse it as YAML - if len(args) == 1 and (args[0].endswith(".yaml") or args[0].endswith(".yml")): - yaml_path = os.path.abspath(args[0]) - (master_config,) = parser.parse_yaml_file(yaml_file=yaml_path) - else: - (master_config,) = parser.parse_args_into_dataclasses(args=args) - master_config = asdict(master_config) - master_config = MasterConfig(**master_config) - - return master_config - - class ConfigManager: """Manages configuration loading, validation, and updates.""" - def __init__(self, config: MasterConfig): + def __init__(self, config: Optional[MasterConfig] = None, config_path: Optional[str] = None): """ Initialize ConfigManager with either: - Path to config file (str or Path) - Configuration dictionary - - None (creates empty config) """ - self.config = config + if config: + self.config = config + else: + self.config = MasterConfig() + + if config_path and not config: + logger.log_rank_zero("Loading configuration from config_path...") + config_path = os.path.abspath(config_path) + if not os.path.exists(config_path): + raise FileNotFoundError(f"Config file not found: {config_path}") + if not (config_path.endswith(".yaml") or 
config_path.endswith(".yml")): + raise ValueError(f"Expected a .yaml/.yml file, got: {config_path}") + try: + self.load_config(config_path) + except Exception as e: + raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") + + elif config and not config_path: + logger.log_rank_zero("Loading configuration from config object...") + + elif len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): + logger.log_rank_zero("Loading configuration from config_path from CLI...") + config_path = os.path.abspath(sys.argv[1]) + if not os.path.exists(config_path): + raise FileNotFoundError(f"Config file not found: {config_path}") + try: + self.load_config(config_path) + except Exception as e: + raise ValueError(f"Failed to parse YAML config '{config_path}': {e}") + + elif len(sys.argv) > 2: + logger.log_rank_zero("Loading configuration flags from CLI...") + parser = HfArgumentParser( + ( + TrainingConfig, + ModelConfig, + DatasetConfig, + OptimizerConfig, + SchedulerConfig, + CallbackConfig, + PeftConfig, + DdpConfig, + GradientCheckpointingKwargs, + ) + ) + train_args, model_args, data_args, opt_args, schd_args, call_args, peft_args, ddp_args, gck_args, extra = ( + parser.parse_args_into_dataclasses(return_remaining_strings=True) + ) + train_args.ddp_config = ddp_args + train_args.gradient_checkpointing_kwargs = gck_args + model_args.peft_config = peft_args + self.config = MasterConfig( + model=model_args, + dataset=data_args, + training=train_args, + callbacks=call_args, + optimizers=opt_args, + scheduler=schd_args, + extra_params=extra, + ) + + else: + logger.log_rank_zero("Using default configuration...") + self.config = asdict(self.config) + self.config = MasterConfig(**self.config) + # Validate loaded config + try: + self.validate_config() + except Exception as e: + logger.log_rank_zero(f"Config validation failed with error: {e}") def load_config(self, config_path: Union[str, Path]) -> None: """Load configuration from file.""" @@ -517,9 +594,41 @@ def load_config(self, config_path: Union[str, Path]) -> None: config_dict = json.load(f) else: raise ValueError(f"Unsupported configuration file format: {config_path.suffix}") - self.update_config(config_dict) + def _merge_dataclass_inplace(self, dc_obj: Any, updates: Dict[str, Any], parent_path: str = "") -> None: + """ + Recursively merge 'updates' (dict) into the dataclass instance 'dc_obj', + preserving defaults by updating nested dataclasses/dicts in place. 
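+
+        Example (illustrative): updates={"peft_config": {"lora_r": 16}} applied to a ModelConfig
+        changes only lora_r; all other PeftConfig defaults are preserved.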
+        """
+        if not is_dataclass(dc_obj):
+            raise TypeError("dc_obj must be a dataclass instance")
+        field_names = {f.name for f in fields(dc_obj)}
+        for key, value in updates.items():
+            path = f"{parent_path}.{key}" if parent_path else key
+
+            if key not in field_names:
+                self._stash_top_level_extra(parent_path or "__root__", key, value)
+                continue
+
+            current = getattr(dc_obj, key)
+
+            # Case A: current is dataclass, incoming is dict -> deep merge
+            if is_dataclass(current) and isinstance(value, Mapping):
+                self._merge_dataclass_inplace(current, value, path)
+
+            # Case B: both dicts -> shallow update
+            elif isinstance(current, dict) and isinstance(value, Mapping):
+                current.update(value)
+
+            # Case C: both lists -> by default replace; switch to extend if desired
+            elif isinstance(current, list) and isinstance(value, list):
+                setattr(dc_obj, key, value)
+
+            # Case D: simple assignment
+            else:
+                setattr(dc_obj, key, value)
+
     def _ensure_extra_params(self, obj) -> Dict[str, Any]:
         """Ensure obj.extra_params exists and is a dict; return it."""
         ep = getattr(obj, "extra_params", None)
@@ -554,21 +663,7 @@ def update_config(self, config_dict: Dict[str, Any]) -> None:
             else:
                 self._stash_top_level_extra(key, "__all__", value)
             continue
-
-            if isinstance(value, dict) and is_dataclass(target):
-                known = {f.name for f in fields(target)}
-                for nested_key, nested_value in value.items():
-                    if nested_key in known:
-                        setattr(target, nested_key, nested_value)
-                    else:
-                        self._stash_top_level_extra(key, nested_key, nested_value)
-                continue
-
-            if isinstance(value, dict) and isinstance(target, dict):
-                target.update(value)
-                continue
-            setattr(self.config, key, value)
-
+            self._merge_dataclass_inplace(target, value, parent_path=key)
         else:
             ep = self._ensure_extra_params(self.config)
             ep[key] = value
@@ -598,16 +693,33 @@ def validate_config(self) -> None:
         """
         Validate configuration parameters for MasterConfig.
         """
+        cfg = self.config
         errors: List[str] = []
-        cfg = self.config
         model = getattr(cfg, "model", {})
         dataset = getattr(cfg, "dataset", {})
         training = getattr(cfg, "training", {})
 
         # ---------- Model ----------
         self._push(errors, not model.get("model_name"), "model.model_name is required.")
-
+        # Device (the device setting lives in the training config)
+        valid_devices = ["cpu", "cuda", "qaic"]
+        training_device = training.get("device", "qaic")
+        self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.")
+        if training_device == "qaic":
+            try:
+                import torch_qaic  # noqa: F401
+
+                logger.log_rank_zero("torch_qaic package found. Using QAIC devices...")
+                if is_main_process():
+                    is_nsp_free()
+
+            except ImportError as e:
+                logger.log_rank_zero(
+                    f"Unable to import 'torch_qaic' package due to exception: {e}. Moving ahead without the torch_qaic extension.",
+                    logging.WARNING,
+                )
         # PEFT validation
         if model.get("use_peft"):
             pc = model.get("peft_config", {})
@@ -632,34 +744,46 @@ def validate_config(self) -> None:
         # ---------- Dataset ----------
         self._push(errors, not dataset.get("dataset_name"), "dataset.dataset_name is required.")
         self._push(errors, not dataset.get("tokenizer_name"), "dataset.tokenizer_name is required.")
-        self._push(errors, dataset.get("max_seq_length", 0) <= 0, "dataset.max_seq_length must be positive.")
 
         # ---------- Training ----------
+        # torch_dtype validation
+        torch_dtype = training.get("torch_dtype")
+        valid_dtypes = {"fp16", "bf16", "fp32"}
+        self._push(
+            errors,
+            not torch_dtype,
+            "training.torch_dtype is required.",
+        )
+        self._push(
+            errors,
+            torch_dtype and torch_dtype not in valid_dtypes,
+            f"training.torch_dtype must be one of {valid_dtypes}.",
+        )
+
         # Batch sizes
         self._push(
             errors,
-            training.get("per_device_train_batch_size", 0) <= 0,
+            training.get("per_device_train_batch_size", 1) <= 0,
             "training.per_device_train_batch_size must be positive.",
         )
         self._push(
             errors,
-            training.get("per_device_eval_batch_size", 0) <= 0,
+            training.get("per_device_eval_batch_size", 1) <= 0,
             "training.per_device_eval_batch_size must be positive.",
         )
 
         # Epochs / steps
-        n_epochs = training.get("num_train_epochs", 0)
-        max_steps = training.get("max_steps", -1)
+        n_epochs = training.get("num_train_epochs", 1)
         self._push(
             errors,
-            n_epochs <= 0 and max_steps <= 0,
-            "Either training.num_train_epochs > 0 or training.max_steps > 0 must be set.",
+            n_epochs <= 0,
+            "training.num_train_epochs must be > 0.",
        )
 
         # Gradient accumulation
         self._push(
             errors,
-            training.get("gradient_accumulation_steps", 0) <= 0,
+            training.get("gradient_accumulation_steps", 1) <= 0,
             "training.gradient_accumulation_steps must be positive.",
         )
@@ -667,11 +791,13 @@ def validate_config(self) -> None:
         self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.")
         self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.")
 
-        # Device
-        valid_devices = ["cpu", "cuda", "qaic"]
-        training_device = training.get("device", None)
-        if training_device not in valid_devices:
-            self._push(errors, training_device not in valid_devices, f"training.device must be one of {valid_devices}.")
+        # Pipeline Parallelism (PP) config
+        pp_degree = training.get("pp_degree", 1)
+        self._push(
+            errors,
+            not isinstance(pp_degree, int) or pp_degree < 1,
+            "training.pp_degree must be a positive integer (default 1 = no PP; > 1 enables PP).",
+        )
 
         # DDP config
         ddp = training.get("ddp_config", {})
@@ -710,8 +836,24 @@ def get_dataset_config(self) -> Dict[str, Any]:
         return self.config.dataset
 
     def get_model_config(self) -> Dict[str, Any]:
-        """Get model configuration as dictionary."""
-        return self.config.model
+        """
+        Get model configuration as dictionary.
+
+        Automatically handles torch_dtype conversion from training config if not set in model config.
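+
+        For example (illustrative), training.torch_dtype == "fp16" yields
+        model_config["torch_dtype"] == "float16" when the model config leaves it unset.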
+ """ + model_config = self.config.model + + # Get torch_dtype from training config and convert + # To do: check if it can be moved from training config to model config instead + if model_config.get("torch_dtype") is None: + training_config = self.get_training_config() + training_dtype = training_config.get("torch_dtype") + if training_dtype: + # Convert from training format (fp16/bf16) to model format (float16/bfloat16) + dtype_mapping = {"fp16": "float16", "bf16": "bfloat16"} + model_config["torch_dtype"] = dtype_mapping.get(training_dtype, "auto") + + return model_config def to_dict(self) -> Dict[str, Any]: """Convert configuration to dictionary.""" @@ -722,32 +864,3 @@ def __getattr__(self, name: str) -> Any: if hasattr(self.config, name): return getattr(self.config, name) raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") - - -def create_trainer_config(name: str, **dependencies) -> tuple: - """ - Create trainer configuration based on registered trainer modules. - - Args: - name: Name of the trainer type - **dependencies: Any dependencies needed to configure the trainer - - Returns: - tuple: (trainer_class, args_class, additional_kwargs) - """ - config = registry.get_trainer_module(name) - - # Process required kwargs based on available dependencies - additional_kwargs = {} - for kwarg, default in config["required_kwargs"].items(): - if kwarg in dependencies: - additional_kwargs[kwarg] = dependencies[kwarg] - elif default != "REQUIRED": - additional_kwargs[kwarg] = default - - # Check for missing required arguments - for kwarg, default in config["required_kwargs"].items(): - if kwarg not in additional_kwargs and default == "REQUIRED": - raise ValueError(f"Required argument '{kwarg}' not provided for trainer '{name}'") - - return config["trainer_cls"], config["args_cls"], additional_kwargs diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index 4a243c40b2..22594cb81b 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -19,10 +19,14 @@ from torch.utils.data import Dataset from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.logger import Logger from QEfficient.finetune.experimental.core.utils.dataset_utils import ( apply_train_test_split, + validate_json_structure, ) +logger = Logger(__name__) + class BaseDataset(Dataset, ABC): """Base class for all datasets to ensure consistent interface.""" @@ -91,17 +95,22 @@ def __init__( self.prompt_func_path = kwargs.get("prompt_func", None) self.completion_func_path = kwargs.get("completion_func", None) self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True) + self.config_name = kwargs.get("config_name", None) if self.json_file_path not in (None, ""): if not os.path.isfile(self.json_file_path): raise FileNotFoundError(f"JSON file not found or invalid: '{self.json_file_path}'") - if (self.prompt_template is None and self.prompt_func_path is None) or ( - self.prompt_template is not None and self.prompt_func_path is not None - ): + if self.prompt_template and self.prompt_func_path: + logger.log_rank_zero( + "Both prompt_template and prompt_func are provided. Using prompt_template for preprocessing." + ) + if self.completion_template and self.completion_func_path: + logger.log_rank_zero( + "Both completion_template and completion_func are provided. 
Using completion_template for preprocessing." + ) + if self.prompt_template is None and self.prompt_func_path is None: raise RuntimeError("Either provide prompt_template or prompt_func in the config.") - if (self.completion_template is None and self.completion_func_path is None) or ( - self.completion_template is not None and self.completion_func_path is not None - ): + if self.completion_template is None and self.completion_func_path is None: raise RuntimeError("Either provide completion_template or completion_func in the config.") # Call parent class __init__ which will call _initialize_dataset @@ -116,29 +125,60 @@ def _initialize_dataset(self): """ if self.json_file_path: # Load dataset from JSON file + validate_json_structure(self.json_file_path) self.dataset = load_dataset("json", data_files=self.json_file_path, split="train") - # Apply train/test split if needed if self.split in ["train", "test"]: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) else: # Load dataset from HuggingFace - db = load_dataset_builder(self.dataset_name) + # Pass config_name if provided (required for datasets with multiple configs like openai/gsm8k) + load_kwargs = {} + if self.config_name is not None: + load_kwargs["name"] = self.config_name + + db = load_dataset_builder(self.dataset_name, **load_kwargs) available_splits = [] if db.info.splits is not None: available_splits = list(db.info.splits.keys()) - if self.split not in available_splits: + if self.split not in available_splits and self.split == "train": raise ValueError(f"Split {self.split} is not available for dataset {self.dataset_name}.") - + load_split = self.split + if self.split not in available_splits: + load_split = "train" # FIXME: Add streaming support for larger datasets. - self.dataset = load_dataset(self.dataset_name, split=self.split) + self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) if len(available_splits) == 1: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) self.dataset = self._setup_templates(self.dataset, self.dataset.column_names) + # Preprocess the HuggingFace dataset to add 'text' field + # This is required because TRL SFTTrainer expects a Dataset with 'text' field + self.dataset = self._add_text_field(self.dataset) + + def _add_text_field(self, dataset): + """ + Add 'text' field to the HuggingFace dataset by combining prompt and completion. + This is required by TRL's SFTTrainer which expects a 'text' field in the dataset. + """ + + def add_text(example): + # Apply preprocessing to get prompt and completion + processed = self._preprocess_sample(example) + # Add the combined text field + example["text"] = processed["prompt"] + processed["completion"] + # Also add prompt and completion fields for __getitem__ to access + example["prompt"] = processed["prompt"] + example["completion"] = processed["completion"] + return example + + # Map the function to add 'text' field to all examples + dataset = dataset.map(add_text, desc="Adding text field") + return dataset + def _setup_templates(self, dataset, dataset_columns): """ Set up prompt/completion templates or functions and apply preprocessing. @@ -237,21 +277,36 @@ def __len__(self) -> int: """ return self.dataset.num_rows - def __getitem__(self, idx: int) -> Dict[str, str]: + def __getitem__(self, idx: int) -> Dict[str, Any]: """ Retrieves a processed sample from the dataset at the given index. 
-        This method doesn't tokenize the input items, it is expected that the SFTTrainer will handle tokenization.
 
         Args:
             idx (int): The index of the sample to retrieve.
 
         Returns:
-            Dict[str, str]: A dictionary containing the processed 'prompt' and 'completion' for the sample.
+            Dict[str, Any]: A dictionary containing either:
+                - Raw text format: 'text', 'prompt', 'completion' (before tokenization)
+                - Tokenized format: 'input_ids', 'attention_mask', 'labels' (after tokenization)
         """
-        # Get the raw example using .select and access the first element
-        example = self.dataset.select(indices=[int(idx)])[0]
+        # Get the example from the dataset
+        # Use __getitem__ if available (for HuggingFace datasets), otherwise use select
+        if hasattr(self.dataset, "__getitem__"):
+            example = self.dataset[int(idx)]
+        else:
+            example = self.dataset.select(indices=[int(idx)])[0]
+
+        # Convert to dict if it's not already
+        if not isinstance(example, dict):
+            example = dict(example)
 
-        # Apply preprocessing (templating) on the fly
-        processed_example = self._preprocess_sample(example)
+        if "input_ids" in example:
+            # Return tokenized data as-is (TRL has already tokenized it)
+            return example
 
-        return processed_example
+        # Otherwise, return raw text format (before tokenization)
+        return {
+            "text": example.get("text", ""),
+            "prompt": example.get("prompt", ""),
+            "completion": example.get("completion", ""),
+        }
diff --git a/QEfficient/finetune/experimental/core/logger.py b/QEfficient/finetune/experimental/core/logger.py
index a1b9c771f6..c4f5b47bd8 100644
--- a/QEfficient/finetune/experimental/core/logger.py
+++ b/QEfficient/finetune/experimental/core/logger.py
@@ -7,13 +7,13 @@
 
 
 import logging
-import sys
 from pathlib import Path
 from typing import Optional
 
 from transformers.utils.logging import get_logger as hf_get_logger
 
-from QEfficient.finetune.experimental.core.utils.dist_utils import get_local_rank
+from QEfficient.finetune.experimental.core.utils.dist_utils import is_global_rank_zero
+
 
 # -----------------------------------------------------------------------------
 # Logger usage:
@@ -27,6 +27,34 @@
 #   Attach file handler later if needed:
 #     logger.prepare_for_logs(output_dir="logs", log_level="DEBUG")
 # -----------------------------------------------------------------------------
+class QEffFormatter(logging.Formatter):
+    """
+    Formatter class used to set colors for printing different logging levels of messages on console.
+    """
+
+    cyan: str = "\x1b[38;5;14m"
+    yellow: str = "\x1b[33;20m"
+    red: str = "\x1b[31;20m"
+    bold_red: str = "\x1b[31;1m"
+    reset: str = "\x1b[0m"
+    common_format: str = "%(levelname)s - %(name)s - %(message)s"  # type: ignore
+    format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)"  # type: ignore
+
+    FORMATS = {
+        logging.DEBUG: cyan + format_with_line_info + reset,
+        logging.INFO: cyan + common_format + reset,
+        logging.WARNING: yellow + common_format + reset,
+        logging.ERROR: red + format_with_line_info + reset,
+        logging.CRITICAL: bold_red + format_with_line_info + reset,
+    }
+
+    def format(self, record):
+        """
+        Overriding the base class method to choose the format based on log level.
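+
+        For example (illustratively), an ERROR record renders in red as
+        "ERROR - name - message text (train.py:42)", while an INFO record
+        omits the file/line suffix.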
+        """
+        log_fmt = self.FORMATS.get(record.levelno)
+        formatter = logging.Formatter(log_fmt)
+        return formatter.format(record)
 
 
 class Logger:
@@ -48,7 +76,7 @@ def __init__(
         """
         self.logger = hf_get_logger(name)
         self.logger.setLevel(level)
-
+        self.logger.propagate = False
         # Clear any existing handlers
         self.logger.handlers.clear()
 
@@ -56,9 +84,9 @@ def __init__(
         self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
         # Console handler
-        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler = logging.StreamHandler()
         console_handler.setLevel(level)
-        console_handler.setFormatter(self.formatter)
+        console_handler.setFormatter(QEffFormatter())
         self.logger.addHandler(console_handler)
 
         # File handler (if log_file is provided)
@@ -100,7 +128,7 @@ def log_rank_zero(self, message: str, level: int = logging.INFO) -> None:
             message: Message to log
             level: Logging level
         """
-        if get_local_rank() == 0:
+        if is_global_rank_zero():
             self.logger.log(level, message)
 
     def log_exception(self, message: str, exception: Exception, raise_exception: bool = True) -> None:
@@ -130,6 +158,7 @@ def prepare_for_logs(self, output_dir: Optional[str] = None, log_level: str = "I
         # Convert string log level to logging constant
         level = getattr(logging, log_level.upper(), logging.INFO)
         self.logger.setLevel(level)
+        self.logger.propagate = False
 
         # Update existing handlers' levels
         for handler in self.logger.handlers:
diff --git a/QEfficient/finetune/experimental/core/optimizer.py b/QEfficient/finetune/experimental/core/optimizer.py
index d4f82cbebb..e0fc4211f7 100644
--- a/QEfficient/finetune/experimental/core/optimizer.py
+++ b/QEfficient/finetune/experimental/core/optimizer.py
@@ -13,9 +13,9 @@
 
 from QEfficient.finetune.experimental.core.component_registry import registry
 
-registry.optimizer("Adam")(optim.Adam)
-registry.optimizer("AdamW")(optim.AdamW)
-registry.optimizer("SGD")(optim.SGD)
+registry.optimizer("adam")(optim.Adam)
+registry.optimizer("adamw")(optim.AdamW)
+registry.optimizer("sgd")(optim.SGD)
 
 
 def prepare_optimizer(opt_config):
diff --git a/QEfficient/finetune/experimental/core/trainer/base_trainer.py b/QEfficient/finetune/experimental/core/trainer/base_trainer.py
index 0a3c50f7f1..b3aa2da902 100644
--- a/QEfficient/finetune/experimental/core/trainer/base_trainer.py
+++ b/QEfficient/finetune/experimental/core/trainer/base_trainer.py
@@ -77,3 +77,6 @@ def __init__(
             preprocess_logits_for_metrics=preprocess_logits_for_metrics,
             **kwargs,
         )
+
+        # Disable DataParallel: PP and DDP remain unaffected
+        self.args._n_gpu = 1
diff --git a/QEfficient/finetune/experimental/core/trainer/sft_trainer.py b/QEfficient/finetune/experimental/core/trainer/sft_trainer.py
index 3223c5966b..be72243fc7 100644
--- a/QEfficient/finetune/experimental/core/trainer/sft_trainer.py
+++ b/QEfficient/finetune/experimental/core/trainer/sft_trainer.py
@@ -12,4 +12,9 @@
 
 @registry.trainer_module(name="sft", args_cls=SFTConfig, required_kwargs={"peft_config": PeftConfig})
 class SFTTrainerModule(SFTTrainer):
-    pass  # Just using the standard SFTTrainer
+    """SFT Trainer that disables DataParallel (single-device, PP, or DDP only)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Disable DataParallel: PP and DDP remain unaffected
+        self.args._n_gpu = 1
diff --git a/QEfficient/finetune/experimental/core/utils/dataset_utils.py b/QEfficient/finetune/experimental/core/utils/dataset_utils.py
index 11e2fecfc3..ed33d34f95 100644
--- a/QEfficient/finetune/experimental/core/utils/dataset_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/dataset_utils.py
@@ -4,6 +4,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+import json
+
+
 def insert_pad_token(tokenizer):
     # Add pad token if it doesn't exist
     if tokenizer.pad_token is None:
@@ -19,6 +22,14 @@
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
 
 
+def validate_json_structure(path):
+    with open(path, "r") as f:
+        data = json.load(f)
+
+    if not isinstance(data, list):
+        raise ValueError(f"Invalid format. Expected a list of objects. Got: {type(data).__name__}")
+
+
 def apply_train_test_split(dataset, split_ratio, split, seed):
     """
     Apply train/test split to the dataset based on split_ratio.
diff --git a/QEfficient/finetune/experimental/core/utils/device_map_utils.py b/QEfficient/finetune/experimental/core/utils/device_map_utils.py
new file mode 100644
index 0000000000..c9ac24bace
--- /dev/null
+++ b/QEfficient/finetune/experimental/core/utils/device_map_utils.py
@@ -0,0 +1,169 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+Utility functions for creating device maps for pipeline parallelism.
+"""
+
+from typing import Dict, Optional, Union
+
+import numpy as np
+import torch
+from transformers import AutoConfig
+
+from QEfficient.finetune.experimental.core.utils.dist_utils import get_local_rank
+from QEfficient.utils._utils import get_num_layers_from_config
+
+
+def get_device_map(
+    model_name: str,
+    device: str,
+    pp_degree: int = 1,
+) -> Optional[Union[Dict[str, int], str]]:
+    """
+    Returns device map for the given model based on PP and DDP configuration.
+
+    Args:
+        model_name: Name of the model to load configuration from.
+        device: Device type (e.g., 'cuda', 'qaic').
+        pp_degree: Pipeline parallelism degree (number of pipeline stages). > 1 enables PP.
+
+    Returns:
+        A dictionary mapping layer names to device IDs, the string "auto" when
+        pp_degree equals the number of available devices, or None when PP is disabled.
+    """
+    if pp_degree <= 1:
+        return None
+
+    torch_device = torch.device(device)
+    num_available_devices = getattr(torch, torch_device.type).device_count()
+
+    if pp_degree > num_available_devices:
+        raise ValueError(
+            f"pp_degree ({pp_degree}) cannot exceed the number of available {device} devices "
+            f"({num_available_devices}). Reduce pp_degree or use a node with more devices."
+        )
+    elif pp_degree == num_available_devices:
+        device_map = "auto"
+    else:  # pp_degree < num_available_devices
+        device_map = custom_device_map(model_name, device, pp_degree)
+
+    return device_map
+
+
+def custom_device_map(model_name: str, device: str, pp_degree: int) -> Dict[str, int]:
+    """
+    Returns custom device map for model layers based on number of pipeline stages and process rank.
+
+    Args:
+        model_name: Name of the model to load configuration from.
+        device: Device type (e.g., 'cuda', 'qaic').
+        pp_degree: Pipeline parallelism degree (number of pipeline stages).
+
+    Returns:
+        Dict: A dictionary mapping layer names to device IDs.
+
+    Notes:
+        - This device map structure is verified primarily for llama models.
+        - For other architectures, you may need to adjust the layer naming conventions.
+ - Layers are distributed as evenly as possible: the first (num_layers % pp_degree) + stages receive one extra layer each. + + Example: + Example config for PP + DDP is provided below as it works for only PP as well. + Configuration for meta-llama/Llama-3.2-1B + Total devices: 4 (2x PP x 2x DDP) + + PP (Pipeline Parallelism): Each copy of the model is split into 2 stages + DDP (Distributed Data Parallel): 2 model copies run in parallel + + |--------------------------------------------------------------------------| + | Process Rank | Assigned Device IDs | Model Component | + |--------------------------------------------------------------------------| + | Rank 0 | 0 | model.embed_tokens | + | | | model.lm_head | + | | | model.layers.0 - model.layers.7 | + |--------------------------------------------------------------------------| + | Rank 0 | 1 | model.norm | + | | | model.rotary_emb | + | | | model.layers.8 - model.layers.15 | + |--------------------------------------------------------------------------| + | Rank 1 | 2 | model.embed_tokens | + | | | model.lm_head | + | | | model.layers.0 - model.layers.7 | + |--------------------------------------------------------------------------| + | Rank 1 | 3 | model.norm | + | | | model.rotary_emb | + | | | model.layers.8 - model.layers.15 | + |--------------------------------------------------------------------------| + """ + + model_config = AutoConfig.from_pretrained(model_name) + num_layers = get_num_layers_from_config(model_config) + local_rank = get_local_rank() + + if num_layers < pp_degree: + raise ValueError( + f"Number of model layers ({num_layers}) must be >= pp_degree ({pp_degree}). " + f"Cannot split {num_layers} layers across {pp_degree} pipeline stages." + ) + + first_device = local_rank * pp_degree + last_device = local_rank * pp_degree + (pp_degree - 1) + + # Handle tied embeddings + if model_config.tie_word_embeddings: + lm_head_device = first_device + else: + lm_head_device = last_device + + device_map = { + "model.embed_tokens": first_device, + "lm_head": lm_head_device, + "model.norm": last_device, + "model.rotary_emb": last_device, + } + + # Distribute layers as evenly as possible across stages. + # The first (num_layers % pp_degree) stages get one extra layer each. + base_layers, remainder = divmod(num_layers, pp_degree) + layers_per_stage = np.array([base_layers + (1 if i < remainder else 0) for i in range(pp_degree)]) + + # Create device assignment per layer + pp_device_map = np.repeat(np.arange(pp_degree), layers_per_stage) + + # Assign each layer to a device + for i in range(num_layers): + device_map[f"model.layers.{i}"] = int(pp_device_map[i] + local_rank * pp_degree) + + return device_map + + +def validate_pp_config( + pp_degree: int, + device: str, + local_world_size: int = 1, +) -> None: + """ + Validate pipeline parallelism configuration. + + Args: + pp_degree: Pipeline parallelism degree (number of pipeline stages). Must be > 1 to enable PP. + device: Device type (e.g., 'cuda', 'qaic'). + local_world_size: Number of processes per node for DDP. + + Raises: + AssertionError: If configuration is invalid. 
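+
+    Example (illustrative): 2 pipeline stages x 2 DDP processes per node need
+    at least 4 locally visible devices:
+        validate_pp_config(pp_degree=2, device="qaic", local_world_size=2)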
+ """ + if pp_degree > 1: + # Validate device availability + torch_device = torch.device(device) + num_available_devices = getattr(torch, torch_device.type).device_count() + + assert local_world_size * pp_degree <= num_available_devices, ( + f"Number of devices required per node (LOCAL_WORLD_SIZE * pp_degree = " + f"{local_world_size} * {pp_degree} = {local_world_size * pp_degree}) " + f"should be <= locally available devices ({num_available_devices})." + ) diff --git a/QEfficient/finetune/experimental/core/utils/dist_utils.py b/QEfficient/finetune/experimental/core/utils/dist_utils.py index aed88862d8..069d91445a 100644 --- a/QEfficient/finetune/experimental/core/utils/dist_utils.py +++ b/QEfficient/finetune/experimental/core/utils/dist_utils.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +import os import torch.distributed as dist @@ -37,3 +38,19 @@ def get_world_size() -> int: def is_main_process() -> bool: """Check if the current process is the main process (rank 0).""" return get_rank() == 0 + + +def get_global_rank() -> int: + """Return global rank if available (torchrun/deepspeed), else fall back to local rank.""" + r = os.environ.get("RANK") + if r is not None: + try: + return int(r) + except ValueError: + return 0 + # Fallback to local rank + return int(get_local_rank()) + + +def is_global_rank_zero() -> bool: + return get_global_rank() == 0 diff --git a/QEfficient/finetune/experimental/core/utils/peft_utils.py b/QEfficient/finetune/experimental/core/utils/peft_utils.py new file mode 100644 index 0000000000..9c6cfaf3c0 --- /dev/null +++ b/QEfficient/finetune/experimental/core/utils/peft_utils.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Utility functions for PEFT (Parameter-Efficient Fine-Tuning) configuration. +""" + +from dataclasses import asdict +from typing import Any, Optional + +from peft import LoraConfig + + +def convert_peft_config_to_lora_config(peft_config: Any) -> Optional[LoraConfig]: + """ + Convert PeftConfig (dataclass or dict) to LoraConfig from peft library. 
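+    Only the LoRA-specific fields (lora_r, lora_alpha, lora_dropout,
+    target_modules, bias, task_type) are carried over to the LoraConfig.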
+ + Args: + peft_config: PeftConfig dataclass instance or dict + + Returns: + LoraConfig instance or None if PEFT is not enabled + """ + if peft_config is None: + return None + + # Convert dataclass to dictionary if needed + if hasattr(peft_config, "__dict__") and not isinstance(peft_config, dict): + peft_dict = asdict(peft_config) + else: + peft_dict = peft_config + + # Map PeftConfig fields to LoraConfig fields + lora_config_dict = { + "r": peft_dict.get("lora_r"), + "lora_alpha": peft_dict.get("lora_alpha"), + "lora_dropout": peft_dict.get("lora_dropout"), + "target_modules": peft_dict.get("target_modules"), + "bias": peft_dict.get("bias"), + "task_type": peft_dict.get("task_type"), + } + + return LoraConfig(**lora_config_dict) diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py new file mode 100644 index 0000000000..1cd6704e44 --- /dev/null +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -0,0 +1,84 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Utility functions for preparing training configurations. +""" + +from typing import Any, Dict + +from QEfficient.finetune.experimental.core.config_manager import ConfigManager + + +def prepare_training_config( + config_manager: ConfigManager, + include_num_input_tokens_seen: bool = False, + use_cpu: bool = False, +) -> Dict[str, Any]: + """ + Prepare and transform training configuration for trainer initialization. + + Args: + config_manager: ConfigManager instance with loaded configuration + + Returns: + Dictionary of training arguments ready for trainer initialization + """ + # Get training config as dict and create mutable copy to avoid mutating original + training_config = dict(config_manager.get_training_config()) + + # Handle dtype conversion + # To do: (For Tanisha) Check if torch_dtype should rather be added directly in model_config only in config_manager.py + + torch_dtype = training_config.pop("torch_dtype", None) + if torch_dtype is None: + raise ValueError("'torch_dtype' field is required in training configuration. 
Expected one of: ['fp16', 'bf16']")
+    # HF TrainingArguments expects boolean precision flags, so e.g.
+    # torch_dtype == "fp16" becomes training_config["fp16"] = True.
+    training_config[torch_dtype] = True
+    training_config["data_seed"] = training_config.get("seed")
+
+    # Keep the original string value as well, since downstream consumers read "torch_dtype" directly
+    training_config["torch_dtype"] = torch_dtype
+
+    # Handle scheduler configuration
+    scheduler_config = config_manager.get_scheduler_config()
+    training_config.setdefault("lr_scheduler_type", scheduler_config.get("scheduler_name"))
+
+    # Set warmup_ratio and warmup_steps from scheduler_config if they exist and are not None
+    warmup_ratio = scheduler_config.get("warmup_ratio")
+    if warmup_ratio is not None:
+        training_config["warmup_ratio"] = warmup_ratio
+    warmup_steps = scheduler_config.get("warmup_steps")
+    if warmup_steps is not None:
+        training_config["warmup_steps"] = warmup_steps
+
+    # Handle dataset configuration for dataloader settings
+    dataset_config = config_manager.get_dataset_config()
+    training_config.setdefault("dataloader_pin_memory", dataset_config.get("dataloader_pin_memory"))
+    training_config.setdefault("dataloader_persistent_workers", dataset_config.get("dataloader_persistent_workers"))
+    training_config.setdefault("dataloader_prefetch_factor", dataset_config.get("dataloader_prefetch_factor"))
+    training_config.setdefault("dataloader_drop_last", dataset_config.get("dataloader_drop_last"))
+    training_config.setdefault("dataloader_num_workers", dataset_config.get("dataloader_num_workers"))
+    training_config.setdefault("group_by_length", dataset_config.get("group_by_length"))
+
+    # Handle DDP configuration
+    if training_config.get("ddp_config") is not None:
+        ddp_config = training_config.pop("ddp_config")
+        if not isinstance(ddp_config, dict):
+            from dataclasses import asdict, is_dataclass
+
+            if is_dataclass(ddp_config):
+                ddp_config = asdict(ddp_config)
+            else:
+                raise TypeError(
+                    f"ddp_config must be a dict or DdpConfig dataclass instance, "
+                    f"got {type(ddp_config).__name__}: {ddp_config}"
+                )
+
+        # Merge ddp_config into training_config
+        training_config = {**training_config, **ddp_config}
+
+    return training_config
diff --git a/QEfficient/finetune/experimental/examples/ReadMe.md b/QEfficient/finetune/experimental/examples/ReadMe.md
index e69de29bb2..c44ea6179f 100644
--- a/QEfficient/finetune/experimental/examples/ReadMe.md
+++ b/QEfficient/finetune/experimental/examples/ReadMe.md
@@ -0,0 +1,65 @@
+
+# Custom Dataset Example
+
+This example demonstrates how to register a custom dataset type with the fine-tuning framework
+by mirroring the structure of the built-in `SFTDataset`.
+
+---
+
+## Files to Create
+
+```text
+examples/
+├── custom_dataset.py      # Custom dataset class
+├── example_config.yaml    # Training configuration
+└── example_finetune.py    # Entry point
+```
+
+---
+
+## 1. `custom_dataset.py`
+
+Create your dataset class by subclassing `BaseDataset` and registering it with the component
+registry using the `@registry.dataset()` decorator.
+
+The `SeqCompletionDataset` class in `custom_dataset.py` mirrors `SFTDataset` in structure.
+
+---
+
+## 2. `example_config.yaml`
+
+The main changes are in the `dataset` section.
+**`dataset_type` must exactly match the name passed to `@registry.dataset(...)` in your custom dataset file.**
+
+```yaml
+dataset:
+  dataset_type: "seq_completion"        # Must match @registry.dataset()
+  dataset_name: "Salesforce/wikitext"
+  config_name: "wikitext-103-raw-v1"
+  prompt_template: "{text}"
+  train_split: "train"
+  test_split: "test"
+  seed: 42
+  dataset_num_samples: 100
+```
+
+---
+
+## 3. `example_finetune.py`
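+
+Importing the custom dataset module triggers its `@registry.dataset(...)` registration
+before `main()` runs: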
+
+```python
+from QEfficient.finetune.experimental.examples.custom_dataset import SeqCompletionDataset  # noqa: F401
+from QEfficient.cloud.finetune_experimental import main
+
+if __name__ == "__main__":
+    main()
+```
+
+---
+
+## Run
+
+```bash
+python examples/example_finetune.py examples/example_config.yaml
+```
diff --git a/QEfficient/finetune/experimental/examples/custom_dataset.py b/QEfficient/finetune/experimental/examples/custom_dataset.py
new file mode 100644
index 0000000000..e0bc93aecf
--- /dev/null
+++ b/QEfficient/finetune/experimental/examples/custom_dataset.py
@@ -0,0 +1,272 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import importlib
+import logging
+import os
+import re
+from typing import Any, Callable, Dict
+
+from datasets import load_dataset, load_dataset_builder
+
+from QEfficient.finetune.experimental.core.component_registry import registry
+from QEfficient.finetune.experimental.core.dataset import BaseDataset
+from QEfficient.finetune.experimental.core.utils.dataset_utils import (
+    apply_train_test_split,
+    validate_json_structure,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@registry.dataset("seq_completion")
+class SeqCompletionDataset(BaseDataset):
+    """
+    A Sequence Completion dataset class for autoregressive (next-token prediction) training.
+
+    Unlike SFTDataset, there is NO prompt/completion split — loss is computed on ALL tokens.
+    The entire text is treated as both input and label.
+
+    Supports loading from HuggingFace datasets or local JSON files.
+
+    Args:
+        dataset_name (str): The name of the dataset to load from HuggingFace datasets.
+                            Ignored if json_file_path is provided.
+        split (str): The dataset split to use (e.g., "train", "validation", "test").
+        split_ratio (float): Ratio for train/test split when only one split is available.
+        seed (int): Random seed for reproducibility.
+        json_file_path (str, optional): Path to a custom JSON file containing the dataset.
+                                        If provided, this takes precedence over dataset_name.
+        prompt_template (str): A string template for constructing the full input text.
+                               Variables should be enclosed in curly braces, e.g., "{text}"
+                               or "{question} {answer}".
+        prompt_func (str, optional): Path to a custom function for constructing input text,
+                                     in the format "module_path:function_name".
+                                     Used if prompt_template is not provided.
+
+    Raises:
+        RuntimeError: If any variables specified in `prompt_template` are not found
+                      as columns in the loaded dataset.
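+
+    Example (illustrative, mirroring example_config.yaml):
+        dataset = SeqCompletionDataset(
+            dataset_name="Salesforce/wikitext",
+            split="train",
+            prompt_template="{text}",
+            config_name="wikitext-103-raw-v1",
+        )
+        dataset[0]["text"]  # raw text of the first processed sample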
+ """ + + def __init__( + self, + dataset_name: str, + split: str, + split_ratio: float = 0.8, + seed: int = 42, + **kwargs, + ): + self.split_ratio = split_ratio + self.json_file_path = kwargs.get("json_file_path", None) + self.input_template = kwargs.get("prompt_template", None) + self.input_func_path = kwargs.get("prompt_func", None) + self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True) + self.config_name = kwargs.get("config_name", None) + + # Validate json_file_path if provided + if self.json_file_path not in (None, ""): + if not os.path.isfile(self.json_file_path): + raise FileNotFoundError(f"JSON file not found or invalid: '{self.json_file_path}'") + + # Warn if both template and func are provided + if self.input_template and self.input_func_path: + logger.warning("Both input_template and input_func are provided. Using input_template for preprocessing.") + + # Must have at least one way to build the input text + if self.input_template is None and self.input_func_path is None: + raise RuntimeError("Either provide input_template or input_func in the config.") + + # Call parent __init__ which triggers _initialize_dataset() + super().__init__(dataset_name, split, seed, **kwargs) + + # ------------------------------------------------------------------ + # Dataset Initialization + # ------------------------------------------------------------------ + + def _initialize_dataset(self): + """ + Initialize the dataset from either HuggingFace or a custom JSON file. + + Mirrors SFTDataset._initialize_dataset() — same loading logic, + same split handling. Difference: calls _setup_input_column() + instead of _setup_templates(), and _add_text_field() only + builds a single 'text' field (no prompt/completion split). 
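+
+        Example (illustrative): requesting split="test" on a hub dataset that
+        only ships a "train" split loads "train" and then re-splits it locally
+        according to split_ratio.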
+ """ + if self.json_file_path: + # Load from local JSON file + validate_json_structure(self.json_file_path) + self.dataset = load_dataset("json", data_files=self.json_file_path, split="train") + # Apply train/test split if needed + if self.split in ["train", "test"]: + self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) + else: + # Load from HuggingFace hub + load_kwargs = {} + if self.config_name is not None: + load_kwargs["name"] = self.config_name + + db = load_dataset_builder(self.dataset_name, **load_kwargs) + available_splits = [] + if db.info.splits is not None: + available_splits = list(db.info.splits.keys()) + + if self.split not in available_splits and self.split == "train": + raise ValueError(f"Split {self.split} is not available for dataset {self.dataset_name}.") + + load_split = self.split + if self.split not in available_splits: + load_split = "train" + + self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) + + if len(available_splits) == 1: + self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) + + # Validate template variables and filter empty samples + self.dataset = self._setup_input_column(self.dataset, self.dataset.column_names) + + # Add 'text' field — required by TRL SFTTrainer + self.dataset = self._add_text_field(self.dataset) + + # ------------------------------------------------------------------ + # Template / Function Setup (mirrors _setup_templates in SFTDataset) + # ------------------------------------------------------------------ + + def _setup_input_column(self, dataset, dataset_columns): + """ + Validate input_template variables exist in dataset columns, + set up input_func if template is not provided, and filter + out empty/None samples. + + Mirrors SFTDataset._setup_templates() but for a single + input column instead of prompt + completion. + """ + if self.input_template: + self.input_func = None + # Extract {variable} names from the template + input_variables = re.findall(r"\{(.*?)\}", self.input_template) + for var in input_variables: + if var not in dataset_columns: + raise RuntimeError( + f"Input template variable '{var}' not found in dataset columns: {dataset_columns}." + ) + else: + input_variables = dataset_columns + self.input_func = self.import_func(self.input_func_path) + + # Filter out samples with empty/None values in relevant columns + if self.remove_samples_with_empty_columns: + dataset = dataset.filter(lambda example: self._filter_empty_or_none_samples(example, input_variables)) + return dataset + + def _add_text_field(self, dataset): + """ + Add 'text' field to the dataset by applying the input template + or input function to each sample. + + Mirrors SFTDataset._add_text_field() — but only builds ONE + field ('text') instead of three ('text', 'prompt', 'completion'). + """ + + def add_text(example): + processed = self._preprocess_sample(example) + example["text"] = processed["text"] + return example + + dataset = dataset.map(add_text, desc="Adding text field") + return dataset + + # ------------------------------------------------------------------ + # Per-Sample Preprocessing (mirrors _preprocess_sample in SFTDataset) + # ------------------------------------------------------------------ + + def _preprocess_sample(self, example: Dict[str, Any]) -> Dict[str, str]: + """ + Applies the input template or input function to a single example + to produce the full text string. 
+ + Mirrors SFTDataset._preprocess_sample() — but returns only + {'text'} instead of {'prompt', 'completion'}. + + Args: + example (Dict[str, Any]): A single sample from the dataset. + + Returns: + Dict[str, str]: A dictionary containing the 'text' string. + """ + input_text = self.input_func(example) if self.input_func is not None else self.input_template.format(**example) + return {"text": input_text} + + # ------------------------------------------------------------------ + # Helpers (identical to SFTDataset) + # ------------------------------------------------------------------ + + def import_func(self, func_path: str) -> Callable: + """ + Dynamically import a function from a module path string. + Format: "module_path:function_name" + Identical to SFTDataset.import_func(). + """ + if ":" not in func_path: + raise ValueError("func_path must be in the format 'module_file_path:function_name'.") + module_file_path, function_name = func_path.split(":") + + try: + module = importlib.import_module(module_file_path) + except Exception: + raise RuntimeError(f"Unable to import module: {module_file_path}.") + + if not hasattr(module, function_name): + raise ValueError(f"Function {function_name} not found in module {module_file_path}.") + return getattr(module, function_name) + + def _filter_empty_or_none_samples(self, example: Dict[str, Any], relevant_columns: list) -> bool: + """ + Filter out samples where any relevant column is None or whitespace-only. + Identical to SFTDataset._filter_empty_or_none_samples(). + """ + for column in relevant_columns: + value = example.get(column) + if value is None or (isinstance(value, str) and not value.strip()): + return False + return True + + # ------------------------------------------------------------------ + # Dataset Protocol + # ------------------------------------------------------------------ + + def __len__(self) -> int: + """Returns the number of samples in the dataset.""" + return self.dataset.num_rows + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """ + Retrieves a processed sample at the given index. + + Mirrors SFTDataset.__getitem__() — but returns only {'text'} + in the raw format (no prompt/completion split). + + For seq_completion, labels = input_ids (set by the trainer/collator). + """ + if hasattr(self.dataset, "__getitem__"): + example = self.dataset[int(idx)] + else: + example = self.dataset.select(indices=[int(idx)])[0] + + if not isinstance(example, dict): + example = dict(example) + + if "input_ids" in example: + # TRL has already tokenized — return as-is + return example + + # Return raw text format + return { + "text": example.get("text", ""), + } diff --git a/QEfficient/finetune/experimental/examples/example_config.yaml b/QEfficient/finetune/experimental/examples/example_config.yaml new file mode 100644 index 0000000000..809a47ebd1 --- /dev/null +++ b/QEfficient/finetune/experimental/examples/example_config.yaml @@ -0,0 +1,60 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +# This example shows how developers can register and train on a new dataset type (seq_completion) +# via the dataset registry for other tasks like sequence‑completion or next‑token prediction tasks. 
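+# A custom builder function could be used instead of prompt_template via prompt_func,
+# given as "module_path:function_name" (illustrative; hypothetical module shown):
+#   prompt_func: "my_package.my_prompts:build_text"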
+
+# Model configuration
+model:
+  model_type: "hf"                          # Hugging Face model
+  auto_class_name: "AutoModelForCausalLM"   # Auto class to load the model with
+  model_name: "HuggingFaceTB/SmolLM-135M"   # Pretrained model name
+  use_peft: true                            # Enable PEFT (Parameter Efficient Fine-Tuning)
+  peft_config:
+    lora_r: 16
+    lora_alpha: 16
+    lora_dropout: 0
+    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"]  # Target modules for LoRA
+    task_type: "CAUSAL_LM"                  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
+    peft_type: "LORA"                       # Options: LORA, IA3, etc.
+
+
+# Dataset config for the custom registered dataset type `seq_completion`.
+# The value of `dataset_type` must match the identifier used in the
+# `@registry.dataset(...)` decorator when defining the custom dataset class.
+dataset:
+  dataset_type: "seq_completion"
+  dataset_name: "Salesforce/wikitext"
+  config_name: "wikitext-103-raw-v1"        # required — wikitext has multiple configs
+  prompt_template: "{text}"
+  train_split: "train"
+  test_split: "test"
+  seed: 42
+  dataset_num_samples: 100
+
+# Training configuration
+training:
+  type: "sft"
+  torch_dtype: "fp16"                       # Required by validate_config: one of fp16 / bf16
+  gradient_accumulation_steps: 2            # Number of steps to accumulate gradients
+  per_device_train_batch_size: 2            # Batch size per device during training
+  num_train_epochs: 2
+  torch_compile: False                      # Whether to use torch.compile
+
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"                   # Must match the lowercase registry key
+  lr: 2e-4
+
+scheduler:
+  scheduler_name: "cosine"
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3              # Number of epochs to wait before stopping training
+    early_stopping_threshold: 0.001         # Minimum change in metric to qualify as improvement
+  tensorboard:
diff --git a/QEfficient/finetune/experimental/extensions/preprocessing/__init__.py b/QEfficient/finetune/experimental/examples/example_finetune.py
similarity index 53%
rename from QEfficient/finetune/experimental/extensions/preprocessing/__init__.py
rename to QEfficient/finetune/experimental/examples/example_finetune.py
index d647b73a65..d0ed822d9c 100644
--- a/QEfficient/finetune/experimental/extensions/preprocessing/__init__.py
+++ b/QEfficient/finetune/experimental/examples/example_finetune.py
@@ -4,3 +4,12 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+
+from QEfficient.cloud.finetune_experimental import main
+from QEfficient.finetune.experimental.examples.custom_dataset import (
+    SeqCompletionDataset,  # noqa: F401 - registers SeqCompletionDataset
+)
+
+if __name__ == "__main__":
+    main()
diff --git a/QEfficient/finetune/experimental/preprocessing/alpaca_func.py b/QEfficient/finetune/experimental/preprocessing/alpaca_func.py
new file mode 100644
index 0000000000..c82c97539f
--- /dev/null
+++ b/QEfficient/finetune/experimental/preprocessing/alpaca_func.py
@@ -0,0 +1,24 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+def prompt_no_input(row):
+    return (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Response:\n"
+    ).format_map(row)
+
+
+def prompt_input(row):
+    return (
+        "Below is an instruction that describes a task, paired with an input that provides further context. 
" + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" + ).format_map(row) + + +def create_alpaca_prompt(row): + return prompt_no_input(row) if row["input"] == "" else prompt_input(row) diff --git a/QEfficient/finetune/experimental/tests/constants.py b/QEfficient/finetune/experimental/tests/constants.py new file mode 100644 index 0000000000..578a165756 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/constants.py @@ -0,0 +1,109 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Constants used across test files in the experimental finetuning pipeline. +""" + +from enum import Enum + +# ============================================================================ +# Enums +# ============================================================================ + + +class TaskType(str, Enum): + """Task types for model training.""" + + CAUSAL_LM = "CAUSAL_LM" + SEQ_CLS = "SEQ_CLS" + SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM" + + +class DatasetType(str, Enum): + """Dataset types for training.""" + + SFT_DATASET = "sft_dataset" + SEQ_COMPLETION = "seq_completion" + SEQ_CLASSIFICATION = "seq_classification" + + +class AutoClassName(str, Enum): + """Auto class names for model loading.""" + + CAUSAL_LM = "AutoModelForCausalLM" + SEQ_CLS = "AutoModelForSequenceClassification" + SEQ_2_SEQ_LM = "AutoModelForSeq2SeqLM" + + +# ============================================================================ +# Test Seeds and Ratios +# ============================================================================ + +TEST_SEED = 42 +TEST_SPLIT_RATIO = 0.8 + +# ============================================================================ +# PEFT/LoRA Configuration +# ============================================================================ + +TEST_LORA_R = 8 +TEST_LORA_ALPHA = 16 +TEST_LORA_DROPOUT = 0.1 +TEST_LORA_TARGET_MODULES_LLAMA = ["q_proj", "v_proj"] +TEST_LORA_TARGET_MODULES_BERT = ["query", "value"] +TEST_LORA_BIAS = "none" + +# ============================================================================ +# Training Parameters +# ============================================================================ + +TEST_LEARNING_RATE = 5e-5 +TEST_WEIGHT_DECAY = 0.01 +TEST_WARMUP_STEPS = 5 +TEST_NUM_TRAIN_EPOCHS = 1 +TEST_LOGGING_STEPS = 1 +TEST_PER_DEVICE_BATCH_SIZE = 1 +TEST_MAX_SEQ_LENGTH_CAUSAL = 256 +TEST_MAX_SEQ_LENGTH_SEQ_CLS = 128 +TEST_MAX_LENGTH = 128 +TEST_NUM_HIDDEN_LAYERS = 2 + +# ============================================================================ +# Dataset Paths and Names +# ============================================================================ + +# HuggingFace Dataset Names +HF_DATASET_ALPACA = "tatsu-lab/alpaca" +HF_DATASET_GSM8K = "openai/gsm8k" +HF_DATASET_GSM8K_CONFIG = "main" +HF_DATASET_IMDB = "stanfordnlp/imdb" + +# Dataset subset size for testing +TEST_DATASET_SUBSET_SIZE = 10 + +# ============================================================================ +# Model Names +# ============================================================================ + +TEST_MODEL_LLAMA = "meta-llama/Llama-3.2-1B" +TEST_MODEL_SMOLLM = "HuggingFaceTB/SmolLM-135M" + +# ============================================================================ +# Optimizer Parameters +# 
============================================================================ + +OPT_LEARNING_RATE = 1e-4 +OPT_ADAM_BETAS = (0.9, 0.999) +OPT_ADAM_EPS = 1e-8 +OPT_SGD_MOMENTUM = 0.9 + +# ============================================================================ +# Loss Parameters +# ============================================================================ + +TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD = 2.0 diff --git a/QEfficient/finetune/experimental/tests/test_callback.py b/QEfficient/finetune/experimental/tests/test_callback.py index 59ff4d1173..e085da9c9e 100644 --- a/QEfficient/finetune/experimental/tests/test_callback.py +++ b/QEfficient/finetune/experimental/tests/test_callback.py @@ -8,8 +8,7 @@ import pytest from transformers import TrainerCallback -from QEfficient.finetune.experimental.core.callbacks import create_callbacks -from QEfficient.finetune.experimental.core.component_registry import registry +from QEfficient.finetune.experimental.core.component_registry import ComponentFactory, registry class ModelSummaryCallback(TrainerCallback): @@ -46,7 +45,7 @@ def test_callbacks(callback_name): # Create callbacks using the factory config = CALLBACK_CONFIGS[callback_name] try: - callback_inst = create_callbacks(**config) + callback_inst = ComponentFactory.create_callback(**config) except ValueError as e: assert "Unknown callback" in str(e) return diff --git a/QEfficient/finetune/experimental/tests/test_config.yaml b/QEfficient/finetune/experimental/tests/test_config.yaml index e97e99d583..aab402b483 100644 --- a/QEfficient/finetune/experimental/tests/test_config.yaml +++ b/QEfficient/finetune/experimental/tests/test_config.yaml @@ -10,12 +10,10 @@ model: model_type: "hf" auto_class_name: "AutoModelForCausalLM" model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name - load_in_4bit: false use_peft: true peft_config: - lora_r: 8 - lora_alpha: 16 - lora_dropout: 0.1 + lora_r: 16 + lora_alpha: 32 target_modules: ["q_proj", "v_proj"] bias: "none" task_type: "CAUSAL_LM" @@ -28,16 +26,13 @@ dataset: # dataset_name: "Arthur-LAGACHERIE/very-smollm-corpus-0.5M" dataset_name: "knkarthick/samsum" train_split: "train" - max_seq_length: 512 + max_seq_length: 1024 split_ratio: 0.8 # Ratio for train/test split, used when only train_split is provided test_split: "test" group_by_length: True num_workers: 4 - dataloader_pin_memory: True - dataloader_persistent_workers: True - dataloader_prefetch_factor: 1 - dataloader_drop_last: False - + torch_dtype: "fp16" + # Training configuration training: type: "sft" @@ -46,25 +41,21 @@ training: seed: 42 device: "qaic" do_eval: True + torch_dtype: "fp16" eval_strategy: "epoch" eval_steps: 100 - per_device_train_batch_size: 1 per_device_eval_batch_size: 1 gradient_accumulation_steps: 1 num_train_epochs: 1 max_steps: -1 - log_level: "info" log_on_each_node: True logging_strategy: "steps" logging_steps: 10 - save_strategy: "epoch" save_total_limit: 5 metric_for_best_model: "eval_loss" - - dtype: "fp16" completion_only_loss: True report_to: "trackio" @@ -101,4 +92,3 @@ callbacks: early_stopping_patience: 3 early_stopping_threshold: 0.001 tensorboard: - diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py index fd2abfd482..69d2db92af 100644 --- a/QEfficient/finetune/experimental/tests/test_config_manager.py +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -4,13 +4,20 @@ # SPDX-License-Identifier: BSD-3-Clause # # 
----------------------------------------------------------------------------- - - from pathlib import Path import pytest -from QEfficient.finetune.experimental.core.config_manager import ConfigManager, parse_arguments +from QEfficient.finetune.experimental.core.config_manager import ( + ConfigManager, + DatasetConfig, + MasterConfig, + ModelConfig, + OptimizerConfig, + PeftConfig, + SchedulerConfig, + TrainingConfig, +) @pytest.fixture @@ -19,15 +26,103 @@ def config_path() -> Path: return (here / "test_config.yaml").resolve() -def test_config(config_path): - master_config = parse_arguments(args=[]) +def create_master_config( + output_dir: str, +) -> MasterConfig: + """ + Args: + model_config: Test model configuration + dataset_config: Test dataset configuration + output_dir: Output directory for training results + + Returns: + MasterConfig instance + """ + + return MasterConfig( + model=ModelConfig( + model_name="HuggingFaceTB/SmolLM-135M", + model_type="hf", + auto_class_name="AutoModelForCausalLM", + use_peft=True, + use_cache=False, + device_map=None, + peft_config=PeftConfig( + lora_r=8, + lora_alpha=16, + lora_dropout=0.05, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + peft_type="LORA", + ), + ), + dataset=DatasetConfig( + tokenizer_name="HuggingFaceTB/SmolLM-135M", + dataset_type="sft_dataset", + dataset_name="openai/gsm8k", + max_seq_length=512, + train_batch_size=1, + prompt_template="Question: {question}\nAnswer: ", + completion_template="{answer}", + config_name="main", + ), + optimizers=OptimizerConfig( + optimizer_name="adamw", + ), + scheduler=SchedulerConfig( + scheduler_name="cosine", + warmup_steps=1, + ), + training=TrainingConfig( + type="sft", # Using the "type" field from TrainingConfig + output_dir=output_dir, + num_train_epochs=1, + per_device_train_batch_size=1, + per_device_eval_batch_size=1, + ), + ) + + +def test_default_config(): + config_manager = ConfigManager() + assert config_manager is not None + assert config_manager.config is not None + + +def test_config_values(config_path): + config_manager = ConfigManager(config_path=config_path) + assert config_manager.config is not None + assert config_manager.config.model["model_name"] == "HuggingFaceTB/SmolLM-135M" + assert config_manager.config.model["peft_config"]["lora_dropout"] == 0.1 + assert config_manager.config.model["peft_config"]["lora_r"] == 16 + assert config_manager.config.dataset["dataset_name"] == "knkarthick/samsum" + assert config_manager.config.training["output_dir"] == "./training_results" + assert config_manager.config.training["per_device_train_batch_size"] == 1 + assert config_manager.config.training["num_train_epochs"] == 1 + assert not config_manager.config.training["gradient_checkpointing_kwargs"]["use_reentrant"] + + +def test_config_missing_file(): + with pytest.raises(FileNotFoundError): + ConfigManager(config_path="non_existent_file.yaml") + + +def test_config_created_from_obj(): + master_config = create_master_config(output_dir="./test_output") config_manager = ConfigManager(master_config) + config = config_manager.config + assert config is not None + assert config.model is not None + assert config.dataset is not None + assert config.training is not None + assert config.optimizers is not None + assert config.scheduler is not None + + +def test_config(config_path): + config_manager = ConfigManager(config_path=config_path) assert isinstance(config_manager, ConfigManager) - config_manager.load_config(config_path) - try: - config_manager.validate_config() 
- except Exception as e: - pytest.fail(f"Config validation failed with error: {e}") # Test that all required fields are present missing = [ @@ -60,3 +155,30 @@ def test_config(config_path): assert optimizer_config is not None assert isinstance(optimizer_config, dict) assert (hasattr(optimizer_config, attr) for attr in ("optimizer_name", "lr")) + + +def test_torch_dtype_validation(): + """Test that torch_dtype validation works correctly.""" + # Test with default config - should have torch_dtype set to fp16 by default + config_manager = ConfigManager() + training_config = config_manager.get_training_config() + assert training_config.get("torch_dtype") == "fp16" + + # Validation should pass with default config + config_manager.validate_config() # Should not raise + + +def test_torch_dtype_invalid(): + """Test that invalid torch_dtype raises validation error.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + # Create config with invalid torch_dtype + training_config = TrainingConfig(torch_dtype="invalid_dtype") + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + # Validation should fail + with pytest.raises(ValueError) as exc_info: + config_manager.validate_config() + + assert "torch_dtype must be one of" in str(exc_info.value) diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py index ca2fc14505..d6dc5729cb 100644 --- a/QEfficient/finetune/experimental/tests/test_dataset.py +++ b/QEfficient/finetune/experimental/tests/test_dataset.py @@ -67,25 +67,54 @@ def tearDown(self): def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder, mock_load): """Test loading from HuggingFace dataset with templates using mocked data.""" # Create mock dataset with dummy data - mock_dataset = MagicMock() - mock_dataset.column_names = ["text", "label"] - mock_dataset.num_rows = 3 - - # Mock the select method to return individual samples - def mock_select(indices): - sample_data = [ - {"text": "Sample text 1", "label": "Label 1"}, - {"text": "Sample text 2", "label": "Label 2"}, - {"text": "Sample text 3", "label": "Label 3"}, - ] - return [sample_data[indices[0]]] - - mock_dataset.select = mock_select - mock_dataset.filter = lambda func: mock_dataset # Return self for filtering - - # Mock train_test_split to return a dict with train/test splits - mock_split_result = {"train": mock_dataset, "test": mock_dataset} - mock_dataset.train_test_split = lambda test_size, seed: mock_split_result + sample_data = [ + {"text": "Sample text 1", "label": "Label 1"}, + {"text": "Sample text 2", "label": "Label 2"}, + {"text": "Sample text 3", "label": "Label 3"}, + ] + + processed_samples_container = [None] + + def create_mock_dataset(): + mock_dataset = MagicMock() + mock_dataset.column_names = ["text", "label"] + mock_dataset.num_rows = 3 + + # Mock __getitem__ to return processed samples + def mock_getitem(self, idx): + if processed_samples_container[0] is not None: + return processed_samples_container[0][idx] + # Before map, return raw data + return sample_data[idx] + + mock_dataset.__getitem__ = mock_getitem + + # Mock the select method + def mock_select(indices): + idx = indices[0] if isinstance(indices, list) else indices + if processed_samples_container[0] is not None: + return [processed_samples_container[0][idx]] + return [sample_data[idx]] + + mock_dataset.select = mock_select + mock_dataset.filter = lambda 
func: mock_dataset # Return self for filtering + + # Mock map to apply the function and update processed_samples + def mock_map(func, desc=None): + # Apply the function to all samples + processed_samples_container[0] = [func(sample.copy()) for sample in sample_data] + # Return a new mock dataset with processed data + return create_mock_dataset() + + mock_dataset.map = mock_map + + # Mock train_test_split to return a dict with train/test splits + mock_split_result = {"train": mock_dataset, "test": mock_dataset} + mock_dataset.train_test_split = lambda test_size, seed: mock_split_result + + return mock_dataset + + mock_dataset = create_mock_dataset() # Mock the dataset builder to indicate multiple splits are available mock_info = MagicMock() @@ -260,18 +289,15 @@ def test_sft_dataset_no_prompt_template_or_func(self): self.assertIn("Either provide prompt_template or prompt_func", str(context.exception)) def test_sft_dataset_both_prompt_template_and_func(self): - """Test error when both prompt_template and prompt_func are provided.""" - with self.assertRaises(RuntimeError) as context: - SFTDataset( - dataset_name="dummy", - split="train", - json_file_path=self.json_file_path, - prompt_template="Q: {question}", - prompt_func="module:function", - completion_template="A: {answer}", - ) - - self.assertIn("Either provide prompt_template or prompt_func", str(context.exception)) + """Test when both prompt_template and prompt_func are provided.""" + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + prompt_func="module:function", + completion_template="A: {answer}", + ) def test_sft_dataset_no_completion_template_or_func(self): """Test error when neither completion_template nor completion_func is provided.""" @@ -289,20 +315,14 @@ def test_sft_dataset_no_completion_template_or_func(self): ) def test_sft_dataset_both_completion_template_and_func(self): - """Test error when both completion_template and completion_func are provided.""" - with self.assertRaises(RuntimeError) as context: - SFTDataset( - dataset_name="dummy", - split="train", - json_file_path=self.json_file_path, - prompt_template="Q: {question}", - completion_template="A: {answer}", - completion_func="module:function", - ) - - self.assertIn( - "Either provide completion_template or completion_func", - str(context.exception), + """Test when both completion_template and completion_func are provided.""" + SFTDataset( + dataset_name="dummy", + split="train", + json_file_path=self.json_file_path, + prompt_template="Q: {question}", + completion_template="A: {answer}", + completion_func="module:function", ) def test_sft_dataset_invalid_func_path_format(self): @@ -494,13 +514,14 @@ def test_sft_dataset_invalid_split(self, mock_builder, mock_load): """Test error when requesting an invalid split.""" # Mock the dataset builder to return specific splits mock_info = MagicMock() - mock_info.splits = {"train": MagicMock(), "validation": MagicMock()} + mock_info.splits = {"test": MagicMock(), "validation": MagicMock()} mock_builder.return_value.info = mock_info with self.assertRaises(ValueError) as context: SFTDataset( - dataset_name="dummy_dataset", - split="nonexistent_split", + dataset_name="dummy", + split="train", + split_ratio=SPLIT_RATIO, prompt_template="Q: {question}", completion_template="A: {answer}", ) diff --git a/QEfficient/finetune/experimental/tests/test_finetune.py b/QEfficient/finetune/experimental/tests/test_finetune.py new file mode 100644 index 
0000000000..8e3ead3e98 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_finetune.py @@ -0,0 +1,425 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +MODULE = "QEfficient.cloud.finetune_experimental" + +FineTuningPipeline = __import__(MODULE, fromlist=["FineTuningPipeline"]).FineTuningPipeline + + +# ---------- Fixtures ---------- + + +@pytest.fixture +def tmp_outdir(tmp_path): + return tmp_path / "out" + + +@pytest.fixture +def mock_config_manager(mocker, tmp_outdir): + """ + Minimal ConfigManager double: + - .config.training is dict-like with 'output_dir' + """ + cm = mocker.MagicMock(name="ConfigManager") + cm.config = mocker.MagicMock() + cm.config.training = {"output_dir": str(tmp_outdir)} + return cm + + +@pytest.fixture +def mock_logger(mocker): + """ + Patch the module-level logger used inside the pipeline. + """ + logger = __import__(MODULE, fromlist=["logger"]).logger + # Ensure log_rank_zero exists and is mockable + mocker.patch.object(logger, "log_rank_zero", autospec=True) + return logger + + +@pytest.fixture +def training_config_stub(mocker): + """ + Patch prepare_training_config to avoid side effects and make it assertable. + """ + return_value = {"some_training_key": "some_training_value"} + patcher = mocker.patch( + f"{MODULE}.prepare_training_config", + autospec=True, + return_value=return_value, + ) + return patcher, return_value + + +@pytest.fixture +def model_bundle(mocker): + """ + A tiny 'model instance' object that the pipeline expects from _create_model(). + Must have .model and .tokenizer attributes. 
+ """ + bundle = mocker.MagicMock(name="ModelBundle") + bundle.model = mocker.MagicMock(name="model") + bundle.tokenizer = mocker.MagicMock(name="tokenizer") + return bundle + + +# ---------- Tests ---------- + + +def test_initialization( + mocker, + mock_config_manager, + mock_logger, + training_config_stub, + model_bundle, +): + # patch all internal factory steps to isolate the constructor + patch_prepare_training_config, training_cfg = training_config_stub + + mock_setup_env = mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + + train_ds = mocker.MagicMock(name="train_dataset") + eval_ds = mocker.MagicMock(name="eval_dataset") + mock_create_datasets = mocker.patch.object( + FineTuningPipeline, + "_create_datasets", + autospec=True, + return_value=(train_ds, eval_ds), + ) + + mock_create_model = mocker.patch.object( + FineTuningPipeline, + "_create_model", + autospec=True, + return_value=model_bundle, + ) + + optim_cls = mocker.MagicMock(name="OptimizerClass") + optim_kwargs = {"lr": 1e-4} + mock_create_optimizer = mocker.patch.object( + FineTuningPipeline, + "_create_optimizer", + autospec=True, + return_value=(optim_cls, optim_kwargs), + ) + + callbacks = [mocker.MagicMock(name="Callback")] + mock_create_callbacks = mocker.patch.object( + FineTuningPipeline, + "_create_callbacks", + autospec=True, + return_value=callbacks, + ) + + trainer_obj = mocker.MagicMock(name="Trainer") + mock_create_trainer = mocker.patch.object( + FineTuningPipeline, + "_create_trainer", + autospec=True, + return_value=trainer_obj, + ) + pipeline = FineTuningPipeline(mock_config_manager) + + # Assert: environment + training config prepared + mock_setup_env.assert_called_once() + patch_prepare_training_config.assert_called_once_with(config_manager=mock_config_manager) + assert pipeline.training_config == training_cfg + + # Assert: datasets created and assigned + mock_create_datasets.assert_called_once() + assert pipeline.train_dataset is train_ds + assert pipeline.eval_dataset is eval_ds + + # Assert: model/tokenizer assigned + mock_create_model.assert_called_once() + assert pipeline.model is model_bundle.model + assert pipeline.tokenizer is model_bundle.tokenizer + + # Assert: optimizer + callbacks + mock_create_optimizer.assert_called_once() + mock_create_callbacks.assert_called_once() + assert pipeline.optimizer_cls_and_kwargs == (optim_cls, optim_kwargs) + assert pipeline.callbacks == callbacks + + # Assert: trainer constructed with expected wiring + mock_create_trainer.assert_called_once_with( + mocker.ANY, # self (bound by autospec) + model=model_bundle.model, + tokenizer=model_bundle.tokenizer, + train_dataset=train_ds, + eval_dataset=eval_ds, + optimizer_cls_and_kwargs=(optim_cls, optim_kwargs), + callbacks=callbacks, + training_config=training_cfg, + ) + assert pipeline.trainer is trainer_obj + + # Assert: logger calls + lr0 = mock_logger.log_rank_zero + expected_msgs = [ + mocker.call("Creating datasets..."), + mocker.call("Loading model and tokenizer..."), + mocker.call("Preparing optimizer..."), + mocker.call("Creating callbacks..."), + mocker.call("Initializing trainer..."), + ] + lr0.assert_has_calls(expected_msgs, any_order=False) + + +# ---------- Tests: individual steps / behaviors ---------- + + +def test_setup_environment_called_and_output_dir_set(mocker, mock_config_manager, tmp_outdir): + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, 
None)) + mocker.patch.object( + FineTuningPipeline, "_create_model", autospec=True, return_value=mocker.MagicMock(model=None, tokenizer=None) + ) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=mocker.MagicMock()) + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value={}) + + pipe = FineTuningPipeline(mock_config_manager) + + # Assert + assert Path(pipe.output_dir) == Path(tmp_outdir) + + +@pytest.mark.parametrize( + "train_split,test_split,expected_train_split,expected_test_split", + [ + ("train", "test", "train", "test"), # Default splits + ("training", "testing", "training", "testing"), # Custom splits + ], +) +def test_create_datasets_called_and_assigned( + mocker, + mock_config_manager, + train_split, + test_split, + expected_train_split, + expected_test_split, +): + """Test dataset creation with default and custom split names.""" + mocker.patch( + f"{MODULE}.prepare_training_config", + autospec=True, + return_value={"fp16": True, "torch_dtype": "fp16"}, + ) + + mock_config_manager.config.training = { + "output_dir": "tmp_outdir", + "seed": 42, + } + + mock_config_manager.get_dataset_config.return_value = { + "dataset_type": "sft_dataset", + "dataset_name": "test_dataset", + "train_split": train_split, + "test_split": test_split, + } + + train_ds = MagicMock(name="train_ds") + eval_ds = MagicMock(name="eval_ds") + + def create_dataset_side_effect(*args, **kwargs): + split = kwargs.get("split") + if split is None and args: + split = args[0] + split = split or "" + return train_ds if expected_train_split in split else eval_ds + + with patch(f"{MODULE}.ComponentFactory") as mock_factory: + mock_factory.create_dataset.side_effect = create_dataset_side_effect + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + bundle = MagicMock(model=mocker.MagicMock(), tokenizer=mocker.MagicMock()) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=bundle) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=mocker.MagicMock()) + + pipeline = FineTuningPipeline(mock_config_manager) + assert pipeline.train_dataset == train_ds + assert pipeline.eval_dataset == eval_ds + calls = mock_factory.create_dataset.call_args_list + assert len(calls) == 2, f"Expected two calls (train/test), got {len(calls)}: {calls}" + assert calls[0].kwargs["split"] == expected_train_split + assert calls[1].kwargs["split"] == expected_test_split + assert calls[0].kwargs["seed"] == 42 + assert calls[0].kwargs["dataset_type"] == "sft_dataset" + assert calls[0].kwargs["dataset_name"] == "test_dataset" + + +def test_create_model_failure_stops_pipeline(mocker, mock_config_manager): + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value={}) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, None)) + + mock_create_model = mocker.patch.object( + FineTuningPipeline, "_create_model", autospec=True, side_effect=RuntimeError("model load failed") 
+ ) + mock_create_optimizer = mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True) + mock_create_callbacks = mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True) + mock_create_trainer = mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True) + + with pytest.raises(RuntimeError, match="model load failed"): + _ = FineTuningPipeline(mock_config_manager) + + mock_create_model.assert_called_once() + mock_create_optimizer.assert_not_called() + mock_create_callbacks.assert_not_called() + mock_create_trainer.assert_not_called() + + +def test_trainer_receives_expected_arguments(mocker, mock_config_manager, model_bundle): + training_cfg = {"epochs": 1} + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value=training_cfg) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + + train_ds = mocker.MagicMock(name="T") + eval_ds = mocker.MagicMock(name="E") + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(train_ds, eval_ds)) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=model_bundle) + + optim_cls = object() + optim_kwargs = {"weight_decay": 0.01} + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(optim_cls, optim_kwargs)) + + callbacks = [mocker.MagicMock()] + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=callbacks) + + trainer_obj = mocker.MagicMock(name="Trainer") + mocked_trainer = mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=trainer_obj) + + pipe = FineTuningPipeline(mock_config_manager) + + # Assert: _create_trainer wiring + mocked_trainer.assert_called_once_with( + mocker.ANY, + model=model_bundle.model, + tokenizer=model_bundle.tokenizer, + train_dataset=train_ds, + eval_dataset=eval_ds, + optimizer_cls_and_kwargs=(optim_cls, optim_kwargs), + callbacks=callbacks, + training_config=training_cfg, + ) + assert pipe.trainer is trainer_obj + + +def test_create_datasets_failure_stops_pipeline(mocker, mock_config_manager): + """ + If _create_datasets raises, pipeline should not proceed to model/optimizer/trainer. + """ + + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value={}) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + + mock_create_datasets = mocker.patch.object( + FineTuningPipeline, + "_create_datasets", + autospec=True, + side_effect=RuntimeError("dataset failure"), + ) + + mock_create_model = mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True) + mock_create_optimizer = mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True) + mock_create_callbacks = mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True) + mock_create_trainer = mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True) + + with pytest.raises(RuntimeError, match="dataset failure"): + _ = FineTuningPipeline(mock_config_manager) + + mock_create_datasets.assert_called_once() + mock_create_model.assert_not_called() + mock_create_optimizer.assert_not_called() + mock_create_callbacks.assert_not_called() + mock_create_trainer.assert_not_called() + + +def test_create_trainer_failure_stops_pipeline(mocker, mock_config_manager): + """ + If _create_trainer raises, ensure earlier steps ran and no further actions are taken. 
+ """ + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value={}) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + + train_ds = mocker.MagicMock(name="train_ds") + eval_ds = mocker.MagicMock(name="eval_ds") + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(train_ds, eval_ds)) + + bundle = mocker.MagicMock(name="ModelBundle") + bundle.model = mocker.MagicMock(name="model") + bundle.tokenizer = mocker.MagicMock(name="tokenizer") + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=bundle) + + optim_cls = mocker.MagicMock(name="OptimClass") + optim_kwargs = {"lr": 1e-4} + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(optim_cls, optim_kwargs)) + + callbacks = [mocker.MagicMock(name="Callback")] + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=callbacks) + + mock_create_trainer = mocker.patch.object( + FineTuningPipeline, + "_create_trainer", + autospec=True, + side_effect=RuntimeError("trainer init failed"), + ) + + with pytest.raises(RuntimeError, match="trainer init failed"): + _ = FineTuningPipeline(mock_config_manager) + + mock_create_trainer.assert_called_once() + + +def test_config_manager_used_and_output_dir_set(mocker, mock_config_manager, tmp_outdir): + """ + Ensure prepare_training_config is called with the provided config_manager + and that output_dir is read from config.training. + """ + training_cfg = {"epochs": 1} + patch_prep = mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value=training_cfg) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, None)) + bundle = mocker.MagicMock(model=None, tokenizer=None) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=bundle) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=mocker.MagicMock()) + + pipe = FineTuningPipeline(mock_config_manager) + + patch_prep.assert_called_once_with(config_manager=mock_config_manager) + assert pipe.training_config == training_cfg + assert Path(pipe.output_dir) == Path(tmp_outdir) + + +def test_complete_run_calls_trainer_train(mocker, mock_config_manager): + """ + Tests trainer.train() is called during run(). + This is a basic smoke test for the main execution flow. 
+ """ + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value={}) + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, None)) + bundle = mocker.MagicMock(model=mocker.MagicMock(), tokenizer=mocker.MagicMock()) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=bundle) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + trainer_obj = mocker.MagicMock() + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=trainer_obj) + + pipe = FineTuningPipeline(mock_config_manager) + pipe.run() + trainer_obj.train.assert_called_once() diff --git a/QEfficient/finetune/experimental/tests/test_integrated.py b/QEfficient/finetune/experimental/tests/test_integrated.py new file mode 100644 index 0000000000..d13d237bc7 --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_integrated.py @@ -0,0 +1,368 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +End-to-end integration tests for the new experimental finetuning pipeline. +Tests the complete workflow using all components from the core/ directory. +""" + +import os +import shutil +import tempfile +from dataclasses import dataclass +from typing import Optional + +import pytest +import torch + +from QEfficient.cloud.finetune_experimental import FineTuningPipeline +from QEfficient.finetune.experimental.core.config_manager import ( + ConfigManager, + DatasetConfig, + MasterConfig, + ModelConfig, + OptimizerConfig, + PeftConfig, + SchedulerConfig, + TrainingConfig, +) +from QEfficient.finetune.experimental.core.logger import Logger +from QEfficient.finetune.experimental.tests.constants import ( + HF_DATASET_ALPACA, + HF_DATASET_GSM8K, + HF_DATASET_GSM8K_CONFIG, + HF_DATASET_IMDB, + TEST_DATASET_SUBSET_SIZE, + TEST_LEARNING_RATE, + TEST_LOGGING_STEPS, + TEST_LORA_ALPHA, + TEST_LORA_BIAS, + TEST_LORA_DROPOUT, + TEST_LORA_R, + TEST_LORA_TARGET_MODULES_BERT, + TEST_LORA_TARGET_MODULES_LLAMA, + TEST_MAX_SEQ_LENGTH_CAUSAL, + TEST_MAX_SEQ_LENGTH_SEQ_CLS, + TEST_MODEL_LLAMA, + TEST_NUM_HIDDEN_LAYERS, + TEST_NUM_TRAIN_EPOCHS, + TEST_PER_DEVICE_BATCH_SIZE, + TEST_SEED, + TEST_WARMUP_STEPS, + TEST_WEIGHT_DECAY, + TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD, + AutoClassName, + DatasetType, + TaskType, +) + +logger = Logger(__name__) +# ============================================================================ +# Test Configuration Dataclasses +# ============================================================================ + + +@dataclass +class TestModelConfig: + """Dataclass for test model configuration.""" + + model_name: str + task_type: TaskType + use_peft: bool + target_modules: list[str] + + +@dataclass +class TestDatasetConfig: + """Dataclass for test dataset configuration.""" + + dataset_name: str + hf_dataset_name: str + hf_dataset_config: Optional[str] + prompt_template: str + completion_template: str + max_seq_length: int + + +@dataclass +class TestTrainingConfig: + """Dataclass for test training configuration.""" + + max_eval_step: int + max_train_step: int + 
config_name: str + + +# ============================================================================ +# Test Configuration Constants +# ============================================================================ + +# Model configurations +LLAMA_MODEL_CONFIG = TestModelConfig( + model_name=TEST_MODEL_LLAMA, + task_type=TaskType.CAUSAL_LM, + use_peft=True, + target_modules=TEST_LORA_TARGET_MODULES_LLAMA, +) + +BERT_MODEL_CONFIG = TestModelConfig( + model_name="google-bert/bert-base-uncased", + task_type=TaskType.SEQ_CLS, + use_peft=False, + target_modules=TEST_LORA_TARGET_MODULES_BERT, +) + +# Dataset configurations +GSM8K_DATASET_CONFIG = TestDatasetConfig( + dataset_name="openai/gsm8k", + hf_dataset_name=HF_DATASET_GSM8K, + hf_dataset_config=HF_DATASET_GSM8K_CONFIG, + prompt_template="Question: {question}\nAnswer: ", + completion_template="{answer}", + max_seq_length=TEST_MAX_SEQ_LENGTH_CAUSAL, +) + +ALPACA_DATASET_CONFIG = TestDatasetConfig( + dataset_name="yahma/alpaca-cleaned", + hf_dataset_name=HF_DATASET_ALPACA, + hf_dataset_config=None, + prompt_template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", + completion_template="{output}", + max_seq_length=TEST_MAX_SEQ_LENGTH_CAUSAL, +) + +IMDB_DATASET_CONFIG = TestDatasetConfig( + dataset_name="imdb", + hf_dataset_name=HF_DATASET_IMDB, + hf_dataset_config=None, + prompt_template="Review: {text}\nSentiment: ", + completion_template="{label}", + max_seq_length=TEST_MAX_SEQ_LENGTH_SEQ_CLS, +) + +# ============================================================================ +# Helper Functions +# ============================================================================ + + +def create_master_config( + model_config: TestModelConfig, + dataset_config: TestDatasetConfig, + output_dir: str, +) -> MasterConfig: + """ + Create a MasterConfig instance from test configurations. 
+ + Args: + model_config: Test model configuration + dataset_config: Test dataset configuration + output_dir: Output directory for training results + + Returns: + MasterConfig instance + """ + # Determine auto_class_name and dataset_type based on task type + if model_config.task_type == TaskType.CAUSAL_LM: + auto_class_name = AutoClassName.CAUSAL_LM.value + dataset_type = DatasetType.SFT_DATASET.value + elif model_config.task_type == TaskType.SEQ_CLS: + auto_class_name = AutoClassName.SEQ_CLS.value + dataset_type = DatasetType.SFT_DATASET.value + else: + raise ValueError(f"Unsupported task type: {model_config.task_type}") + return MasterConfig( + model=ModelConfig( + model_name=model_config.model_name, + model_type="hf", + auto_class_name=auto_class_name, + use_peft=model_config.use_peft, + use_cache=False, + attn_implementation="eager", + device_map=None, + peft_config=PeftConfig( + lora_r=TEST_LORA_R, + lora_alpha=TEST_LORA_ALPHA, + lora_dropout=TEST_LORA_DROPOUT, + target_modules=model_config.target_modules, + bias=TEST_LORA_BIAS, + task_type=model_config.task_type.value, + peft_type="LORA", + ) + if model_config.use_peft + else None, + ), + dataset=DatasetConfig( + tokenizer_name=model_config.model_name, + dataset_type=dataset_type, + dataset_name=dataset_config.dataset_name, + max_seq_length=dataset_config.max_seq_length, + train_batch_size=TEST_PER_DEVICE_BATCH_SIZE, + eval_batch_size=TEST_PER_DEVICE_BATCH_SIZE, + prompt_template=dataset_config.prompt_template, + completion_template=dataset_config.completion_template, + num_workers=1, + test_split="train", + config_name=dataset_config.hf_dataset_config, + dataset_num_samples=TEST_DATASET_SUBSET_SIZE, + ), + optimizers=OptimizerConfig( + optimizer_name="adamw", + lr=TEST_LEARNING_RATE, + weight_decay=TEST_WEIGHT_DECAY, + ), + scheduler=SchedulerConfig( + scheduler_name="cosine", + warmup_steps=TEST_WARMUP_STEPS, + ), + training=TrainingConfig( + type="sft", # Using the "type" field from TrainingConfig + output_dir=output_dir, + num_train_epochs=TEST_NUM_TRAIN_EPOCHS, + per_device_train_batch_size=TEST_PER_DEVICE_BATCH_SIZE, + per_device_eval_batch_size=TEST_PER_DEVICE_BATCH_SIZE, + logging_steps=TEST_LOGGING_STEPS, + save_strategy="no", + eval_strategy="no", + seed=TEST_SEED, + ), + ) + + +def run_training(trainer, config_name: str): + """ + Run training and return results. + + Args: + trainer: Trainer instance + config_name: Configuration name for logging + + Returns: + Training result, Evaluation result + """ + logger.info(f"Starting training for {config_name}...") + train_result = trainer.train() + logger.info(f"Training completed for {config_name}!") + logger.info(f"Starting evaluation for {config_name}...") + eval_result = trainer.evaluate() + logger.info(f"Evaluation completed for {config_name}!") + + return train_result, eval_result + + +def verify_training_results(train_result, eval_result): + """ + Verify training results. + + Args: + train_result: Training result object + eval_result: Evaluation result dictionary + """ + assert train_result is not None + assert hasattr(train_result, "training_loss") + assert "eval_loss" in eval_result + logger.info(f"Training loss: {train_result.training_loss:.4f}") + logger.info(f"Evaluation loss: {eval_result['eval_loss']:.4f}") + assert abs(train_result.training_loss - eval_result["eval_loss"]) < TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD + + +def run_inference_causal_lm(model, tokenizer): + """ + Run inference for causal language models. 
+ + Args: + model: Model instance + tokenizer: Tokenizer instance + """ + test_prompt = "Test prompt for generation." + texts = tokenizer(test_prompt, return_tensors="pt") + texts = texts.to(model.device) + with torch.inference_mode(): + outputs = model.generate( + **texts, + temperature=0.4, + max_new_tokens=10, + do_sample=False, + ) + generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) + logger.info(f"Generated text: {generated_text}") + + +# ============================================================================ +# Test Classes +# ============================================================================ + + +class TestCausalLMIntegration: + """Integration tests for Causal Language Modeling tasks.""" + + def setup_method(self): + """Setup method executed before each test.""" + self.test_output_dir = tempfile.mkdtemp(prefix="test_ft_causal_lm_") + logger.info(f"Created test directory: {self.test_output_dir}") + + def teardown_method(self): + """Teardown method executed after each test.""" + if os.path.exists(self.test_output_dir): + try: + shutil.rmtree(self.test_output_dir) + logger.info(f"Cleaned up test directory: {self.test_output_dir}") + except Exception as e: + logger.warning(f"Warning: Failed to clean up {self.test_output_dir}: {e}") + + @pytest.mark.parametrize( + "dataset_config,config_name", + [ + pytest.param( + GSM8K_DATASET_CONFIG, + "llama_3.2_1B_gsm8k", + id="llama_gsm8k", + ), + pytest.param( + ALPACA_DATASET_CONFIG, + "llama_3.2_1B_alpaca", + id="llama_alpaca", + ), + ], + ) + def test_llama_causal_lm(self, dataset_config: TestDatasetConfig, config_name: str): + """ + Test Llama model with different datasets for causal language modeling. + + Args: + dataset_config: Dataset configuration + config_name: Configuration name for logging + """ + # Create master configuration + master_config = create_master_config( + model_config=LLAMA_MODEL_CONFIG, + dataset_config=dataset_config, + output_dir=self.test_output_dir, + ) + config_manager = ConfigManager(master_config) + model_config = config_manager.get_model_config() + # for fast testing + model_config["num_hidden_layers"] = TEST_NUM_HIDDEN_LAYERS + pipeline = FineTuningPipeline(config_manager) + model, tokenizer = pipeline.get_model_and_tokenizer() + trainer = pipeline.get_trainer() + # Verify model and tokenizer are loaded correctly + assert model is not None, "Model should be loaded" + assert tokenizer is not None, "Tokenizer should be loaded" + assert hasattr(model, "generate"), "Model should have generate method" + assert hasattr(tokenizer, "decode"), "Tokenizer should have decode method" + logger.info(f"Model and tokenizer loaded successfully for {config_name}") + # Verify model parameters + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"Total parameters: {total_params:,}") + # Run training + train_result, eval_result = run_training(trainer, config_name) + + # Verify training results + verify_training_results(train_result, eval_result) + + # Test inference + run_inference_causal_lm(model, tokenizer) diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py index 0af0c8b512..d976dc5c0a 100644 --- a/QEfficient/finetune/experimental/tests/test_logger.py +++ b/QEfficient/finetune/experimental/tests/test_logger.py @@ -48,6 +48,7 @@ def test_init_with_file(self, tmp_path): def test_log_levels(self, caplog): """Test all log levels work correctly""" logger = Logger("level_test_logger", level=logging.DEBUG) + 
logger.logger.propagate = True
 
         with caplog.at_level(logging.DEBUG):
             logger.debug("Debug message")
@@ -63,22 +64,24 @@ def test_log_levels(self, caplog):
         assert "Error message" in caplog.text
         assert "Critical message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_positive_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_positive_case(self, mock_get_global_rank, caplog):
         """Test rank zero logging functionality"""
-        mock_get_local_rank.return_value = 0
+        mock_get_global_rank.return_value = True
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Rank zero message")
 
         assert "Rank zero message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_negative_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_negative_case(self, mock_get_global_rank, caplog):
         """Test to verify that only rank‑zero messages are logged"""
-        mock_get_local_rank.return_value = 1
+        mock_get_global_rank.return_value = False
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Should not appear")
@@ -88,6 +91,7 @@ def test_log_rank_zero_negative_case(self, mock_get_local_rank, caplog):
     def test_log_exception_raise(self, caplog):
         """Test exception logging with raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with pytest.raises(ValueError), caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=True)
@@ -99,6 +103,7 @@ def test_log_exception_raise(self, caplog):
     def test_log_exception_no_raise(self, caplog):
         """Test exception logging without raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=False)
@@ -168,7 +173,7 @@ def test_get_logger_with_file(self, tmp_path):
 
         # Check that we have 2 handlers (console + file)
         assert len(logger.logger.handlers) == 2  # Console + file
-        assert isinstance(logger.logger.handlers[1], logging.FileHandler)
+        assert any(isinstance(h, logging.FileHandler) for h in logger.logger.handlers)
 
         # Check file exists
         assert log_file.exists()
@@ -188,6 +193,7 @@ def test_complete_workflow(self, tmp_path, caplog):
         # Setup
         log_file = tmp_path / "workflow.log"
         logger = Logger("workflow_test", str(log_file), logging.DEBUG)
+        logger.logger.propagate = True
 
         # Test all methods
         logger.debug("Debug test")
@@ -203,8 +209,8 @@ def test_complete_workflow(self, tmp_path, caplog):
             logger.log_exception("Caught exception", e, raise_exception=False)
 
         # Test rank zero logging
-        with patch("QEfficient.finetune.experimental.core.logger.get_local_rank") as mock_rank:
-            mock_rank.return_value = 0
+        with patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero") as mock_rank:
+            mock_rank.return_value = True
             logger.log_rank_zero("Rank zero test")
 
         # Verify all messages were logged
diff --git a/QEfficient/finetune/experimental/tests/test_optimizer.py b/QEfficient/finetune/experimental/tests/test_optimizer.py
index e105d5ddf9..54c8494ceb 100644
--- a/QEfficient/finetune/experimental/tests/test_optimizer.py
+++ b/QEfficient/finetune/experimental/tests/test_optimizer.py
@@ -15,8 +15,8 @@
 from QEfficient.finetune.experimental.core.optimizer import prepare_optimizer
 
 OPTIMIZER_CONFIGS = {
-    "Adam": {
-        "optimizer_name": "Adam",
+    "adam": {
+        "optimizer_name": "adam",
         "opt_cls": optim.Adam,
         "lr": 1e-4,
         "weight_decay": 0.01,
@@ -24,7 +24,7 @@
         "eps": 1e-8,
         "amsgrad": False,
     },
-    "AdamW": {
+    "adamw": {
         "optimizer_name": "AdamW",
         "opt_cls": optim.AdamW,
         "lr": 1e-4,
@@ -33,8 +33,8 @@
         "eps": 1e-8,
         "amsgrad": False,
     },
-    "SGD": {
-        "optimizer_name": "SGD",
+    "sgd": {
+        "optimizer_name": "sgd",
         "opt_cls": optim.SGD,
         "lr": 1e-4,
         "momentum": 0.9,
@@ -42,15 +42,15 @@
         "dampening": 0.0,
         "nesterov": False,
     },
-    "RMSprop": {
-        "optimizer_name": "RMSprop",
+    "rmsprop": {
+        "optimizer_name": "rmsprop",
         "opt_cls": optim.RMSprop,
     },
 }
 
 REGISTRY_CONFIG = {
-    "RMSprop": {
-        "optimizer_name": "RMSprop",
+    "rmsprop": {
+        "optimizer_name": "rmsprop",
         "opt_cls": optim.RMSprop,
     },
 }
diff --git a/QEfficient/finetune/experimental/tests/test_trainer.py b/QEfficient/finetune/experimental/tests/test_trainer.py
index 20af61e36c..94b92e7156 100644
--- a/QEfficient/finetune/experimental/tests/test_trainer.py
+++ b/QEfficient/finetune/experimental/tests/test_trainer.py
@@ -345,11 +345,12 @@ def model_config(self):
     def peft_model_config(self):
         """Fixture for PEFT configuration."""
         return {
-            "lora_r": LORA_R,
-            "lora_alpha": LORA_ALPHA,
-            "lora_dropout": LORA_DROPOUT,
-            "target_modules": ["q_proj", "v_proj"],
+            "task_type": "CAUSAL_LM",
+            "r": 8,
+            "lora_alpha": 32,
+            "lora_dropout": 0.1,
             "bias": "none",
+            "target_modules": ["q_proj", "v_proj"],
         }
 
     @pytest.fixture
@@ -430,7 +431,7 @@ def test_sft_trainer_with_peft_model(self, model_config, peft_model_config, dumm
         hf_model = ComponentFactory.create_model("hf", model_name, **model_config)
         model = hf_model.model
         # Load PEFT Config
-        peft_config = LoraConfig(peft_model_config)
+        peft_config = LoraConfig(**peft_model_config)
         tokenizer = hf_model.tokenizer
 
         # Create SFT config
diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py
index a76dfae8af..149b12a8a0 100644
--- a/QEfficient/utils/device_utils.py
+++ b/QEfficient/utils/device_utils.py
@@ -9,6 +9,8 @@
 import re
 import subprocess
 
+import torch
+
 from QEfficient.utils.constants import Constants
 from QEfficient.utils.logging_utils import logger
 
@@ -21,6 +23,29 @@ def is_networks_loaded(stdout):
     return False
 
 
+def is_nsp_free():
+    # FIXME: Gives incorrect results when the user doesn't have permission.
+    # To reproduce, change the ownership of the available devices.
+    device_count = torch.qaic.device_count()  # Get the number of available devices
+    if device_count == 0:
+        logger.warning("No QAIC devices found.")
+    for device_idx in range(device_count):
+        qid_idx = torch.qaic.get_device_info(device_idx).qid_index
+        command = ["/opt/qti-aic/tools/qaic-util", "-q", "-d", str(qid_idx)]
+        result = subprocess.run(command, capture_output=True, text=True)
+        text = result.stdout
+        free_nsp = re.search(r"Nsp Free:\s*(\d+)", text)
+        total_nsp = re.search(r"Nsp Total:\s*(\d+)", text)
+        if free_nsp and total_nsp:
+            nsp_free = int(free_nsp.group(1))
+            nsp_total = int(total_nsp.group(1))
+            # Check whether the number of free NSPs equals the total number of NSPs
+            if nsp_free != nsp_total:
+                raise RuntimeError(f"QAIC device {qid_idx} does not have {nsp_total} NSP free")
+        else:
+            logger.warning("Failed to parse NSP free information from qaic-util output")
+
+
 def get_available_device_id():
     """
     API to check available device id.
diff --git a/docs/source/config.md b/docs/source/config.md
new file mode 100644
index 0000000000..7b5be6d0c3
--- /dev/null
+++ b/docs/source/config.md
@@ -0,0 +1,268 @@
+# Training Configuration
+(training-configuration)=
+## Overview
+
+This configuration file defines the setup for fine-tuning a Hugging Face causal language model using **LoRA (Low-Rank Adaptation)** and **PEFT (Parameter-Efficient Fine-Tuning)** techniques. It also includes dataset, training, optimizer, and scheduler settings.
+
+***
+## 1. Model Configuration
+
+Model-related parameters for loading and fine-tuning. A minimal example follows this list.
+
+* **model\_type**: `default = hf` → Type of model. Use `hf` to load the model from Hugging Face. If you have a custom model, inherit from the BaseModel class, register the class under a particular key, and use that key here.
+* **auto\_class\_name**: `default = AutoModelForCausalLM` → AutoClass used to load the model (only if `model_type: hf`).
+* **model\_name**: `default = HuggingFaceTB/SmolLM-135M` → Pretrained model to fine-tune (only if `model_type: hf`).
+* **load\_in\_4bit**: `default = false` → If `true`, loads the model in 4-bit quantization for memory efficiency.
+* **use_cache**: `default = false` → Whether to use the **past key/values cache** in the model for faster decoding during generation.
+  *Enabling this can significantly speed up autoregressive decoding by reusing previous attention computations.*
+* **attn_implementation**: `default = "sdpa"` → The attention implementation to use. Common options:
+  * `"sdpa"` → Scaled Dot-Product Attention (optimized for speed and memory).
+  * `"eager"` → Standard eager-mode attention (simpler, but slower).
+* **device_map**: `default = None` → Specifies how to distribute the model across devices.
+  * `"auto"` → Automatically spreads layers across available GPUs/CPUs for memory efficiency.
+  * `None` → No distribution; the model stays on the default device.
+* **use\_peft**: `default = true` → Enables PEFT for parameter-efficient fine-tuning.
+* **peft\_config**: Defines LoRA parameters when `use_peft` is `true`:
+  * **lora_r**: `default = 8` → Rank for LoRA adapters.
+  * **lora_alpha**: `default = 16` → Scaling factor for LoRA updates.
+  * **lora_dropout**: `default = 0.1` → Dropout applied to LoRA layers.
+  * **target_modules**: `default = ["q_proj", "v_proj"]` → Modules to apply LoRA to (e.g., `q_proj`, `v_proj`, `o_proj`, `k_proj`, `up_proj`, `down_proj`, `gate_proj`).
+  * **bias**: `default = none` → Bias handling (`none`, `all`, `lora_only`).
+  * **task_type**: `default = CAUSAL_LM` → Task type (e.g., `CAUSAL_LM`, `SEQ_2_SEQ_LM`).
+  * **peft_type**: `default = LORA` → Fine-tuning method (e.g., `LORA`, `IA3`).
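+
+For reference, a minimal `model` section assembled from the defaults above could look like this (all values are illustrative):
+
+```yaml
+model:
+  model_type: "hf"
+  auto_class_name: "AutoModelForCausalLM"
+  model_name: "HuggingFaceTB/SmolLM-135M"
+  use_cache: false
+  use_peft: true
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0.1
+    target_modules: ["q_proj", "v_proj"]
+    bias: "none"
+    task_type: "CAUSAL_LM"
+    peft_type: "LORA"
+```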
+
+***
+
+## 2. Dataset Configuration
+
+This section defines parameters for dataset handling during fine-tuning with Hugging Face models. It covers dataset type, splits, prompt formatting, and DataLoader settings.
+
+* **tokenizer\_name**: `default = "HuggingFaceTB/SmolLM-135M"` → Matches the model name.
+* **dataset\_type**: `default = "seq_completion"` → Used for sequence continuation tasks, where the language model learns to generate the correct output (completion) step by step, given an input (prompt).
+* **dataset\_name**: `default = "knkarthick/samsum"` → Dataset name for training.
+* **json_file_path**: `default = None` → Path to a custom JSON file containing the dataset (see the JSON sketch after this list). If provided, this takes precedence over `dataset_name`.
+* **train\_split/test\_split**: `default = train/test` → Names of the train and test splits to use when the dataset is loaded from the Hugging Face Hub via the `dataset_name` argument.
+* **split\_ratio**: `default = 0.8` → Ratio for splitting the dataset into train/test, applied only if just a train split is provided.
+* **prompt\_func**: Path to a Python function that formats prompts. Use it when you need complex preprocessing or conditional logic to build the final prompt string from a dataset row (e.g., the Alpaca dataset).
+* **prompt\_template**: Template for formatting prompts from dataset rows. The template should contain column names that are available in the dataset.
+
+  **Note**: If both `prompt_template` and `prompt_func` are provided, `prompt_template` takes precedence over `prompt_func`.
+* **completion\_func**: Path to a Python function that formats completions. Use it when you need complex preprocessing or conditional logic to build the final completion string from a dataset row.
+* **completion\_template**: String pattern that tells the fine-tuning pipeline which part of the dataset should be treated as the target output (completion) for the model to learn.
+
+  **Note**: If both `completion_template` and `completion_func` are provided, `completion_template` takes precedence over `completion_func`.
+* **dataset_subset**: `default = "default"` → Picks a specific configuration of a dataset when the dataset provides multiple variants. You can specify something like `"en"`, `"movies"`, or `"cleaned"`, depending on the dataset.
+* **max_seq_length**: `default = 512` → Maximum sequence length for tokenization. Longer inputs are truncated; shorter inputs may be padded depending on the collation.
+* **input_columns**: `default = ["text"]` → Column names that contain input text to be tokenized.
+* **target_column**: `default = None` → Column containing target labels (classification/regression). Set to `None` for generation-only workloads.
+* **train_batch_size**: `default = 1` → Per-device batch size during training.
+* **eval_batch_size**: `default = 1` → Per-device batch size during evaluation.
+* **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch).
+* **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching.
+* **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field).
+* **num_workers**: `default = 4` → Number of subprocesses to use for data loading.
+* **dataloader_pin_memory**: `default = true` → Whether to pin memory for faster GPU transfer.
+* **dataloader_drop_last**: `default = false` → Whether to drop the last incomplete batch.
+* **dataset_num_samples**: `default = -1` → Number of samples to use from the dataset. If `-1`, all samples are used.
+* **dataloader_prefetch_factor**: `default = 1` → Number of batches loaded in advance by the DataLoader to overlap I/O with computation.
+* **dataloader_persistent_workers**: `default = true` → Whether to keep workers alive between epochs.
+* **dataloader_num_workers**: `default = 1` → Number of workers used by the **DataLoader** to load batches in parallel.
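+
+Custom datasets are expected in JSON format. As a minimal, purely illustrative sketch, a file passed via `json_file_path` could look like this, with keys matching the placeholders used in your templates:
+
+```json
+[
+  {"question": "What is 2 + 2?", "answer": "4"},
+  {"question": "What is 12 / 4?", "answer": "3"}
+]
+```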
+
+***
+### Example Dataset Configs
+
+#### **1. Alpaca (yahma/alpaca-cleaned)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "sft_dataset"
+  dataset_name: "yahma/alpaca-cleaned"
+  train_split: "train"
+  test_split: "test"
+  max_seq_length: 512
+  prompt_func: "preprocess/alpaca_func:create_alpaca_prompt"
+  completion_template: "{output}"
+```
+
+(example-prompt-functions)=
+### Prompt Function Example
+
+```python
+# Alpaca
+# preprocess/alpaca_func.py
+def prompt_no_input(row):
+    return ("Below is an instruction that describes a task. "
+            "Write a response that appropriately completes the request.\n\n"
+            "### Instruction:\n{instruction}\n\n### Response:\n").format_map(row)
+
+
+def prompt_input(row):
+    return ("Below is an instruction that describes a task, paired with an input that provides further context. "
+            "Write a response that appropriately completes the request.\n\n"
+            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n").format_map(row)
+
+
+def create_alpaca_prompt(row):
+    return prompt_no_input(row) if row["input"] == "" else prompt_input(row)
+```
+***
+
+#### **2. Samsum (knkarthick/samsum)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "sft_dataset"
+  dataset_name: "knkarthick/samsum"
+  train_split: "train"
+  test_split: "test"
+  prompt_template: "Summarize the following conversation:\n\n{dialogue}\n\nSummary:\n"
+  completion_template: "{summary}"
+```
+
+***
+#### **3. gsm8k (openai/gsm8k)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "sft_dataset"
+  dataset_name: "openai/gsm8k"
+  config_name: "main"  # available config_name values for the gsm8k dataset: ["main", "socratic"]
+  train_split: "train"
+  test_split: "test"
+  prompt_template: "Solve the following math problem step by step:\n\n{question}\n\nAnswer:\n"
+  completion_template: "{answer}"
+```
+
+***
+#### **4. grammar (grammar_dataset)**
+
+```yaml
+dataset:
+  tokenizer_name: "meta-llama/Llama-3.2-1B"
+  dataset_type: "sft_dataset"
+  dataset_name: "grammar"
+  train_split: "train"
+  split_ratio: 0.8
+  prompt_template: "Correct the grammar in the following sentence:\n\n{input}\n\nCorrected:\n"
+  completion_template: "{target}"
+```
+
+***
+
+## 3. Training Configuration
+
+This section defines core parameters for fine-tuning and evaluation. A compact example appears at the end of this section.
+
+* **type**: `default = sft` → Specifies the training type; `sft` uses trl's SFTTrainer infrastructure to perform PEFT-based SFT training, while `base` uses transformers' Trainer infrastructure. If you have written and registered a custom trainer, it can be selected by its registration key here.
+* **output\_dir**: `default = "./training_results"` → Directory where model checkpoints and logs are saved.
+* **overwrite\_output\_dir**: `default = false` → Whether to overwrite the output directory if it already exists.
+* **do\_eval**: `default = true` → Enables evaluation during training.
+* **eval\_strategy**: `default = epoch` → When to run evaluation (`epoch` or `steps`; with `steps`, also set `eval_steps` to specify the number of steps between evaluations).
+* **gradient\_accumulation\_steps**: `default = 1` → Accumulate gradients over multiple steps to simulate a larger batch size.
+* **dtype**: `default = fp16` → Mixed precision for faster training and reduced memory usage. FP16 is recommended when training on the QAIC backend.
+* **seed**: `default = 42` → Random seed for reproducibility.
+* **device**: `default = "qaic"` → The device to use for training (e.g., `"cuda"`, `"cpu"`, `"qaic"`).
+* **per\_device\_train\_batch\_size**: `default = 1` → Batch size per device during training.
+* **per\_device\_eval\_batch\_size**: `default = 1` → Batch size per device during evaluation.
+* **num\_train\_epochs**: `default = 1` → Total number of training epochs.
+* **max\_steps**: `default = -1` → If > 0, sets the total number of training steps (overrides `num_train_epochs`).
+* **log\_level**: `default = "info"` → Logging verbosity (`"debug"`, `"info"`, `"warning"`, `"error"`).
+* **log\_on\_each\_node**: `default = true` → Whether to log on each node in distributed setups.
+* **logging\_strategy**: `default = "steps"` → Logging strategy (`"no"`, `"steps"`, `"epoch"`).
+* **logging\_steps**: `default = 10` → Steps between logging events.
+* **save\_strategy**: `default = "epoch"` → Checkpoint save strategy (`"no"`, `"steps"`, `"epoch"`).
+* **save\_steps**: `default = 100` → Steps between checkpoints (if `save_strategy="steps"`).
+* **save\_total\_limit**: `default = 5` → Maximum number of checkpoints to keep (older ones are deleted).
+* **metric\_for\_best\_model**: `default = "eval_loss"` → Metric used to determine the best model.
+* **include\_num\_input\_tokens\_seen**: `default = true` → Log the number of input tokens processed.
+* **average\_tokens\_across\_devices**: `default = true` → Average token counts across devices in distributed training.
+* **fsdp\_config**: `default = None` → FSDP configuration dictionary.
+* **deepspeed\_config**: `default = None` → DeepSpeed configuration dictionary.
+* **accelerator\_config**: `default = None` → Accelerate configuration dictionary.
+* **use\_cpu**: `default = false` → Whether to explicitly run training on CPU.
+* **restore\_callback\_states\_from\_checkpoint**: Whether to restore callback states from a checkpoint.
+* **gradient\_checkpointing**: Saves memory by recomputing activations during the backward pass (slower but memory-efficient).
+* **gradient_checkpointing_kwargs**:
+  * **preserve_rng_state**: `default = true` → Controls whether to preserve the RNG (Random Number Generator) state during checkpointing. Preserving the RNG state ensures reproducibility of stochastic operations (e.g., dropout) when recomputing activations during backward passes.
+  * **use_reentrant**: `default = false` → Determines whether to use reentrant gradient checkpointing. Reentrant checkpointing uses PyTorch's built-in mechanism for recomputation, which can reduce memory usage but may have limitations with certain custom autograd functions.
+* **ddp\_config**: Arguments for Distributed Data Parallel (DDP) training.
+  * **ddp\_backend**: `default = "qccl"` → Backend for distributed communication. Common options: `"nccl"` for GPU, `"gloo"` for CPU, `"qccl"` for QAIC.
+  * **ddp\_find\_unused\_parameters**: `default = false` → Whether to detect unused parameters during the backward pass.
+  * **ddp\_bucket\_cap\_mb**: `default = 25` → Size (in MB) of gradient buckets for communication. Larger buckets reduce communication overhead but increase memory usage.
+  * **ddp\_broadcast\_buffers**: `default = true` → Whether to broadcast model buffers (e.g., BatchNorm stats) across all ranks. Use `null` or `false` to skip for speed if safe.
+  * **ddp\_timeout**: `default = 1800` → Timeout (in seconds) for DDP operations. Increase for large models or slow networks.
+* **torch\_compile**: `default = false` → Wraps your model with `torch.compile()` (PyTorch 2.0+) to fuse ops, reduce Python overhead, and generate optimized kernels, often yielding speed-ups without code changes.
+* **report_to**: `default = tensorboard` → Logging frameworks to use (e.g., `["tensorboard", "wandb", "trackio"]`).
+* **Optional distributed configs**: FSDP, DeepSpeed, or DDP for multi-QAIC or large-scale training.
+* **resume_from_checkpoint**: Path to a checkpoint to resume training from.
+* **disable_tqdm**: `default = false` → Set to `true` to disable the progress bar (e.g., if running in a notebook).
+
+📁 **Output Directory Structure**
+
+    output_dir/
+    │
+    ├── checkpoints/   # Saved model checkpoints (checkpoint-*)
+    │
+    ├── runs/          # TensorBoard logs
+    │   └── events.out.tfevents.*
+    │
+    ├── logs/          # Logs from other backends
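+
+A compact `training` section using a few of the fields above might look like this (illustrative values; any field not shown falls back to its default):
+
+```yaml
+training:
+  type: "sft"
+  output_dir: "./training_results"
+  num_train_epochs: 1
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  logging_steps: 10
+  save_strategy: "epoch"
+  eval_strategy: "epoch"
+  seed: 42
+```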
+
+***
+
+## 4. Optimizer & Scheduler
+
+* **optimizer**: `adamw` → Optimizer with weight-decoupled regularization; options: `adamw`, `adam`, `sgd`.
+  * **lr**: Initial learning rate (e.g., `5e-5` for fine-tuning).
+  * **weight\_decay**: Regularization strength (commonly `0.01`).
+
+* **scheduler**: `cosine` → Learning rate decay strategy; options: `linear`, `cosine`, `cosine_with_restarts`, `polynomial`, `constant`, `constant_with_warmup`, `inverse_sqrt`.
+  * **warmup\_steps**: Number of steps or a ratio (e.g., `100` steps or `0.05` for 5% of total steps). Warmup gradually increases the learning rate from a small value to the target value during the initial phase of training, which stabilizes early optimization and improves convergence.
+
+**Hugging Face documentation for reference and visualization of LR schedules**:
+https://huggingface.co/docs/transformers/v5.0.0rc1/en/main_classes/optimizer_schedules#transformers.SchedulerType
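+
+As a sketch, the corresponding YAML sections could look like this (illustrative values):
+
+```yaml
+optimizers:
+  optimizer_name: "adamw"
+  lr: 5.0e-5
+  weight_decay: 0.01
+
+scheduler:
+  scheduler_name: "cosine"
+  warmup_steps: 100
+```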
+
+***
+
+## 5. Callbacks
+
+Callbacks allow custom actions during training, such as logging, early stopping, or hardware profiling. Once registered, the trainer invokes them based on the training state; for example, if a callback defines an `on_epoch_end` method, it is executed at the end of each epoch. A minimal custom callback sketch follows this list.
+
+* **early\_stopping**:
+  Stops training if there is no improvement in a monitored metric for a defined patience period.
+  * **early\_stopping\_patience**: `3` → The number of consecutive evaluation steps or epochs without significant improvement after which training stops early.
+  * **early\_stopping\_threshold**: `0.01` → The minimum change in the monitored metric required to qualify as an improvement.
+* **enhanced_progressbar**: A more informative progress bar that shows additional metrics like loss, accuracy, etc. It also provides better visualization of training progress.
+* **default_flow**: Handles the default behavior for logging, saving, and evaluation.
+* **Printer**: Displays progress and prints the logs (`Printer` is used if you deactivate tqdm through the TrainingArguments; otherwise it's `enhanced_progressbar`).
+* **JSONLoggerCallback**: Logs training metrics to a JSON file. This is useful for tracking training progress and results.
+* **tensorboard**: Enables logging of metrics and losses to TensorBoard for visualization.
+* **QAICProfilerCallback**: Profiles QAIC devices over a specified training step range to monitor performance and resource usage.
+* **QAICOpByOpVerifierCallback**: Verifies QAIC operations step by step during a specified training range for correctness and debugging.
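+
+Since the trainers here build on the Hugging Face Trainer, a custom callback can follow the standard `transformers.TrainerCallback` interface. A minimal, illustrative sketch (the class and message below are assumptions, not shipped code):
+
+```python
+from transformers import TrainerCallback
+
+
+class EpochEndLogger(TrainerCallback):
+    """Illustrative callback: print the latest logged metrics after each epoch."""
+
+    def on_epoch_end(self, args, state, control, **kwargs):
+        # state.log_history holds the metrics logged so far during training
+        last_log = state.log_history[-1] if state.log_history else {}
+        print(f"Finished epoch {int(state.epoch)}: {last_log}")
+```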
+pip install --index-url https://download.pytorch.org/whl/cpu \
+--extra-index-url https://devpi.qualcomm.com/qcom/dev/+simple \
+--trusted-host devpi.qualcomm.com "torch==2.9.1+cpu" \
+"torchvision==0.24.1+cpu" "torchaudio==2.9.1+cpu"
+pip install trl==0.22.0
+cd .. && git clone https://github.com/quic-swatia/transformers.git
+cd transformers
+git checkout version-4.55.0 && pip install -e .
+cd .. && cd efficient-transformers
+QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py \
+QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+```
+
+#### For CUDA Training
+
+```bash
+python -m venv finetune_env
+source finetune_env/bin/activate
+git clone https://github.com/quic/efficient-transformers.git
+cd efficient-transformers
+git checkout ft_experimental  # Can be removed once merged to mainline
+pip install -e .
+pip install torch==2.9.1 torchvision==0.24.1 \
+torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
+pip install trl==0.22.0
+cd .. && git clone https://github.com/quic-swatia/transformers.git
+cd transformers
+git checkout version-4.55.0 && pip install -e .
+cd .. && cd efficient-transformers
+CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 -m QEfficient.cloud.finetune_experimental \
+--device cuda --num_epochs 1 --model_name meta-llama/Llama-3.2-3B \
+--dataset_name yahma/alpaca-cleaned --train_batch_size 1 \
+--gradient_accumulation_steps 768 \
+--prompt_func QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt \
+--completion_template {output}
+```
+
+***
+## Finetuning Guide
+
+### Sample Launch Commands
+
+**Single device (via YAML file)**
+
+```bash
+QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py \
+QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+
+# As a module
+QAIC_VISIBLE_DEVICES=0 python -m QEfficient.cloud.finetune_experimental \
+QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+```
+
+**Single device (via CLI flags)**
+
+```bash
+QAIC_VISIBLE_DEVICES=0 python -m QEfficient.cloud.finetune_experimental \
+--device qaic --lora_r 16 --target_modules q_proj, v_proj \
+--gradient_checkpointing True --dataset_name "yahma/alpaca-cleaned" \
+--completion_template {output} \
+--prompt_func QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt
+```
+
+**Distributed Data Parallelism (Using TorchRun)**
+
+Set the following if the tokenizer was used before processes are forked (for DDP), as this can otherwise cause deadlocks:
+```bash
+export TOKENIZERS_PARALLELISM=false
+```
+
+```bash
+QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+```
+
+**Distributed Data Parallelism(Using Accelerate)**
+```bash
+QAIC_VISIBLE_DEVICES=0,1,2,3 accelerate launch --num_processes 4 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+```
+
+***
+## Component Registry
+The training script uses a component registry to manage different components such as models, optimizers, and datasets. This allows components to be swapped easily without modifying core logic.
+
+To register a new component, use the `@registry` decorator, as in the sketch below.
+See `QEfficient/finetune/experimental/core/component_registry.py` for more details on how to register components and their usage in the training pipeline.
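+A minimal registration sketch follows. It is illustrative only: the exact decorator signature lives in `component_registry.py`, and the `component_type="optimizer"` key, the `name` argument, and the class body here are assumptions, not the verified API.
+
+```python
+# Hypothetical sketch: assumes a `registry` decorator keyed by component type and name.
+from QEfficient.finetune.experimental.core.component_registry import registry  # assumed import
+
+
+@registry(component_type="optimizer", name="my_adamw")  # assumed signature
+class MyAdamW:
+    """Custom optimizer factory; resolved by its registered name from the YAML config."""
+
+    def __init__(self, params, lr: float = 1e-4):
+        self.params = params
+        self.lr = lr
+```
+
+Whatever the real decorator looks like, the design idea is the same: components are looked up by registered name at runtime, so the YAML config can reference `my_adamw` without touching core training code.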
+
+***
+## Configuration
+
+The configuration system uses YAML files with typed validation. It supports:
+* **Overrides**: Command-line arguments override config values.
+* **Profiles**: Inherit from base profiles and override specific settings.
+* **Validation**: Ensures all required fields are present and types match.
+
+See `QEfficient/finetune/experimental/core/config_manager.py` for more details on configuration management.
+Detailed configuration documentation is available in
+[Training Configuration](#training-configuration).
+
+***
+## Prepare Data
+
+This module supports both custom dataset loaders and Hugging Face datasets. You can also define prompt templates or formatting functions in your configuration. Examples of prompt functions are given in [Prompt Function Examples](#example-prompt-functions).
+See `QEfficient/finetune/experimental/examples` for more details on how to register your own custom dataset.
+
+#### Using a Hugging Face Dataset with a Prompt Function / Prompt Template
+
+In your config, reference an HF dataset and a template function name:
+
+```yaml
+dataset:
+  dataset_name: "yahma/alpaca-cleaned"
+  train_split: "train"
+  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"
+  completion_template: "{output}"   # Template for completion field in dataset
+```
+
+Define the function (e.g., in `QEfficient/finetune/experimental/preprocessing/alpaca_func.py`):
+
+```python
+# preprocessing/alpaca_func.py
+# Assumes prompt_input / prompt_no_input template helpers are defined in this module.
+def create_alpaca_prompt(row):
+    return prompt_no_input(row) if row["input"] == "" else prompt_input(row)
+```
+
+Alternatively, reference an HF dataset and a prompt template:
+
+```yaml
+dataset:
+  dataset_name: "openai/gsm8k"
+  config_name: "main"   # available config_name for gsm8k dataset: ["main", "socratic"]
+  train_split: "train"
+  prompt_template: "Solve the following math problem step by step:\n\n{question}\n\nAnswer:\n"
+  completion_template: "{answer}"
+```
+
+Notes:
+* The pipeline expects input data in JSON format. If your custom dataset is in JSONL or any other format, convert it to JSON as a one-time preprocessing step. After conversion, simply provide the JSON file path in your config.yaml.
+* Ensure your dataset's rows have keys that match the placeholders used in `prompt_template` or `prompt_func`. Configure it in YAML (avoid Python f-strings inside YAML; use `{prompt}`/`{response}` placeholders).
+
+***
+## Parallelism
+
+The training script supports multiple parallelism strategies:
+
+### Data Parallelism (DDP)
+Distributes batches across devices. Configure this via `ddp_config` in the config:
+  ```yaml
+  ddp_config:
+    ddp_backend: "qccl"
+    ddp_find_unused_parameters: False
+    ddp_bucket_cap_mb: 25
+    ddp_broadcast_buffers: null
+    ddp_timeout: 1800
+  ```
+With the same sft_ddp_config.yaml, you can perform single-node multi-device DDP and multi-node DDP by changing the torchrun command.
+
+**For DDP on a single server**:
+```bash
+QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 4 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+```
+where `nproc-per-node` is the number of workers (QAIC devices) running locally.
+
+**DDP across multiple servers (multi-node DDP for rack-level finetuning)**:
+
+This enables scaling training across multiple nodes. Use servers with the same or compatible network interfaces (e.g., Ethernet); only Linux servers are supported for now. Connecting the servers to the same switch reduces communication time when scaling.
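+Before launching, identify the host's network interface and IP; these fill in the `GLOO_SOCKET_IFNAME` and `--master_addr` values used below (the interface name `eno1` here is a placeholder for your environment):
+
+```bash
+# Run on the master host to discover the interface name and its inet (IP) address
+ip addr
+
+# Export the interface that connects the servers (replace eno1 with yours)
+export GLOO_SOCKET_IFNAME=eno1
+```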
+* On the host server (i.e. the server we treat as the master; its IP address is used as the master address):
+
+   ```bash
+   QAIC_VISIBLE_DEVICES=0,1 GLOO_SOCKET_IFNAME=* torchrun --nnodes=2 --nproc-per-node=2 --node-rank=0 --master_addr=* --master_port=8888 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+   ```
+
+* On the client server:
+
+   ```bash
+   QAIC_VISIBLE_DEVICES=0,1 GLOO_SOCKET_IFNAME=* torchrun --nnodes=2 --nproc-per-node=2 --node-rank=1 --master_addr=* --master_port=8888 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+   ```
+
+* `PYTHONUNBUFFERED`: makes Python prints unbuffered, which is especially useful for spotting progress (or the lack of it) in distributed tasks. This is optional.
+* `GLOO_SOCKET_IFNAME`: specifies which network interface Gloo (and, indirectly, QCCL) uses for inter-host communication (e.g., `eno1`, `eth0`).
+* `--nnodes`: total number of hosts participating in the task.
+* `--nproc-per-node`: number of processes launched on this host, usually equal to the number of accelerators on the host.
+* `--master_addr`: IP of the host designated `node_rank=0` (find it with `ip addr`).
+* `--master_port`: port on which the master host listens for other nodes to connect (e.g., 8888, 8000). Use node-rank 0 on the host server and node-rank 1 on the client server (for a dual-server setup).
+* When running distributed training across multiple servers, the `--node-rank` parameter must be assigned a unique value for each server, starting from 0 and incrementing by 1 for each additional server. For a setup with N servers it ranges from 0 to N-1.
+
+***
+
+### Pipeline Parallelism (PP)
+
+Pipeline Parallelism splits a model's layers across multiple devices so that a model too large to fit on a single device can still be trained.
+
+#### How it works
+
+PP is controlled by a single parameter: **`pp_degree`**.
+
+| `pp_degree` value | Behaviour |
+|---|---|
+| `1` (default) | PP disabled — standard single-device training |
+| `> 1` | Model is split into `pp_degree` stages, one per device |
+
+When `pp_degree > 1` the framework:
+1. Reads the model's layer count and architecture from its HuggingFace config.
+2. Distributes transformer layers as evenly as possible across stages (surplus layers go to the first stages).
+3. Pins the embedding (`model.embed_tokens`) to the first stage and the final norm (`model.norm`) to the last stage.
+4. When `pp_degree == num_available_devices`, uses HuggingFace's `device_map="auto"` for automatic placement. Otherwise a custom per-layer dict is built.
+
+#### Configuration parameter
+
+Add `pp_degree` under the `training` section of your YAML config or pass it as a CLI flag.
+
+```yaml
+# training section of your config YAML
+training:
+  device: "qaic"   # or "cuda"
+  pp_degree: 2     # split model into 2 pipeline stages
+```
+> **Note:** `pp_degree` must be ≤ the number of locally available devices. The total number of devices consumed per node is `pp_degree` (for PP only) or `LOCAL_WORLD_SIZE × pp_degree` (for PP + DDP), where `LOCAL_WORLD_SIZE` is the number of processes per node. For example, add `pp_degree: 2` as shown above to the existing YAML file `sft_single_device_gsm8k_config.yaml` and use the commands below.
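+
+To make the device-count formula concrete, a hedged PP + DDP launch sketch follows. It assumes 2 DDP processes per node, each driving a 2-stage pipeline (so 2 × 2 = 4 QAIC devices), and a config that sets `pp_degree: 2`; treat it as an illustration, not a verified command line:
+
+```bash
+# Assumed PP + DDP combination: LOCAL_WORLD_SIZE=2 processes, each owning a
+# 2-stage pipeline, consuming 2 x 2 = 4 QAIC devices on this node.
+QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node 2 \
+  -m QEfficient.cloud.finetune_experimental \
+  QEfficient/finetune/experimental/configs/sft_ddp_config.yaml  # assumes pp_degree: 2 under training:
+```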
+ +#### Launch commands + +**PP only — single process, 2 stages (via YAML)** +```bash +python -m QEfficient.cloud.finetune_experimental configs/sft_single_device_gsm8k_config.yaml +``` +where `sft_single_device_gsm8k_config.yaml` contains `pp_degree: 2` under `training:`. + +**PP only — single process, 2 stages (via CLI flags)** +```bash +python -m QEfficient.cloud.finetune_experimental \ + --model_name meta-llama/Llama-3.2-1B \ + --device qaic \ + --pp_degree 2 +``` + + + +#### Notes + +- PP is currently verified primarily for **Llama-family** models. Other architectures with different layer naming conventions may need adjustments in `device_map_utils.py`. + +*** + +## To run the Finetune project tests + +Install following plugins: +```bash +pip install pytest pytest-mock +``` + +```bash +QAIC_VISIBLE_DEVICES=0 python -m pytest QEfficient/finetune/experimental/tests/ +``` From 14260f9e2333420b971d95c4ec7c2ae9fc821a4d Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Wed, 25 Mar 2026 14:49:19 +0000 Subject: [PATCH 04/23] Formatted docs Signed-off-by: Ann Kuruvilla --- QEfficient/finetune/experimental/docs/ReadMe.md | 0 docs/source/hf_finetune.md | 11 ++++------- 2 files changed, 4 insertions(+), 7 deletions(-) delete mode 100644 QEfficient/finetune/experimental/docs/ReadMe.md diff --git a/QEfficient/finetune/experimental/docs/ReadMe.md b/QEfficient/finetune/experimental/docs/ReadMe.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md index 19d3b31df3..06347ecc9f 100644 --- a/docs/source/hf_finetune.md +++ b/docs/source/hf_finetune.md @@ -59,7 +59,7 @@ export TMPDIR=$HOME/tmp > **Note** > If you’re using the pre-built `torch-qaic-env` from the Docker image for QAIC SDK, `torch_qaic` and `accelerate` whl are already installed inside it. -#### For QAIC Training +#### For QAIC Finetuning For Docker-based environments, use the pre-built `torch-qaic-env` environment. ```bash @@ -83,8 +83,7 @@ QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml ``` - -#### For CUDA Training +#### For CUDA Finetuning ```bash python -m venv finetune_env @@ -135,7 +134,7 @@ QAIC_VISIBLE_DEVICES=0 python -m QEfficient.cloud.finetune_experimental \ ``` -**Distributed Data Parallelism (Using TorchRun)** +**Distributed Data Parallelism (Via TorchRun)** #### If the tokenizer was used before forking processes (for DDP), which can cause deadlocks. ```bash export TOKENIZERS_PARALLELISM=false @@ -145,7 +144,7 @@ export TOKENIZERS_PARALLELISM=false QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml ``` -**Distributed Data Parallelism(Using Accelerate)** +**Distributed Data Parallelism(Via Accelerate)** ```bash QAIC_VISIBLE_DEVICES=0,1,2,3 accelerate launch --num_processes 4 -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_ddp_config.yaml ``` @@ -206,7 +205,6 @@ dataset: completion_template: "{answer}" ``` - Notes: * The pipeline expects input data in JSON format. If your custom dataset is in JSONL or any other format, please convert it to JSON as a one‑time preprocessing step. After conversion, simply provide the JSON file path in your config.yaml. * Ensure your dataset's rows have keys that match the placeholders used in "prompt_template" or "prompt func". 
Configure it in YAML (avoid Python f-strings inside YAML; use "{prompt}/{response}" placeholders) @@ -313,7 +311,6 @@ python -m QEfficient.cloud.finetune_experimental \ ``` - #### Notes - PP is currently verified primarily for **Llama-family** models. Other architectures with different layer naming conventions may need adjustments in `device_map_utils.py`. From 3b6558f9d33951b1cf7c547f4507a9db6641af9e Mon Sep 17 00:00:00 2001 From: Tanisha Chawada Date: Thu, 26 Mar 2026 11:20:54 +0530 Subject: [PATCH 05/23] [QEff.finetune] Adding style remix dataset config (#858) Adding config file to support style remix dataset --------- Signed-off-by: Tanisha Chawada Signed-off-by: Ann Kuruvilla --- QEfficient/cloud/finetune_experimental.py | 2 +- .../experimental/configs/sft_ddp_config.yaml | 1 + .../sft_single_device_alpaca_config.yaml | 1 + ...t_single_device_custom_dataset_config.yaml | 50 +++++++++++++++++++ .../sft_single_device_gsm8k_config.yaml | 1 + .../experimental/core/config_manager.py | 7 ++- .../finetune/experimental/core/dataset.py | 11 ++++ .../experimental/tests/test_dataset.py | 11 ++-- .../experimental/tests/test_finetune.py | 1 + docs/source/config.md | 20 ++++---- 10 files changed, 89 insertions(+), 16 deletions(-) create mode 100644 QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 08ea8f5e5b..43fcde5f8c 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]: dataset_name = dataset_config.get("dataset_name") train_split = dataset_config.get("train_split", "train") test_split = dataset_config.get("test_split", "test") - seed = self.config.training["seed"] + seed = dataset_config.get("data_seed", 42) # Create a copy of dataset_config excluding keys that are passed explicitly # to avoid duplicate keyword arguments when unpacking diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml index f7a0f6b1a9..a426dd6140 100644 --- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml @@ -25,6 +25,7 @@ dataset: prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields completion_template: "{answer}" # Model will be trained on this part. config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml index dfc5bd09c3..2bdf800bc5 100644 --- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml @@ -24,6 +24,7 @@ dataset: dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields completion_template: "{output}" # Model will be trained on this part. 
+ data_seed: 42 # Random seed for dataset shuffling # Training configuration diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml new file mode 100644 index 0000000000..fbdcc88d6b --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Dataset: Style-Remix (hallisky/DiSC) +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub + prompt_template: "### Original:{original} \n ### Rewrite:\n" # function to create prompt from dataset fields + completion_template: "{generation}" # Model will be trained on this part. + dataset_disc_style: "sarcasm_more" # Style of dataset to use + data_seed: 42 # Random seed for dataset shuffling + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 2e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml index f8627f6dad..9391fb0bd6 100644 --- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml @@ -25,6 +25,7 @@ dataset: prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields completion_template: "{answer}" # Model will be trained on this part. 
config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling # Training configuration diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index a3e0a3cd2f..10b61c7957 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -144,9 +144,13 @@ class DatasetConfig: metadata={"help": "Function for formatting output completions (e.g., '{output}')."}, ) collate_fn: str = field( - default="dynamic_padding", + default=None, metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."}, ) + dataset_disc_style: str = field( + default=None, + metadata={"help": "Style of dataset"}, + ) group_by_length: bool = field( default=True, metadata={"help": "Whether to group samples by length to minimize padding."}, @@ -184,6 +188,7 @@ class DatasetConfig: metadata={"help": "Name of the hf configuration file."}, ) json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."}) + data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."}) @dataclass diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index 22594cb81b..e607ef2b96 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -89,6 +89,7 @@ def __init__( **kwargs, ): self.split_ratio = split_ratio + self.seed = seed self.json_file_path = kwargs.get("json_file_path", None) self.prompt_template = kwargs.get("prompt_template", None) self.completion_template = kwargs.get("completion_template", None) @@ -96,6 +97,7 @@ def __init__( self.completion_func_path = kwargs.get("completion_func", None) self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True) self.config_name = kwargs.get("config_name", None) + self.dataset_disc_style = kwargs.get("dataset_disc_style", None) if self.json_file_path not in (None, ""): if not os.path.isfile(self.json_file_path): @@ -127,6 +129,7 @@ def _initialize_dataset(self): # Load dataset from JSON file validate_json_structure(self.json_file_path) self.dataset = load_dataset("json", data_files=self.json_file_path, split="train") + self.dataset = self.dataset.shuffle(seed=self.seed) # Apply train/test split if needed if self.split in ["train", "test"]: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) @@ -149,6 +152,14 @@ def _initialize_dataset(self): load_split = "train" # FIXME: Add streaming support for larger datasets. self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) + self.dataset = self.dataset.shuffle(seed=self.seed) + if self.dataset_disc_style: + available_styles = set(self.dataset["category"]) + if self.dataset_disc_style not in available_styles: + raise RuntimeError( + f"For DiSC dataset the provided disc_style '{self.dataset_disc_style}' is not supported." 
+ ) + self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style) if len(available_splits) == 1: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py index d6dc5729cb..81d37db903 100644 --- a/QEfficient/finetune/experimental/tests/test_dataset.py +++ b/QEfficient/finetune/experimental/tests/test_dataset.py @@ -44,7 +44,9 @@ def setUp(self): {"question": "What is AI?", "answer": "Artificial Intelligence"}, {"question": "What is ML?", "answer": "Machine Learning"}, {"question": "What is DL?", "answer": "Deep Learning"}, + {"question": "What is LLM?", "answer": "Large Language Model"}, {"question": "What is NLP?", "answer": "Natural Language Processing"}, + {"question": "What is VLM?", "answer": "Vision Language Model"}, {"question": "", "answer": "Empty question"}, # Empty question {"question": "Valid question", "answer": ""}, # Empty answer {"question": None, "answer": "None question"}, # None question @@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder, def create_mock_dataset(): mock_dataset = MagicMock() mock_dataset.column_names = ["text", "label"] + mock_dataset.shuffle.return_value = mock_dataset mock_dataset.num_rows = 3 # Mock __getitem__ to return processed samples @@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self): ) # When filtering is disabled and split="train" is used, it still applies train/test split - # So we get ~80% of 8 samples = ~6 samples + # So we get ~80% of 10 samples = ~8 samples self.assertGreater(len(dataset), 0) self.assertLessEqual(len(dataset), 8) @@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self): seed=SEED, ) - # After filtering, we have 4 valid samples - # With split ratio, train should have ~3 samples, test should have ~1 sample + # After filtering, we have 6 valid samples + # With split ratio, train should have ~4 samples, test should have ~2 sample self.assertGreater(len(train_dataset), 0) self.assertGreater(len(test_dataset), 0) # Total should equal the filtered dataset size - self.assertEqual(len(train_dataset) + len(test_dataset), 4) + self.assertEqual(len(train_dataset) + len(test_dataset), 6) def test_sft_dataset_with_custom_prompt_function(self): """Test loading with custom prompt function.""" diff --git a/QEfficient/finetune/experimental/tests/test_finetune.py b/QEfficient/finetune/experimental/tests/test_finetune.py index 8e3ead3e98..9eb857be71 100644 --- a/QEfficient/finetune/experimental/tests/test_finetune.py +++ b/QEfficient/finetune/experimental/tests/test_finetune.py @@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned( "dataset_name": "test_dataset", "train_split": train_split, "test_split": test_split, + "data_seed": 42, } train_ds = MagicMock(name="train_ds") diff --git a/docs/source/config.md b/docs/source/config.md index 7b5be6d0c3..88f36baf30 100644 --- a/docs/source/config.md +++ b/docs/source/config.md @@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name. * **train_batch_size**: `default = 1` → Per-device batch size during training. * **eval_batch_size**: `default = 1` → Per-device batch size during evaluation. * **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch). 
+* **dataset_disc_style**: `default = None` → Selects the style remix category to apply to the dataset during preprocessing; when None, no style remixing is applied and the original dataset style is preserved. + * **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching. * **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field). * **num_workers**: `default = 4` → Number of subprocesses to use for data loading. @@ -88,7 +90,7 @@ dataset: train_split: "train" test_split: "test" max_seq_length: 512 - prompt_func: "preprocess/alpaca_func:create_alpaca_prompt" + prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" completion_template: "{output}" ``` @@ -144,21 +146,19 @@ dataset: completion_template: "{answer}" ``` - *** -#### **4. grammar (grammar_dataset)** + +#### **4. Style-Remix (hallisky/DiSC)** ```yaml dataset: - tokenizer_name: "meta-llama/Llama-3.2-1B" dataset_type: "sft_dataset" - dataset_name: "grammar" - train_split: "train" - split_ratio: 0.8 - prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n" - completion_template: "{target}" -``` + dataset_name: "hallisky/DiSC" + prompt_template: "### Original:{original} \n ### Rewrite:\n" + completion_template: "{generation}" + dataset_disc_style: "sarcasm_more" +``` *** ## 3. Training Configuration From 2d6d60b4861751c124a7bfa1ecde2c9e0e462f38 Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Thu, 26 Mar 2026 09:41:04 +0000 Subject: [PATCH 06/23] Commented unit test from cloud module Signed-off-by: Ann Kuruvilla --- tests/unit_test/utils/test_cloud.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit_test/utils/test_cloud.py b/tests/unit_test/utils/test_cloud.py index 264942970b..38b0d6b4c1 100644 --- a/tests/unit_test/utils/test_cloud.py +++ b/tests/unit_test/utils/test_cloud.py @@ -62,10 +62,11 @@ def test_finetune_module_importable(self): assert QEfficient.cloud.finetune is not None - def test_finetune_experimental_importable(self): - import QEfficient.cloud.finetune_experimental + # Commenting as trl library legals is not there yet, causing import errors. Will re-enable once trl is available. 
+ # def test_finetune_experimental_importable(self): + # import QEfficient.cloud.finetune_experimental - assert QEfficient.cloud.finetune_experimental is not None + # assert QEfficient.cloud.finetune_experimental is not None # --------------------------------------------------------------------------- From 6887919a0a5ba142ecd8fc97ccdec9a654702aff Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Fri, 27 Mar 2026 06:02:10 +0000 Subject: [PATCH 07/23] Added exception handling for dataset loading Signed-off-by: Ann Kuruvilla --- QEfficient/finetune/experimental/core/dataset.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index e607ef2b96..eeaedd471c 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -140,7 +140,13 @@ def _initialize_dataset(self): if self.config_name is not None: load_kwargs["name"] = self.config_name - db = load_dataset_builder(self.dataset_name, **load_kwargs) + try: + db = load_dataset_builder(self.dataset_name, **load_kwargs) + except Exception as e: + raise RuntimeError( + f"Failed to load dataset builder for '{self.dataset_name}': {e}. " + "Please check the dataset name and your network connection." + ) available_splits = [] if db.info.splits is not None: available_splits = list(db.info.splits.keys()) @@ -151,7 +157,13 @@ def _initialize_dataset(self): if self.split not in available_splits: load_split = "train" # FIXME: Add streaming support for larger datasets. - self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) + try: + self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) + except Exception as e: + raise RuntimeError( + f"Failed to load dataset '{self.dataset_name}' with split '{load_split}': {e}. " + "Please verify the dataset exists and is accessible." + ) self.dataset = self.dataset.shuffle(seed=self.seed) if self.dataset_disc_style: available_styles = set(self.dataset["category"]) From fbd1f647b3e33c29f1c7846c5669807ec97aa401 Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Fri, 27 Mar 2026 06:03:48 +0000 Subject: [PATCH 08/23] Format Signed-off-by: Ann Kuruvilla --- QEfficient/finetune/experimental/core/dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index eeaedd471c..5f0931f796 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -144,9 +144,9 @@ def _initialize_dataset(self): db = load_dataset_builder(self.dataset_name, **load_kwargs) except Exception as e: raise RuntimeError( - f"Failed to load dataset builder for '{self.dataset_name}': {e}. " - "Please check the dataset name and your network connection." - ) + f"Failed to load dataset builder for '{self.dataset_name}': {e}. " + "Please check the dataset name and your network connection." + ) available_splits = [] if db.info.splits is not None: available_splits = list(db.info.splits.keys()) @@ -161,9 +161,9 @@ def _initialize_dataset(self): self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) except Exception as e: raise RuntimeError( - f"Failed to load dataset '{self.dataset_name}' with split '{load_split}': {e}. " - "Please verify the dataset exists and is accessible." 
-            )
+            f"Failed to load dataset '{self.dataset_name}' with split '{load_split}': {e}. "
+            "Please verify the dataset exists and is accessible."
+        )
         self.dataset = self.dataset.shuffle(seed=self.seed)
         if self.dataset_disc_style:
             available_styles = set(self.dataset["category"])

From c03e455b2484c4ec20e836c8c2d32967e54de14d Mon Sep 17 00:00:00 2001
From: Swati Allabadi
Date: Fri, 27 Mar 2026 00:15:39 +0530
Subject: [PATCH 09/23] [QEff. Finetuning]: Tests for Pipeline Parallelism and
 updated documentation (#893)

1) Added unit test cases for Pipeline Parallelism
2) Added documentation on how to run these tests
3) Created a constants file

Signed-off-by: Swati Allabadi
Co-authored-by: Swati Allabadi
Signed-off-by: Ann Kuruvilla
---
 .../experimental/core/config_manager.py       |    3 +-
 .../finetune/experimental/core/model.py       |   10 -
 .../experimental/core/utils/constants.py      |    1 +
 .../tests/test_pipeline_parallelism.py        | 1030 +++++++++++++++++
 docs/source/hf_finetune.md                    |    8 +
 5 files changed, 1041 insertions(+), 11 deletions(-)
 create mode 100644 QEfficient/finetune/experimental/core/utils/constants.py
 create mode 100644 QEfficient/finetune/experimental/tests/test_pipeline_parallelism.py

diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index 10b61c7957..9846c91944 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -21,6 +21,7 @@
 from transformers.hf_argparser import HfArgumentParser

 from QEfficient.finetune.experimental.core.logger import Logger
+from QEfficient.finetune.experimental.core.utils import constants
 from QEfficient.finetune.experimental.core.utils.dist_utils import is_main_process
 from QEfficient.utils.device_utils import is_nsp_free

@@ -855,7 +856,7 @@ def get_model_config(self) -> Dict[str, Any]:
         training_dtype = training_config.get("torch_dtype")
         if training_dtype:
             # Convert from training format (fp16/bf16) to model format (float16/bfloat16)
-            dtype_mapping = {"fp16": "float16", "bf16": "bfloat16"}
+            dtype_mapping = constants.DTYPE_MAPPING
             model_config["torch_dtype"] = dtype_mapping.get(training_dtype, "auto")

         return model_config
diff --git a/QEfficient/finetune/experimental/core/model.py b/QEfficient/finetune/experimental/core/model.py
index 0f087e6653..1ccd0ea864 100644
--- a/QEfficient/finetune/experimental/core/model.py
+++ b/QEfficient/finetune/experimental/core/model.py
@@ -105,16 +105,6 @@ def _resolve_auto_class(auto_class_name: str) -> Type:
         )
         return getattr(transformers, auto_class_name)

-    # def _build_quant_config(self) -> Optional[BitsAndBytesConfig]:
-    #     if not self.model_kwargs.get("load_in_4bit"):
-    #         return None
-    #     return BitsAndBytesConfig(
-    #         load_in_4bit=True,
-    #         bnb_4bit_quant_type=self.model_kwargs.get("bnb_4bit_quant_type", "nf4"),
-    #         bnb_4bit_compute_dtype=self.model_kwargs.get("bnb_4bit_compute_dtype", torch.float16),
-    #         bnb_4bit_use_double_quant=self.model_kwargs.get("bnb_4bit_use_double_quant", True),
-    #     )
-
     def configure_model_kwargs(self) -> Dict[str, Any]:
         """Hook for subclasses to tweak HF `.from_pretrained` kwargs."""

diff --git a/QEfficient/finetune/experimental/core/utils/constants.py b/QEfficient/finetune/experimental/core/utils/constants.py
new file mode 100644
index 0000000000..ed7c9e1bfa
--- /dev/null
+++ b/QEfficient/finetune/experimental/core/utils/constants.py
@@ -0,0 +1 @@
+DTYPE_MAPPING = {"fp16": "float16", "bf16": "bfloat16"}
diff --git
a/QEfficient/finetune/experimental/tests/test_pipeline_parallelism.py b/QEfficient/finetune/experimental/tests/test_pipeline_parallelism.py new file mode 100644 index 0000000000..110236555d --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_pipeline_parallelism.py @@ -0,0 +1,1030 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Pipeline Parallelism (PP) tests for meta-llama/Llama-3.2-1B. +""" + +import os +import shutil +from collections import Counter +from types import SimpleNamespace +from typing import Dict, List +from unittest.mock import MagicMock, patch + +import pytest +import torch +from datasets import Dataset + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_LLAMA_MODEL_NAME = "meta-llama/Llama-3.2-1B" + +# Llama-3.2-1B has 16 transformer layers and uses tied embeddings +_LLAMA_NUM_LAYERS = 16 +_LLAMA_TIED_EMBEDDINGS = True + +# 15 short instruction/response pairs used for training + evaluation +_ALPACA_SAMPLES = [ + {"text": "### Instruction:\nWhat is 2 + 2?\n### Response:\n4"}, + {"text": "### Instruction:\nName the capital of France.\n### Response:\nParis"}, + {"text": "### Instruction:\nWhat color is the sky?\n### Response:\nBlue"}, + {"text": "### Instruction:\nHow many days in a week?\n### Response:\nSeven"}, + {"text": "### Instruction:\nWhat is the boiling point of water in Celsius?\n### Response:\n100°C"}, + {"text": "### Instruction:\nWho wrote Romeo and Juliet?\n### Response:\nWilliam Shakespeare"}, + { + "text": "### Instruction:\nWhat language does Python code run in?\n### Response:\nPython is an interpreted language." + }, + {"text": "### Instruction:\nConvert 1 km to meters.\n### Response:\n1000 meters"}, + {"text": "### Instruction:\nWhat is H2O?\n### Response:\nWater"}, + {"text": "### Instruction:\nWhat does CPU stand for?\n### Response:\nCentral Processing Unit"}, + {"text": "### Instruction:\nHow many continents are there?\n### Response:\nSeven"}, + {"text": "### Instruction:\nWhat is the speed of light?\n### Response:\nApproximately 3×10⁸ m/s"}, + {"text": "### Instruction:\nWhat is the largest planet?\n### Response:\nJupiter"}, + { + "text": "### Instruction:\nWhat is photosynthesis?\n### Response:\nThe process plants use to convert sunlight to energy." + }, + {"text": "### Instruction:\nHow many bytes in a kilobyte?\n### Response:\n1024 bytes"}, +] + + +def _make_fake_llama_config( + num_hidden_layers: int = _LLAMA_NUM_LAYERS, + tie_word_embeddings: bool = _LLAMA_TIED_EMBEDDINGS, + vocab_size: int = 32_000, + hidden_size: int = 2048, +) -> SimpleNamespace: + """Return a minimal config object that looks like Llama-3.2-1B to our utils.""" + return SimpleNamespace( + num_hidden_layers=num_hidden_layers, + tie_word_embeddings=tie_word_embeddings, + vocab_size=vocab_size, + hidden_size=hidden_size, + model_type="llama", + ) + + +def _assert_layer_device_ids( + dmap: Dict[str, int], + num_layers: int, + pp_degree: int, + local_rank: int = 0, +) -> None: + """ + Central invariant checker for transformer-layer device assignments. + + Enforces: + 1. Exactly ``num_layers`` layer keys exist – no gaps, no phantom layers. + 2. 
Layer device IDs are **non-decreasing** (monotonicity / pipeline order). + 3. All layer IDs are within the valid range for this rank. + 4. Layers form a **complete partition**: union covers every layer index, + each stage-set is disjoint. + 5. Each stage receives either ``base`` or ``base+1`` layers (balanced). + 6. Every device in the rank's range is used at least once. + """ + first_device = local_rank * pp_degree + valid_devices = set(range(first_device, first_device + pp_degree)) + + # --- 1. Key completeness: exactly the expected layer keys --------------- + expected_keys = {f"model.layers.{i}" for i in range(num_layers)} + actual_layer_keys = {k for k in dmap if k.startswith("model.layers.")} + missing = expected_keys - actual_layer_keys + phantom = actual_layer_keys - expected_keys + assert not missing, f"Missing layer keys in device map: {sorted(missing)}" + assert not phantom, f"Phantom layer keys in device map (never expected): {sorted(phantom)}" + + # --- 2. Monotonicity: device IDs are non-decreasing ------------------- + layer_devices: List[int] = [dmap[f"model.layers.{i}"] for i in range(num_layers)] + assert layer_devices == sorted(layer_devices), ( + f"Layer-to-device assignment is not monotonically non-decreasing: {layer_devices}\n" + f"Layers must flow in order through the pipeline." + ) + + # --- 3. Range validity ------------------------------------------------ + out_of_range = [(i, d) for i, d in enumerate(layer_devices) if d not in valid_devices] + assert not out_of_range, ( + f"Layer(s) assigned to devices outside valid range {valid_devices} " + f"for rank={local_rank}, pp={pp_degree}: {out_of_range}" + ) + + # --- 4. Partition: union == full set, per-stage sets are disjoint ----- + stages: List[List[int]] = [ + [i for i in range(num_layers) if layer_devices[i] == first_device + s] for s in range(pp_degree) + ] + union = set().union(*stages) + assert union == set(range(num_layers)), ( + f"Layer partition does not cover all layers.\n Expected: {set(range(num_layers))}\n Got union: {union}" + ) + for s_idx, s_a in enumerate(stages): + for t_idx, s_b in enumerate(stages): + if s_idx >= t_idx: + continue + overlap = set(s_a) & set(s_b) + assert not overlap, f"Stages {s_idx} and {t_idx} share layers {overlap} – stages must be disjoint." + + # --- 5. Balance: each stage has base or base+1 layers ----------------- + base, remainder = divmod(num_layers, pp_degree) + counts = [len(s) for s in stages] + for stage_idx, count in enumerate(counts): + expected_count = base + (1 if stage_idx < remainder else 0) + assert count == expected_count, ( + f"Stage {stage_idx} has {count} layers; expected {expected_count} " + f"(base={base}, remainder={remainder}, pp={pp_degree}, layers={num_layers})." + ) + + # --- 6. Every device in range is used --------------------------------- + used_devices = set(layer_devices) + assert used_devices == valid_devices, ( + f"Not all devices in the rank's range are used.\n Expected: {valid_devices}\n Used: {used_devices}" + ) + + +def _assert_finite_positive_loss(value: float, label: str, *, gt: float = 0.0) -> None: + """ + Smoke-check that a reported loss is finite and strictly above ``gt`` (default 0). + """ + tensor_val = torch.tensor(value, dtype=torch.float32) + assert torch.isfinite(tensor_val), f"{label} is not finite: {value}" + assert value > gt, f"{label} = {value:.4f} ≤ {gt}; expected loss strictly above {gt}." + + +# --------------------------------------------------------------------------- +# 1. 
Unit tests – device map structure +# --------------------------------------------------------------------------- + + +class TestPPDeviceMapUnit: + """Unit tests for custom_device_map and get_device_map (no device required).""" + + # -- custom_device_map --------------------------------------------------- + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_fixed_layers_on_correct_devices(self, _mock_rank, mock_cfg): + """ + Structural invariants for the four non-transformer components: + • embed_tokens lives on the FIRST device of this rank's block. + • norm and rotary_emb are CO-LOCATED on the LAST device. + • embed_tokens and norm are on DIFFERENT devices (pipeline split exists). + • The gap between first and last device equals pp_degree - 1. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + pp_degree = 2 + local_rank = 0 + first_device = local_rank * pp_degree # 0 + last_device = first_device + pp_degree - 1 # 1 + + mock_cfg.return_value = _make_fake_llama_config() + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + # Anchor components at correct pipeline boundaries + assert dmap["model.embed_tokens"] == first_device, ( + f"embed_tokens must be on first device {first_device}, got {dmap['model.embed_tokens']}" + ) + assert dmap["model.norm"] == last_device, ( + f"model.norm must be on last device {last_device}, got {dmap['model.norm']}" + ) + # norm and rotary_emb must be co-located (both at the tail of the pipeline) + assert dmap["model.rotary_emb"] == dmap["model.norm"], ( + "rotary_emb and norm must be co-located on the same device; " + f"got rotary_emb={dmap['model.rotary_emb']}, norm={dmap['model.norm']}" + ) + # The pipeline must actually split; first and last must differ + assert dmap["model.embed_tokens"] != dmap["model.norm"], ( + "embed_tokens and norm are on the same device – no pipeline split occurred." + ) + # The span of devices matches what was requested + assert dmap["model.norm"] - dmap["model.embed_tokens"] == pp_degree - 1, ( + f"Device span ({dmap['model.norm']} - {dmap['model.embed_tokens']}) " + f"must equal pp_degree - 1 = {pp_degree - 1}." + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_tied_embeddings_lm_head_on_first_device(self, _mock_rank, mock_cfg): + """ + For tied embeddings (Llama-3.2-1B default): + • lm_head must be CO-LOCATED with embed_tokens (same device). + • lm_head must NOT be co-located with model.norm. 
+ """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + mock_cfg.return_value = _make_fake_llama_config(tie_word_embeddings=True) + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=2) + + # Co-location invariant: lm_head shares device with embed_tokens + assert dmap["lm_head"] == dmap["model.embed_tokens"], ( + "Tied-embedding model: lm_head must be on the same device as embed_tokens.\n" + f" lm_head={dmap['lm_head']}, embed_tokens={dmap['model.embed_tokens']}" + ) + # Separation invariant: lm_head is NOT on the last device (where norm lives) + assert dmap["lm_head"] != dmap["model.norm"], ( + "Tied-embedding model: lm_head must not be co-located with model.norm.\n" + f" lm_head={dmap['lm_head']}, norm={dmap['model.norm']}" + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_untied_embeddings_lm_head_on_last_device(self, _mock_rank, mock_cfg): + """ + For non-tied embeddings: + • lm_head must be CO-LOCATED with model.norm (last device). + • lm_head must NOT be co-located with embed_tokens (first device). + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + mock_cfg.return_value = _make_fake_llama_config(tie_word_embeddings=False) + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=2) + + # Co-location invariant: lm_head shares device with norm (output side) + assert dmap["lm_head"] == dmap["model.norm"], ( + "Non-tied model: lm_head must be on the same device as model.norm.\n" + f" lm_head={dmap['lm_head']}, norm={dmap['model.norm']}" + ) + # Separation invariant: lm_head is NOT on the first device + assert dmap["lm_head"] != dmap["model.embed_tokens"], ( + "Non-tied model: lm_head must not be co-located with embed_tokens.\n" + f" lm_head={dmap['lm_head']}, embed_tokens={dmap['model.embed_tokens']}" + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_layer_distribution_pp2(self, _mock_rank, mock_cfg): + """ + 16 layers, pp=2: exact partition into two contiguous, equal halves. + + Checks beyond simple counting: + • Monotonicity: device IDs are non-decreasing across layer indices. + • Partition: stage-0 and stage-1 sets are disjoint and their union + covers all 16 layers. + • No phantom or missing layer keys. + • Both devices are actually used (completeness). 
+ """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + num_layers, pp_degree = 16, 2 + mock_cfg.return_value = _make_fake_llama_config(num_hidden_layers=num_layers) + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + # Delegate to the central invariant checker + _assert_layer_device_ids(dmap, num_layers, pp_degree, local_rank=0) + + # Verify the exact split boundary for this balanced case + layer_devices = [dmap[f"model.layers.{i}"] for i in range(num_layers)] + first_half = layer_devices[:8] + second_half = layer_devices[8:] + assert all(d == 0 for d in first_half), f"Layers 0-7 should all be on device 0; got {first_half}" + assert all(d == 1 for d in second_half), f"Layers 8-15 should all be on device 1; got {second_half}" + + @pytest.mark.parametrize("pp_degree,num_layers", [(3, 16), (4, 16), (3, 9), (4, 8)]) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_layer_distribution_balanced(self, _mock_rank, mock_cfg, pp_degree, num_layers): + """ + For any (pp_degree, num_layers) pair, the full invariant suite must hold. + + In addition to the central checker, verifies that each stage's count is + exactly ``base`` or ``base+1`` – stricter than ``max - min ≤ 1`` because + it rules out the pathological case where all surplus goes to one stage and + another stage has 0. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + mock_cfg.return_value = _make_fake_llama_config(num_hidden_layers=num_layers) + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + # Full invariant suite (monotonicity, partition, balance, completeness) + _assert_layer_device_ids(dmap, num_layers, pp_degree, local_rank=0) + + # Also verify no stage is starved (every stage has at least one layer) + counts = Counter(dmap[f"model.layers.{i}"] for i in range(num_layers)) + assert min(counts.values()) >= 1, ( + f"At least one stage has 0 layers: {dict(counts)} (pp={pp_degree}, layers={num_layers})" + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_all_layers_assigned(self, _mock_rank, mock_cfg): + """ + The set of layer keys in the map must equal {model.layers.0, ..., model.layers.15} + exactly – no missing layers, no phantom layers like model.layers.16. 
+ """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + num_layers, pp_degree = 16, 4 + mock_cfg.return_value = _make_fake_llama_config(num_hidden_layers=num_layers) + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + expected_layer_keys = {f"model.layers.{i}" for i in range(num_layers)} + actual_layer_keys = {k for k in dmap if k.startswith("model.layers.")} + + # Exact set equality – catches both missing and phantom keys at once + assert actual_layer_keys == expected_layer_keys, ( + f"Layer key mismatch.\n" + f" Missing : {sorted(expected_layer_keys - actual_layer_keys)}\n" + f" Phantom : {sorted(actual_layer_keys - expected_layer_keys)}" + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + def test_too_few_layers_raises(self, _mock_rank, mock_cfg): + """ + When pp_degree > num_layers the error must mention BOTH the conflicting + numbers (num_layers=2, pp_degree=4), so the caller can diagnose the issue. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + mock_cfg.return_value = _make_fake_llama_config(num_hidden_layers=2) + with pytest.raises(ValueError, match=r"(?=.*\b2\b)(?=.*\b4\b)"): + # Regex uses lookaheads to require BOTH '2' (num_layers) and '4' + # (pp_degree) appear somewhere in the error message + custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=4) + + # -- get_device_map ------------------------------------------------------ + + def test_get_device_map_pp1_returns_none(self): + """ + pp_degree=1 (PP disabled) must return the Python singleton None – + not an empty dict, not False, not 0. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map + + result = get_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=1) + assert result is None, f"Expected None (PP disabled), got {type(result).__name__}: {result!r}" + assert type(result) is type(None), "Return type must be NoneType, not a falsy proxy" + + @patch("torch.qaic.device_count", return_value=1) + def test_get_device_map_pp_exceeds_devices_raises(self, _mock_count): + """ + When pp_degree > num_available_devices the error must mention: + • the word 'pp_degree' + • the requested pp value (4) + • the available device count (1) + This ensures the error message is actionable, not just 'invalid config'. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map + + # Regex requires all three pieces of information in the error message + with pytest.raises(ValueError, match=r"(?=.*pp_degree)(?=.*\b4\b)(?=.*\b1\b)"): + get_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=4) + + @patch("torch.qaic.device_count", return_value=2) + def test_get_device_map_pp_equals_devices_returns_auto(self, _mock_count): + """ + When pp_degree == num_available_devices HuggingFace 'auto' placement + is used. Verify the return type (str) and exact value ("auto"). 
+ """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map + + result = get_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=2) + + assert isinstance(result, str), f"Expected a string ('auto'), got {type(result).__name__}: {result!r}" + assert result == "auto", f"Expected 'auto', got '{result}'" + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + @patch("torch.qaic.device_count", return_value=4) + def test_get_device_map_pp_less_than_devices_returns_dict(self, _mock_count, _mock_rank, mock_cfg): + """ + When pp_degree < num_available_devices a custom dict is returned. + + Checks: + • Return type is exactly dict. + • All four mandatory component keys are present. + • All values are Python ints (not numpy ints or strings). + • Both devices in [0, pp_degree) appear in the values (completeness). + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map + + pp_degree = 2 + mock_cfg.return_value = _make_fake_llama_config() + result = get_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + assert isinstance(result, dict), f"Expected dict, got {type(result).__name__}: {result!r}" + + required_keys = {"model.embed_tokens", "lm_head", "model.norm", "model.rotary_emb"} + missing_keys = required_keys - result.keys() + assert not missing_keys, f"Required component keys missing from device map: {missing_keys}" + + # All device IDs must be plain Python ints (not numpy.int64 etc.) + non_int = {k: type(v).__name__ for k, v in result.items() if not isinstance(v, int)} + assert not non_int, f"Device map values must be Python ints; found wrong types: {non_int}" + + # Both devices in the pp range must actually be used (completeness) + used_devices = set(result.values()) + expected_devices = set(range(pp_degree)) + assert used_devices == expected_devices, ( + f"Not all pipeline stage devices are represented in the map.\n" + f" Expected devices: {expected_devices}\n" + f" Used devices: {used_devices}" + ) + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=0, + ) + @patch("torch.qaic.device_count", return_value=4) + def test_get_device_map_returns_valid_device_ids(self, _mock_count, _mock_rank, mock_cfg): + """ + Every device ID in the returned map must be in [0, pp_degree). + PLUS: every device in [0, pp_degree) must appear at least once + (no wasted or unreachable stages). 
+ """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map + + pp_degree = 2 + mock_cfg.return_value = _make_fake_llama_config() + dmap = get_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + assert isinstance(dmap, dict) + valid_range = range(pp_degree) + + # --- Range validity: no out-of-bound IDs -------------------------- + out_of_range = {k: v for k, v in dmap.items() if v not in valid_range} + assert not out_of_range, f"Device IDs outside valid range [0, {pp_degree}):\n" + "\n".join( + f" {k!r}: {v}" for k, v in sorted(out_of_range.items()) + ) + + # --- Completeness: every stage device is actually used ------------- + used = set(dmap.values()) + unused = set(valid_range) - used + assert not unused, ( + f"Stage devices {unused} are never assigned any component – those pipeline stages would be empty." + ) + + +# --------------------------------------------------------------------------- +# 2. Distributed rank tests (local_rank > 0) +# --------------------------------------------------------------------------- + + +class TestPPDeviceMapDistributed: + """Verify device IDs are correctly offset when local_rank > 0 (DDP + PP).""" + + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=1, + ) + def test_rank1_devices_offset_by_pp_degree(self, _mock_rank, mock_cfg): + """ + For pp_degree=2 and local_rank=1 the block of device IDs must be + exactly {2, 3} – not overlapping with rank-0's block {0, 1}. + + Checks: + • embed_tokens is on first_device (computed, not hardcoded). + • norm is on last_device (computed). + • All layer devices are within [first_device, last_device]. + • The device set is completely disjoint from rank-0's devices. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + pp_degree = 2 + local_rank = 1 + first_device = local_rank * pp_degree # 2 + last_device = first_device + pp_degree - 1 # 3 + rank0_devices = set(range(pp_degree)) # {0, 1} + + mock_cfg.return_value = _make_fake_llama_config() + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + assert dmap["model.embed_tokens"] == first_device, ( + f"embed_tokens must be on first_device={first_device}, got {dmap['model.embed_tokens']}" + ) + assert dmap["model.norm"] == last_device, f"norm must be on last_device={last_device}, got {dmap['model.norm']}" + + # All layer devices must be within this rank's block + layer_devices = {dmap[f"model.layers.{i}"] for i in range(_LLAMA_NUM_LAYERS)} + out_of_block = layer_devices - set(range(first_device, last_device + 1)) + assert not out_of_block, ( + f"Rank-1 layers assigned to devices outside [{first_device}, {last_device}]: {out_of_block}" + ) + + # Rank-1 devices must be completely disjoint from rank-0 devices + overlap = set(dmap.values()) & rank0_devices + assert not overlap, ( + f"Rank-1 device map overlaps with rank-0 devices {rank0_devices}: {overlap}\n" + f"DDP replicas must use non-overlapping device blocks." 
+ ) + + @pytest.mark.parametrize("local_rank,pp_degree", [(0, 2), (1, 2), (0, 4), (2, 4)]) + @patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.AutoConfig.from_pretrained", + ) + def test_device_range_is_complete_and_correct(self, mock_cfg, local_rank, pp_degree): + """ + The set of device IDs actually used must EXACTLY EQUAL the expected + block [local_rank*pp_degree, local_rank*pp_degree + pp_degree - 1]. + + 'Subset' is not sufficient: if any device in the block is unused the + pipeline has a ghost stage consuming no memory and carrying no layers. + """ + from QEfficient.finetune.experimental.core.utils.device_map_utils import custom_device_map + + mock_cfg.return_value = _make_fake_llama_config() + + with patch( + "QEfficient.finetune.experimental.core.utils.device_map_utils.get_local_rank", + return_value=local_rank, + ): + dmap = custom_device_map(_LLAMA_MODEL_NAME, "qaic", pp_degree=pp_degree) + + expected_block = set(range(local_rank * pp_degree, local_rank * pp_degree + pp_degree)) + actual_ids = set(dmap.values()) + + # Exact equality, not just subset + assert actual_ids == expected_block, ( + f"Device block mismatch for rank={local_rank}, pp={pp_degree}.\n" + f" Expected: {expected_block}\n" + f" Got: {actual_ids}\n" + f" Missing: {expected_block - actual_ids}\n" + f" Extra: {actual_ids - expected_block}" + ) + + +# --------------------------------------------------------------------------- +# 3. ConfigManager PP validation +# --------------------------------------------------------------------------- + + +class TestPPConfigValidation: + """Test that ConfigManager correctly validates pp_degree.""" + + def _make_config_manager_with_pp(self, pp_degree: int): + """Build a real ConfigManager pointing to test_config.yaml, then override pp_degree.""" + import sys + + test_yaml = os.path.join(os.path.dirname(__file__), "test_config.yaml") + old_argv = sys.argv[:] + sys.argv = ["finetune_experimental.py", test_yaml] + try: + from QEfficient.finetune.experimental.core.config_manager import ConfigManager + + cm = ConfigManager() + cm.config.training.pp_degree = pp_degree + return cm + finally: + sys.argv = old_argv + + def test_pp_degree_default_is_one(self): + """ + Default pp_degree must be the integer 1 (not "1", not 0, not True). + Verifies both value and type. + """ + from QEfficient.finetune.experimental.core.config_manager import TrainingConfig + + tc = TrainingConfig() + + assert isinstance(tc.pp_degree, int), ( + f"pp_degree must be an int, got {type(tc.pp_degree).__name__}: {tc.pp_degree!r}" + ) + assert tc.pp_degree == 1, f"Default pp_degree must be 1 (PP disabled), got {tc.pp_degree}" + # Ensure it is not a boolean True (which equals 1 but is semantically wrong) + assert type(tc.pp_degree) is not bool, "pp_degree must be int, not bool" + + +# --------------------------------------------------------------------------- +# 4. 
FineTuningPipeline integration – PP device_map injection +# --------------------------------------------------------------------------- + + +class TestPPFineTuningPipelineIntegration: + """ + Verify that FineTuningPipeline._create_model correctly: + • calls get_device_map when pp_degree > 1 + • injects the returned device_map into the model kwargs + • does NOT call get_device_map when pp_degree == 1 + • does NOT leak pp_degree or PEFT keys into model creation kwargs + """ + + class _DictLike(dict): + """dict subclass that also supports attribute access for training config.""" + + def __getattr__(self, key): + try: + return self[key] + except KeyError: + raise AttributeError(key) + + def _make_pipeline(self, pp_degree: int, model_name: str = _LLAMA_MODEL_NAME): + from QEfficient.cloud import finetune_experimental as fte + from QEfficient.cloud.finetune_experimental import FineTuningPipeline + + cm = MagicMock() + training = self._DictLike( + { + "type": "sft", + "output_dir": "/tmp/test_pp_output", + "pp_degree": pp_degree, + "device": "qaic", + "seed": 42, + "torch_dtype": "fp16", + } + ) + cm.config.training = training + cm.get_training_config.return_value = training + cm.get_model_config.side_effect = lambda: { + "model_type": "hf", + "model_name": model_name, + "use_peft": False, + "torch_dtype": "fp16", + } + cm.get_optimizer_config.side_effect = lambda: { + "optimizer_name": "adamw", + "lr": 5e-5, + "weight_decay": 0.01, + } + cm.get_callback_config.return_value = {"callbacks": {}} + cm.get_scheduler_config.return_value = { + "scheduler_name": "cosine", + "warmup_ratio": 0.1, + "warmup_steps": 0, + } + cm.get_dataset_config.return_value = { + "dataset_type": "seq_completion", + "dataset_name": "dummy", + "train_split": "train", + "test_split": "test", + "split_ratio": 0.8, + "dataset_num_samples": -1, + "dataloader_pin_memory": False, + "dataloader_persistent_workers": False, + "dataloader_prefetch_factor": None, + "dataloader_drop_last": False, + "dataloader_num_workers": 0, + "group_by_length": False, + } + fte.ComponentFactory.create_trainer_config.return_value = ( + MagicMock(), + MagicMock(), + {}, + ) + return FineTuningPipeline(cm), cm + + @patch("QEfficient.cloud.finetune_experimental.get_device_map", return_value=None) + @patch("QEfficient.cloud.finetune_experimental.ComponentFactory") + def test_pp_disabled_does_not_call_get_device_map(self, mock_factory, mock_get_dm): + """ + When pp_degree=1: + • get_device_map must NOT be called (PP is off). + • create_model must still be called exactly once with the right model type. + """ + mock_factory.create_model.return_value = MagicMock() + pipeline, _ = self._make_pipeline(pp_degree=1) + + mock_get_dm.assert_not_called() + + # Model creation + assert mock_factory.create_model.call_count == 1, ( + "create_model must be called exactly once even when PP is disabled" + ) + first_positional = mock_factory.create_model.call_args.args[0] + assert first_positional == "hf", ( + f"create_model's first arg must be the model type 'hf', got {first_positional!r}" + ) + + @patch( + "QEfficient.cloud.finetune_experimental.get_device_map", + return_value={"model.embed_tokens": 0, "model.norm": 1}, + ) + @patch("QEfficient.cloud.finetune_experimental.ComponentFactory") + def test_pp_enabled_calls_get_device_map(self, mock_factory, mock_get_dm): + """ + When pp_degree=2: + • get_device_map must be called EXACTLY once with the correct keyword args. + • create_model must also be called exactly once. 
+ """ + mock_factory.create_model.return_value = MagicMock() + pipeline, _ = self._make_pipeline(pp_degree=2) + + mock_get_dm.assert_called_once_with( + model_name=_LLAMA_MODEL_NAME, + device="qaic", + pp_degree=2, + ) + # Ensure model creation followed device map generation + assert mock_factory.create_model.call_count == 1, ( + "create_model must be called exactly once after get_device_map" + ) + + @patch( + "QEfficient.cloud.finetune_experimental.get_device_map", + return_value={"model.embed_tokens": 0, "model.norm": 1}, + ) + @patch("QEfficient.cloud.finetune_experimental.ComponentFactory") + def test_pp_device_map_injected_into_model_kwargs(self, mock_factory, mock_get_dm): + """ + The dict returned by get_device_map must be forwarded VERBATIM to + ComponentFactory.create_model as the 'device_map' kwarg. + + Also verifies that internal/PEFT fields are NOT leaked into model kwargs: + • 'pp_degree' must not appear (PP handled; no-op for the model loader) + • 'use_peft' must not appear (PEFT is applied separately by the trainer) + """ + expected_dmap = {"model.embed_tokens": 0, "model.norm": 1} + mock_get_dm.return_value = expected_dmap + mock_factory.create_model.return_value = MagicMock() + + pipeline, _ = self._make_pipeline(pp_degree=2) + + call_kwargs = mock_factory.create_model.call_args.kwargs + + # device_map must be present and equal to the exact dict from get_device_map + assert "device_map" in call_kwargs, ( + f"'device_map' must be forwarded to create_model; got kwargs: {list(call_kwargs)}" + ) + assert call_kwargs["device_map"] == expected_dmap, ( + f"device_map was modified before forwarding.\n" + f" Expected: {expected_dmap}\n" + f" Got: {call_kwargs['device_map']}" + ) + + # Internal fields must not leak through + assert "pp_degree" not in call_kwargs, ( + "'pp_degree' must not be forwarded to create_model – it is consumed by _create_model." + ) + assert "use_peft" not in call_kwargs, ( + "'use_peft' must not be forwarded to create_model – PEFT is applied by the trainer." + ) + + @patch("QEfficient.cloud.finetune_experimental.get_device_map", return_value=None) + @patch("QEfficient.cloud.finetune_experimental.ComponentFactory") + def test_pp_disabled_no_device_map_in_kwargs(self, mock_factory, mock_get_dm): + """ + When pp_degree=1: + • 'device_map' in kwargs must NOT be a PP-generated dict (it may + still be a user-supplied string like 'auto' from the YAML config, + but cannot be a layer-to-device dict that was computed by PP). + • 'pp_degree' must not appear in kwargs. + • 'use_peft' must not appear in kwargs. + """ + mock_factory.create_model.return_value = MagicMock() + pipeline, _ = self._make_pipeline(pp_degree=1) + + call_kwargs = mock_factory.create_model.call_args.kwargs + + device_map_val = call_kwargs.get("device_map", None) + assert not isinstance(device_map_val, dict), ( + f"A PP-generated dict device_map must not be injected when pp_degree=1; got {device_map_val!r}" + ) + assert "pp_degree" not in call_kwargs, "'pp_degree' must not be forwarded to create_model." + assert "use_peft" not in call_kwargs, "'use_peft' must not be forwarded to create_model." + + +# --------------------------------------------------------------------------- +# 5. 
End-to-end training tests (need model weights / multi-QAIC)
+# ---------------------------------------------------------------------------
+
+
+def _make_tiny_dataset(n: int = 15) -> Dataset:
+    """Build an n-sample dataset from the fixed _ALPACA_SAMPLES list."""
+    return Dataset.from_dict({"text": [s["text"] for s in _ALPACA_SAMPLES[:n]]})
+
+
+def _sft_config(output_dir: str, fp16: bool = False):
+    """Minimal SFTConfig for a fast smoke-test run (5 steps, 1 mid-run eval)."""
+    from trl import SFTConfig
+
+    return SFTConfig(
+        output_dir=output_dir,
+        max_length=128,
+        per_device_train_batch_size=1,
+        per_device_eval_batch_size=1,
+        num_train_epochs=1,
+        max_steps=5,  # 5 optimiser steps – fast enough for CI
+        eval_steps=3,  # one mid-training evaluation
+        eval_strategy="steps",
+        save_strategy="no",
+        logging_steps=1,
+        fp16=fp16,
+        bf16=False,
+        report_to="none",  # no wandb / tensorboard during tests
+    )
+
+
+class TestPPE2ETraining:
+    """
+    End-to-end training + evaluation for meta-llama/Llama-3.2-1B.
+
+    The model is downloaded automatically on first run.
+    Set the HF_TOKEN environment variable (or log in via ``huggingface-cli
+    login``) before running, as Llama-3.2-1B is a gated repository.
+
+    Skip conditions
+    ---------------
+    • The pp_degree=2 tests are skipped when < 2 QAIC devices are available.
+
+    """
+
+    OUTPUT_DIR_SINGLE = "/tmp/test_pp_llama_single"
+    OUTPUT_DIR_PP2 = "/tmp/test_pp_llama_pp2"
+    _REDUCED_LAYERS = 2  # Use 2-layer model for speed; PP logic is layer-count agnostic
+    _MAX_STEPS = 5
+
+    @pytest.fixture(autouse=True)
+    def cleanup(self):
+        """Remove output directories after each test."""
+        yield
+        for d in (self.OUTPUT_DIR_SINGLE, self.OUTPUT_DIR_PP2):
+            if os.path.exists(d):
+                shutil.rmtree(d, ignore_errors=True)
+
+    # -- helpers -------------------------------------------------------------
+
+    def _load_llama_model_and_tokenizer(self, device_map=None):
+        """
+        Load Llama-3.2-1B with num_hidden_layers reduced to _REDUCED_LAYERS.
+        Optionally injects a PP device_map.
+        """
+        from QEfficient.finetune.experimental.core.component_registry import ComponentFactory
+        from QEfficient.finetune.experimental.core.model import HFModel  # noqa: F401
+
+        kwargs = {
+            "auto_class_name": "AutoModelForCausalLM",
+            "use_cache": False,
+            "attn_implementation": "eager",
+            "num_hidden_layers": self._REDUCED_LAYERS,
+        }
+        if device_map is not None:
+            kwargs["device_map"] = device_map
+        return ComponentFactory.create_model("hf", _LLAMA_MODEL_NAME, **kwargs)
+
+    def _make_device_map_for_reduced_model(self, pp_degree: int, local_rank: int = 0) -> Dict[str, int]:
+        """PP device_map for the 2-layer Llama-3.2-1B (tied embeddings)."""
+        first_device = local_rank * pp_degree
+        last_device = first_device + pp_degree - 1
+        return {
+            "model.embed_tokens": first_device,
+            "lm_head": first_device,  # tied
+            "model.norm": last_device,
+            "model.rotary_emb": last_device,
+            "model.layers.0": first_device,
+            "model.layers.1": last_device,
+        }
+
+    # -- multi-device (pp_degree=2) ------------------------------------------
+
+    @pytest.mark.skipif(
+        torch.qaic.device_count() < 2,
+        reason="PP with pp_degree=2 requires at least 2 QAIC devices",
+    )
+    def test_pp2_device_map_structure_for_reduced_model(self):
+        """
+        Structural invariants of the device_map for the 2-layer reduced model:
+
+        • embed_tokens and lm_head are CO-LOCATED (tied-embedding invariant).
+        • norm and rotary_emb are CO-LOCATED (tail co-location invariant).
+        • embed_tokens and norm are on DIFFERENT devices (pipeline actually splits).
+ • layers.0 and layers.1 are on DIFFERENT devices (both stages used). + • The complete set of assigned devices is exactly {0, 1} (no ghost stages). + """ + pp_degree = 2 + dmap = self._make_device_map_for_reduced_model(pp_degree=pp_degree) + + # Co-location invariants + assert dmap["lm_head"] == dmap["model.embed_tokens"], ( + "Tied model: lm_head must be co-located with embed_tokens, " + f"got lm_head={dmap['lm_head']} embed_tokens={dmap['model.embed_tokens']}" + ) + assert dmap["model.rotary_emb"] == dmap["model.norm"], ( + "rotary_emb must be co-located with model.norm, " + f"got rotary_emb={dmap['model.rotary_emb']} norm={dmap['model.norm']}" + ) + + # Split invariants + assert dmap["model.embed_tokens"] != dmap["model.norm"], ( + "embed_tokens and norm must be on different devices (pipeline split)." + ) + assert dmap["model.layers.0"] != dmap["model.layers.1"], ( + "layers.0 and layers.1 must be on different devices (pp_degree=2 split)." + ) + + # Completeness: both stage devices are used, none are empty + used_devices = set(dmap.values()) + expected_devices = set(range(pp_degree)) + assert used_devices == expected_devices, f"Device set mismatch: expected {expected_devices}, got {used_devices}" + + @pytest.mark.skipif( + torch.qaic.device_count() < 2, + reason="PP with pp_degree=2 requires at least 2 QAIC devices", + ) + def test_pp2_training_with_lora(self): + """ + LoRA + PP: verify PEFT adapters are compatible with multi-device placement. + + Advanced assertions + ------------------- + • LoRA trainable / total ratio < 1%. + • LoRA 'lora_A' weights exist in the named parameters. + • LoRA weights span BOTH GPUs (adapters were placed across the pipeline). + • Both train_loss and eval_loss are finite and strictly positive. + """ + from peft import LoraConfig + from trl import SFTConfig, SFTTrainer + + dmap = self._make_device_map_for_reduced_model(pp_degree=2) + hf_model = self._load_llama_model_and_tokenizer(device_map=dmap) + lora_cfg = LoraConfig( + task_type="CAUSAL_LM", + r=4, + lora_alpha=8, + lora_dropout=0.05, + target_modules=["q_proj", "v_proj"], + bias="none", + ) + + trainer = SFTTrainer( + model=hf_model.model, + args=SFTConfig( + output_dir=self.OUTPUT_DIR_PP2, + max_length=128, + per_device_train_batch_size=1, + num_train_epochs=1, + max_steps=self._MAX_STEPS, + eval_steps=3, + eval_strategy="steps", + save_strategy="no", + logging_steps=1, + fp16=True, + bf16=False, + report_to="none", + ), + train_dataset=_make_tiny_dataset(12), + eval_dataset=_make_tiny_dataset(3), + processing_class=hf_model.tokenizer, + peft_config=lora_cfg, + ) + + train_result = trainer.train() + _assert_finite_positive_loss(train_result.training_loss, "PP=2 LoRA train_loss") + + eval_metrics = trainer.evaluate() + assert "eval_loss" in eval_metrics, "eval_metrics must contain 'eval_loss'" + _assert_finite_positive_loss(eval_metrics["eval_loss"], "PP=2 LoRA eval_loss") + + # LoRA efficiency + trainable = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad) + total = sum(p.numel() for p in trainer.model.parameters()) + ratio = trainable / total + assert ratio < 0.01, f"LoRA trainable/total = {ratio:.4%} ≥ 1% – unexpectedly high for r=4." + + # LoRA adapters must exist + lora_params = [(n, p) for n, p in trainer.model.named_parameters() if "lora_A" in n] + assert lora_params, "No lora_A parameters found after PEFT wrapping." 
+
+        # LoRA weights must span BOTH devices (the adapter is across the pipeline)
+        lora_devices = {f"{p.device.type}:{p.device.index}" for _, p in lora_params}
+        assert "qaic:0" in lora_devices, "No LoRA adapter on qaic:0 – stage 0 is untrained."
+        assert "qaic:1" in lora_devices, "No LoRA adapter on qaic:1 – stage 1 is untrained."
diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index 06347ecc9f..c59de5cba4 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -327,3 +327,11 @@ pip install pytest pytest-mock
 ```bash
 QAIC_VISIBLE_DEVICES=0 python -m pytest QEfficient/finetune/experimental/tests/
 ```
+
+Two of the pipeline parallelism tests require 2 devices:
+
+```bash
+QAIC_VISIBLE_DEVICES=0,1 python -m pytest QEfficient/finetune/experimental/tests/test_pipeline_parallelism.py
+```
+
+If only one device is passed, those two tests are skipped and the remaining tests run successfully.

From 5d5f0ce4ea69981ed25011a85e4170610435f0f1 Mon Sep 17 00:00:00 2001
From: Ann Kuruvilla
Date: Fri, 27 Mar 2026 10:38:08 +0000
Subject: [PATCH 10/23] Fixed repolinter error

Signed-off-by: Ann Kuruvilla
---
 QEfficient/finetune/experimental/core/utils/constants.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/QEfficient/finetune/experimental/core/utils/constants.py b/QEfficient/finetune/experimental/core/utils/constants.py
index ed7c9e1bfa..76b3ccd2b1 100644
--- a/QEfficient/finetune/experimental/core/utils/constants.py
+++ b/QEfficient/finetune/experimental/core/utils/constants.py
@@ -1 +1,8 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
 DTYPE_MAPPING = {"fp16": "float16", "bf16": "bfloat16"}

From 53d6855cfa7b4f7e2bda67234592566779808f33 Mon Sep 17 00:00:00 2001
From: Ann Kuruvilla
Date: Mon, 30 Mar 2026 06:04:46 +0000
Subject: [PATCH 11/23] Corrected file paths

Signed-off-by: Ann Kuruvilla
---
 QEfficient/finetune/experimental/configs/sft_ddp_config.yaml | 3 ++-
 docs/source/hf_finetune.md                                   | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
index a426dd6140..f4059aa1e4 100644
--- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
 # Model configuration
 model:
   model_type: "hf"  # Hugging Face model
@@ -25,7 +26,7 @@ dataset:
   prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
   completion_template: "{answer}"  # Model will be trained on this part.
config_name: "main"                                # Config name for the dataset
-  data_seed: 42                                      # Random seed for dataset shuffling
+  data_seed: 42                                      # Random seed for deterministic dataset shuffling and reproducibility
diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index c59de5cba4..4e252605e0 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -298,9 +298,9 @@ training:
 
 **PP only — single process, 2 stages (via YAML)**
 ```bash
-python -m QEfficient.cloud.finetune_experimental configs/sft_single_device_gsm8k_config.yaml
+python -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
 ```
-where `sft_single_device_gsm8k_config.yaml` contains `pp_degree: 2` under `training:`.
+where change `pp_degree: 2` under `training:` for `sft_single_device_gsm8k_config.yaml` to enable pipeline parallelism of degree 2.
 
 **PP only — single process, 2 stages (via CLI flags)**
 ```bash

From fb3fb863146750ffbea431a36bb9e3d4b7115873 Mon Sep 17 00:00:00 2001
From: Ann Kuruvilla
Date: Mon, 30 Mar 2026 06:08:43 +0000
Subject: [PATCH 12/23] Updates

Signed-off-by: Ann Kuruvilla
---
 docs/source/hf_finetune.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index 4e252605e0..e2117ee803 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -300,7 +300,7 @@ training:
 ```bash
 python -m QEfficient.cloud.finetune_experimental QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
 ```
-where change `pp_degree: 2` under `training:` for `sft_single_device_gsm8k_config.yaml` to enable pipeline parallelism of degree 2.
+where the user can set `pp_degree: 2` under the `training:` section of the input config file `sft_single_device_gsm8k_config.yaml` to enable pipeline parallelism of degree 2.
 
 **PP only — single process, 2 stages (via CLI flags)**
 ```bash

From 6241bdd7f742383ed0f70a7fdfc27ecf40001f37 Mon Sep 17 00:00:00 2001
From: Ann Kuruvilla
Date: Mon, 30 Mar 2026 07:40:35 +0000
Subject: [PATCH 13/23] Added Trainer arguments reference

Signed-off-by: Ann Kuruvilla
---
 docs/source/hf_finetune.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index e2117ee803..4b339c3f0e 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -18,6 +18,9 @@ The **QEfficient Fine-Tune Module** is a component of the QEfficient project foc
 
 ## Getting Started
 
+Transformers' [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#trainer) class goes hand-in-hand with the [TrainingArguments](https://huggingface.co/docs/transformers/v5.2.0/en/main_classes/trainer#transformers.TrainingArguments) class, which offers a wide range of options to customize how a model is trained.
+Since this stack is built on HF's Trainer class, refer to the docs above when configuring the config.yaml file for fine-tuning.
+
 ### Installation (ENV set up)
 Install the same prerequisites as **QEfficient**, additionally **QAIC PyTorch Eager mode** as needed.
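For readers configuring the stack for the first time, the TrainingArguments options referenced in PATCH 13 land in the `training:` section of the YAML config consumed by ConfigManager. A minimal sketch follows; the keys shown mirror the TrainingConfig fields exercised by the tests later in this series, and every value is illustrative rather than a recommended setting:

```yaml
# Minimal, illustrative training section; most keys pass through to
# HF TrainingArguments, while pp_degree is consumed by the QEff stack.
training:
  type: "sft"            # trainer type dispatched by ComponentFactory
  output_dir: "./outputs"
  num_train_epochs: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  logging_strategy: "steps"
  logging_steps: 1
  eval_strategy: "steps"
  eval_steps: 1
  max_steps: 20
  seed: 42
  pp_degree: 1           # >1 enables pipeline parallelism (see device_map_utils)
```

Since the trainer is built directly on HF's Trainer, other TrainingArguments options from the linked Hugging Face docs should be configurable the same way.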
From d78271710eb13d057b5b3900c9e10e29042811cb Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Mon, 30 Mar 2026 09:22:56 +0000 Subject: [PATCH 14/23] Addressed some Qgenie reviews and fixed code Signed-off-by: Ann Kuruvilla --- QEfficient/cloud/finetune_experimental.py | 15 ++++++++++++--- .../experimental/core/utils/device_map_utils.py | 10 +++++++++- docs/source/hf_finetune.md | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 43fcde5f8c..34ac493726 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -66,9 +66,18 @@ def __init__(self, config_manager: ConfigManager): # Create model and tokenizer logger.log_rank_zero("Loading model and tokenizer...") - model_instance = self._create_model() - self.model = model_instance.model - self.tokenizer = model_instance.tokenizer + try: + model_instance = self._create_model() + self.model = model_instance.model + self.tokenizer = model_instance.tokenizer + except Exception as e: + logger.log_rank_zero(f"Failed to load model: {e}", level=logging.ERROR) + # Cleanup datasets if already created + if hasattr(self, 'train_dataset'): + del self.train_dataset + if hasattr(self, 'eval_dataset'): + del self.eval_dataset + raise RuntimeError(f"Model loading failed: {e}") from e # Create optimizer logger.log_rank_zero("Preparing optimizer...") diff --git a/QEfficient/finetune/experimental/core/utils/device_map_utils.py b/QEfficient/finetune/experimental/core/utils/device_map_utils.py index c9ac24bace..3970b052bd 100644 --- a/QEfficient/finetune/experimental/core/utils/device_map_utils.py +++ b/QEfficient/finetune/experimental/core/utils/device_map_utils.py @@ -34,12 +34,20 @@ def get_device_map( Returns: Dict: A dictionary mapping layer names to device IDs, or None if no PP. """ - if pp_degree <= 1: + # Validate pp_degree + if not isinstance(pp_degree, int): + raise TypeError(f"pp_degree must be an integer, got {type(pp_degree).__name__}") + + if pp_degree < 1: + raise ValueError(f"pp_degree must be >= 1, got {pp_degree}") + + if pp_degree == 1: return None torch_device = torch.device(device) num_available_devices = getattr(torch, torch_device.type).device_count() + if pp_degree > num_available_devices: raise ValueError( f"pp_degree ({pp_degree}) cannot exceed the number of available {device} devices " diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md index 4b339c3f0e..b73cdc50f1 100644 --- a/docs/source/hf_finetune.md +++ b/docs/source/hf_finetune.md @@ -80,6 +80,7 @@ pip install trl==0.22.0 cd .. && git clone https://github.com/quic-swatia/transformers.git cd transformers git checkout version-4.55.0 && pip install -e . +pip install datasets==4.5.0 cd .. && cd efficient-transformers QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py \ QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml @@ -101,6 +102,7 @@ pip install trl==0.22.0 cd .. && git clone https://github.com/quic-swatia/transformers.git cd transformers git checkout version-4.55.0 && pip install -e . +pip install datasets==4.5.0 cd .. 
&& cd efficient-transformers CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 -m QEfficient.cloud.finetune_experimental \ --device cuda --num_epochs 1 --model_name meta-llama/Llama-3.2-3B \ From a0f0e80bf91be4ade6880033269d5329a60cb3ed Mon Sep 17 00:00:00 2001 From: Ann Kuruvilla Date: Mon, 30 Mar 2026 10:48:14 +0000 Subject: [PATCH 15/23] Added security checks for import_func in dataset.py Signed-off-by: Ann Kuruvilla --- QEfficient/cloud/finetune_experimental.py | 4 +- .../finetune/experimental/core/dataset.py | 210 +++++++++++++++++- .../core/utils/device_map_utils.py | 5 +- .../experimental/tests/test_dataset.py | 12 +- 4 files changed, 215 insertions(+), 16 deletions(-) diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 34ac493726..ce024828c8 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -73,9 +73,9 @@ def __init__(self, config_manager: ConfigManager): except Exception as e: logger.log_rank_zero(f"Failed to load model: {e}", level=logging.ERROR) # Cleanup datasets if already created - if hasattr(self, 'train_dataset'): + if hasattr(self, "train_dataset"): del self.train_dataset - if hasattr(self, 'eval_dataset'): + if hasattr(self, "eval_dataset"): del self.eval_dataset raise RuntimeError(f"Model loading failed: {e}") from e diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index 5f0931f796..8c36036e29 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -10,10 +10,11 @@ """ import importlib +import inspect import os import re from abc import ABC, abstractmethod -from typing import Any, Callable, Dict +from typing import Any, Callable, Dict, Set from datasets import load_dataset, load_dataset_builder from torch.utils.data import Dataset @@ -25,6 +26,60 @@ validate_json_structure, ) +# Whitelist of allowed modules +ALLOWED_MODULE_PREFIXES: Set[str] = { + "QEfficient.finetune.experimental.preprocessing", + "custom_funcs", # Assuming this is a user-defined module for custom functions + "test_module", # Used in tests, should be removed or renamed in production +} +# Blacklist of dangerous modules/functions +DANGEROUS_MODULES: Set[str] = { + "os", + "sys", + "subprocess", + "shutil", + "pickle", + "eval", + "exec", + "compile", + "__import__", + "importlib", + "builtins", + "pty", + "commands", + "popen2", + "socket", + "urllib", + "requests", + "http", + "ftplib", + "smtplib", +} +DANGEROUS_FUNCTIONS: Set[str] = { + "system", + "popen", + "exec", + "eval", + "compile", + "__import__", + "open", + "input", + "execfile", + "reload", + "rmtree", + "remove", + "unlink", + "chmod", + "chown", + "kill", + "Popen", + "call", + "run", + "check_output", + "getoutput", + "getstatusoutput", +} + logger = Logger(__name__) @@ -238,18 +293,159 @@ def _setup_templates(self, dataset, dataset_columns): dataset = dataset.filter(lambda example: self._filter_empty_or_none_samples(example, relevant_columns)) return dataset + def _validate_module_path(self, module_path: str) -> None: + """ + Validate that the module path is safe to import. + + Args: + module_path: Module path to validate + + Raises: + ValueError: If module is not allowed + """ + # Check for empty or invalid path + if not module_path or not module_path.strip(): + raise ValueError("Module path cannot be empty") + + # Check for path traversal attempts + if ".." 
in module_path or "/" in module_path or "\\" in module_path: + raise ValueError(f"Module path '{module_path}' contains invalid characters. Path traversal is not allowed.") + + # Check against allowed prefixes + is_allowed = any(module_path.startswith(prefix) for prefix in ALLOWED_MODULE_PREFIXES) + + if not is_allowed: + raise ValueError( + f"Module '{module_path}' is not in the allowed modules list. " + f"Only modules starting with {ALLOWED_MODULE_PREFIXES} are allowed. " + "This is a security restriction to prevent arbitrary code execution." + ) + + # Check against dangerous modules + module_parts = module_path.split(".") + for part in module_parts: + if part in DANGEROUS_MODULES: + raise ValueError( + f"Module '{module_path}' contains dangerous module '{part}'. " + "Importing this module is not allowed for security reasons." + ) + + def _validate_function_name(self, function_name: str) -> None: + """ + Validate that the function name is safe. + + Args: + function_name: Function name to validate + + Raises: + ValueError: If function name is dangerous or invalid + """ + # Check for empty name + if not function_name or not function_name.strip(): + raise ValueError("Function name cannot be empty") + + # Check against dangerous functions + if function_name in DANGEROUS_FUNCTIONS: + raise ValueError(f"Function '{function_name}' is not allowed for security reasons.") + + # Check for dunder methods + if function_name.startswith("__") and function_name.endswith("__"): + raise ValueError(f"Dunder method '{function_name}' is not allowed for security reasons.") + + # Check for private methods + if function_name.startswith("_"): + raise ValueError( + f"Private function '{function_name}' is not allowed. Only public functions can be imported." + ) + + # Validate identifier + if not function_name.isidentifier(): + raise ValueError( + f"Invalid function name '{function_name}'. Function name must be a valid Python identifier." + ) + + def _validate_function(self, func: Callable, function_name: str, module_path: str) -> None: + """ + Validate that the function is safe to use. + + Args: + func: Function to validate + function_name: Name of the function + module_path: Module path + + Raises: + ValueError: If function is not safe + """ + # Check if callable + if not callable(func): + raise ValueError(f"'{function_name}' in module '{module_path}' is not callable. Got type: {type(func)}") + + # Check if built-in (potentially dangerous) + if inspect.isbuiltin(func): + raise ValueError(f"Built-in function '{function_name}' is not allowed for security reasons.") + + # Check function module + func_module = inspect.getmodule(func) + if func_module: + func_module_name = func_module.__name__ + + # Verify it's from the expected module + if not func_module_name.startswith(tuple(ALLOWED_MODULE_PREFIXES)): + raise ValueError( + f"Function '{function_name}' is from module '{func_module_name}' " + f"which is not in allowed prefixes: {ALLOWED_MODULE_PREFIXES}" + ) + + # Check for dangerous modules + for dangerous in DANGEROUS_MODULES: + if dangerous in func_module_name: + raise ValueError(f"Function '{function_name}' is from dangerous module '{func_module_name}'.") + + # Check function signature + try: + sig = inspect.signature(func) + params = list(sig.parameters.keys()) + + # Function should accept at least one parameter + if len(params) < 1: + raise ValueError( + f"Function '{function_name}' must accept at least one parameter (example dict). 
" + f"Got signature: {sig}" + ) + + except (ValueError, TypeError) as e: + raise ValueError(f"Cannot inspect signature of '{function_name}': {e}. Function may not be safe to use.") + def import_func(self, func_path: str) -> Callable: + """ + Safely import a function from a module with security checks. + Args: + func_path: Path in format 'module_path:function_name' + Returns: + Callable function + """ if ":" not in func_path: - raise ValueError("func_path must be in the format 'module_file_path:function_name'.") - module_file_path, function_name = func_path.split(":") - + raise ValueError(f"func_path must be in the format 'module_file_path:function_name'. Got: '{func_path}'") + module_file_path, function_name = func_path.split(":", 1) + # Security validations + self._validate_module_path(module_file_path) + self._validate_function_name(function_name) try: module = importlib.import_module(module_file_path) - except Exception: - raise RuntimeError(f"Unable to import module : {module_file_path}.") + except ImportError as e: + raise RuntimeError( + f"Unable to import module '{module_file_path}': {e}. " + "Please ensure the module exists and is in PYTHONPATH." + ) + except Exception as e: + raise RuntimeError(f"Error importing module '{module_file_path}': {e}") if not hasattr(module, function_name): raise ValueError(f"Function {function_name} not found in module {module_file_path}.") - return getattr(module, function_name) + func = getattr(module, function_name) + # Validate function + self._validate_function(func, function_name, module_file_path) + + return func def _filter_empty_or_none_samples(self, example: Dict[str, Any], relevant_columns: list) -> bool: """ diff --git a/QEfficient/finetune/experimental/core/utils/device_map_utils.py b/QEfficient/finetune/experimental/core/utils/device_map_utils.py index 3970b052bd..0845685343 100644 --- a/QEfficient/finetune/experimental/core/utils/device_map_utils.py +++ b/QEfficient/finetune/experimental/core/utils/device_map_utils.py @@ -37,17 +37,16 @@ def get_device_map( # Validate pp_degree if not isinstance(pp_degree, int): raise TypeError(f"pp_degree must be an integer, got {type(pp_degree).__name__}") - + if pp_degree < 1: raise ValueError(f"pp_degree must be >= 1, got {pp_degree}") - + if pp_degree == 1: return None torch_device = torch.device(device) num_available_devices = getattr(torch, torch_device.type).device_count() - if pp_degree > num_available_devices: raise ValueError( f"pp_degree ({pp_degree}) cannot exceed the number of available {device} devices " diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py index 81d37db903..6d9366a31c 100644 --- a/QEfficient/finetune/experimental/tests/test_dataset.py +++ b/QEfficient/finetune/experimental/tests/test_dataset.py @@ -348,7 +348,7 @@ def test_sft_dataset_invalid_module_import(self): dataset_name="dummy", split="train", json_file_path=self.json_file_path, - prompt_func="nonexistent_module:function", + prompt_func="QEfficient.finetune.experimental.preprocessing.nonexistent_module:function", completion_template="A: {answer}", ) @@ -377,9 +377,13 @@ def test_sft_dataset_invalid_function_name(self): self.assertIn("not found in module", str(context.exception)) finally: - sys.path.remove(self.test_dir) - if os.path.exists(func_file_path): - os.remove(func_file_path) + if self.test_dir in sys.path: + sys.path.remove(self.test_dir) + try: + if os.path.exists(func_file_path): + os.remove(func_file_path) + except Exception: + pass 
def test_sft_dataset_filter_empty_or_none_samples(self):
        """Test filtering of samples with empty or None values."""

From a3fefcf947230c6ca60c04777e2c6e2d70d61d77 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Tue, 31 Mar 2026 11:14:27 +0530
Subject: [PATCH 16/23] Adding reference data test for finetune (#897)

Added a test case that compares loss and metrics of different SDKs
against the stable SDK

Signed-off-by: Tanisha Chawada
---
 .../finetune/experimental/tests/constants.py  |  20 +-
 .../experimental/tests/reference_data.py      | 196 ++++++++++++++++
 .../experimental/tests/test_integrated.py     | 212 +++++++++++++++---
 3 files changed, 385 insertions(+), 43 deletions(-)
 create mode 100644 QEfficient/finetune/experimental/tests/reference_data.py

diff --git a/QEfficient/finetune/experimental/tests/constants.py b/QEfficient/finetune/experimental/tests/constants.py
index 578a165756..1dfd806581 100644
--- a/QEfficient/finetune/experimental/tests/constants.py
+++ b/QEfficient/finetune/experimental/tests/constants.py
@@ -53,8 +53,8 @@ class AutoClassName(str, Enum):
 
 TEST_LORA_R = 8
 TEST_LORA_ALPHA = 16
-TEST_LORA_DROPOUT = 0.1
-TEST_LORA_TARGET_MODULES_LLAMA = ["q_proj", "v_proj"]
+TEST_LORA_DROPOUT = 0
+TEST_LORA_TARGET_MODULES_LLAMA = ["k_proj", "v_proj"]
 TEST_LORA_TARGET_MODULES_BERT = ["query", "value"]
 TEST_LORA_BIAS = "none"
 
 # ============================================================================
 # Training Parameters
 # ============================================================================
 
-TEST_LEARNING_RATE = 5e-5
+TEST_LEARNING_RATE = 1e-4
 TEST_WEIGHT_DECAY = 0.01
-TEST_WARMUP_STEPS = 5
 TEST_NUM_TRAIN_EPOCHS = 1
 TEST_LOGGING_STEPS = 1
 TEST_PER_DEVICE_BATCH_SIZE = 1
-TEST_MAX_SEQ_LENGTH_CAUSAL = 256
+TEST_MAX_SEQ_LENGTH_CAUSAL = 512
 TEST_MAX_SEQ_LENGTH_SEQ_CLS = 128
 TEST_MAX_LENGTH = 128
 TEST_NUM_HIDDEN_LAYERS = 2
+TEST_EVAL_STEPS = 1
+TEST_MAX_STEPS = 20
 
 # ============================================================================
 # Dataset Paths and Names
 # ============================================================================
 
 # HuggingFace Dataset Names
-HF_DATASET_ALPACA = "tatsu-lab/alpaca"
+HF_DATASET_ALPACA = "yahma/alpaca-cleaned"
 HF_DATASET_GSM8K = "openai/gsm8k"
 HF_DATASET_GSM8K_CONFIG = "main"
+HF_DATASET_ALPACA_CONFIG = "default"
 HF_DATASET_IMDB = "stanfordnlp/imdb"
 
 # Dataset subset size for testing
-TEST_DATASET_SUBSET_SIZE = 10
+TEST_DATASET_SUBSET_SIZE = 30
 
 # ============================================================================
 # Model Names
 # ============================================================================
@@ -103,7 +105,9 @@ class AutoClassName(str, Enum):
 OPT_SGD_MOMENTUM = 0.9
 
 # ============================================================================
-# Loss Parameters
+# Reference Parameters
 # ============================================================================
 
 TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD = 2.0
+METRIC_ATOL = 0.5
+LOSS_ATOL = 0.5
diff --git a/QEfficient/finetune/experimental/tests/reference_data.py b/QEfficient/finetune/experimental/tests/reference_data.py
new file mode 100644
index 0000000000..0c98e40cbf
--- /dev/null
+++ b/QEfficient/finetune/experimental/tests/reference_data.py
@@ -0,0 +1,196 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""Reference data for the finetune tests from SDK version 1.22.0.32"""
+
+# A dictionary to hold all reference data for all test sets.
+REFERENCE_DATA = { + # Scenario 1: Single-device llama 3.2-1B training on Alpaca dataset. + "llama_3.2_1B_config_alpaca_single_device": { + "description": "Baseline for Llama on Alpaca single-device", + "train_step_losses": [ + 1.4349, + 2.2995, + 2.2019, + 2.6829, + 2.6993, + 3.1324, + 0.9013, + 2.0960, + 2.8757, + 2.8672, + 2.7872, + 2.9132, + 1.6886, + 1.8413, + 2.7607, + 2.7590, + 2.3171, + 2.7947, + 1.8753, + 1.9519, + ], + "eval_step_losses": [ + 2.6338751316070557, + 2.6335389614105225, + 2.633760929107666, + 2.633681535720825, + 2.6326093673706055, + 2.631838798522949, + 2.631136655807495, + 2.6302545070648193, + 2.6286115646362305, + 2.6276299953460693, + 2.6258647441864014, + 2.623645544052124, + 2.6215550899505615, + 2.619070053100586, + 2.619069814682007, + 2.6169092655181885, + 2.614532470703125, + 2.6107428073883057, + 2.608034372329712, + 2.6044723987579346, + ], + "train_step_metrics": [ + 4.19922506528457, + 9.969196610152352, + 9.04217732555441, + 14.627451456085135, + 14.86931955734922, + 22.928943023137037, + 2.462802674467, + 8.133570475257905, + 17.737836262089058, + 17.587703621506293, + 16.23549670841589, + 18.41563440957389, + 5.411898740757472, + 6.304729083647153, + 15.810906710189084, + 15.784051002600961, + 10.146207597963729, + 16.357720700775936, + 6.522775659532417, + 7.042054781812275, + ], + "eval_step_metrics": [ + 13.927636889811502, + 13.922955620276747, + 13.926046409688226, + 13.92494081758739, + 13.910018937577274, + 13.899304438973257, + 13.88954856901887, + 13.877301323952516, + 13.854520435781245, + 13.840927936073056, + 13.816516774237883, + 13.78588915533494, + 13.757100487984223, + 13.722956029013478, + 13.722952757206189, + 13.693335649231411, + 13.660828047312608, + 13.609156079932017, + 13.572346435255119, + 13.524088094436797, + ], + }, + # Scenario 2: Single-device llama 3.2-1B training on GSM8k dataset. 
+ "llama_3.2_1B_config_gsm8k_single_device": { + "description": "Baseline for Llama on GSM8k single-device", + "train_step_losses": [ + 1.3673, + 1.4945, + 1.9357, + 1.6798, + 2.0900, + 2.4106, + 1.6770, + 1.9297, + 1.6710, + 1.6080, + 2.0020, + 2.4688, + 1.9058, + 1.0837, + 1.6257, + 1.7367, + 2.1968, + 2.0709, + 1.2537, + 1.5762, + ], + "eval_step_losses": [ + 1.8449769020080566, + 1.8452266454696655, + 1.8454170227050781, + 1.8453651666641235, + 1.845239281654358, + 1.8449088335037231, + 1.844594120979309, + 1.844080924987793, + 1.8441227674484253, + 1.844334602355957, + 1.843506932258606, + 1.8429961204528809, + 1.8429960012435913, + 1.842812418937683, + 1.842100739479065, + 1.8413892984390259, + 1.8405832052230835, + 1.8402591943740845, + 1.8391698598861694, + 1.8383756875991821, + ], + "train_step_metrics": [ + 3.924739580044779, + 4.457107441895579, + 6.928892583003728, + 5.364482967231716, + 8.084915164305059, + 11.140643526266233, + 5.349483424083576, + 6.887443698505779, + 5.31748262194783, + 4.992815604333296, + 7.40384899909771, + 11.808268424182273, + 6.724785304521, + 2.9555950466184813, + 5.081975175009138, + 5.678573177316005, + 8.996179615055848, + 7.931958669171997, + 3.5032811472953274, + 4.836541987577648, + ], + "eval_step_metrics": [ + 6.327953625692157, + 6.329534188094885, + 6.330739302024553, + 6.330411023459722, + 6.329614169763233, + 6.327522906012752, + 6.325531868624348, + 6.322286463860749, + 6.322551009417799, + 6.323890488295427, + 6.318658558693856, + 6.315431737525067, + 6.315430984666982, + 6.3142716897002895, + 6.309779550913647, + 6.305292111251173, + 6.30021150605156, + 6.298170499844544, + 6.291313421012602, + 6.286319017715728, + ], + }, +} diff --git a/QEfficient/finetune/experimental/tests/test_integrated.py b/QEfficient/finetune/experimental/tests/test_integrated.py index d13d237bc7..207cf458e6 100644 --- a/QEfficient/finetune/experimental/tests/test_integrated.py +++ b/QEfficient/finetune/experimental/tests/test_integrated.py @@ -10,12 +10,14 @@ Tests the complete workflow using all components from the core/ directory. 
""" +import math import os import shutil import tempfile from dataclasses import dataclass from typing import Optional +import numpy as np import pytest import torch @@ -31,12 +33,17 @@ TrainingConfig, ) from QEfficient.finetune.experimental.core.logger import Logger +from QEfficient.finetune.experimental.tests import reference_data as ref_data from QEfficient.finetune.experimental.tests.constants import ( HF_DATASET_ALPACA, + HF_DATASET_ALPACA_CONFIG, HF_DATASET_GSM8K, HF_DATASET_GSM8K_CONFIG, HF_DATASET_IMDB, + LOSS_ATOL, + METRIC_ATOL, TEST_DATASET_SUBSET_SIZE, + TEST_EVAL_STEPS, TEST_LEARNING_RATE, TEST_LOGGING_STEPS, TEST_LORA_ALPHA, @@ -47,18 +54,18 @@ TEST_LORA_TARGET_MODULES_LLAMA, TEST_MAX_SEQ_LENGTH_CAUSAL, TEST_MAX_SEQ_LENGTH_SEQ_CLS, + TEST_MAX_STEPS, TEST_MODEL_LLAMA, - TEST_NUM_HIDDEN_LAYERS, TEST_NUM_TRAIN_EPOCHS, TEST_PER_DEVICE_BATCH_SIZE, TEST_SEED, - TEST_WARMUP_STEPS, TEST_WEIGHT_DECAY, TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD, AutoClassName, DatasetType, TaskType, ) +from QEfficient.finetune.utils.helper import get_rank, get_world_size logger = Logger(__name__) # ============================================================================ @@ -66,6 +73,98 @@ # ============================================================================ +def clean_up(path): + if os.path.isdir(path) and os.path.exists(path): + shutil.rmtree(path) + if os.path.isfile(path): + os.remove(path) + + +def assert_list_close(ref_list, actual_list, atol, name, scenario_key, current_world_size, current_rank): + """ + Asserts that two lists of floats are numerically close element-wise. + If not close, reports the step numbers and the differences at those steps. + """ + # --- Initial Checks --- + assert actual_list is not None and isinstance(actual_list, list), ( + f"Actual {name} data is missing or not a list for scenario '{scenario_key}'." + ) + assert len(ref_list) == len(actual_list), ( + f"{name} length mismatch for scenario '{scenario_key}' (WS: {current_world_size}, Rank: {current_rank}). " + f"Expected {len(ref_list)} elements, but got {len(actual_list)}." + ) + + # --- Convert to NumPy arrays for efficient comparison --- + ref_arr = np.array(ref_list) + actual_arr = np.array(actual_list) + + # --- Check if all elements are close using np.allclose --- + # This is the primary assertion that will fail if any deviation is too large + if not np.allclose(ref_arr, actual_arr, atol=atol): + # If not all close, identify the specific deviations + deviated_indices = np.where(~np.isclose(ref_arr, actual_arr, atol=atol))[0] + deviation_details = [] + for idx in deviated_indices: + ref_val = ref_arr[idx] + actual_val = actual_arr[idx] + diff = actual_val - ref_val + deviation_details.append(f"Step {idx}: Ref={ref_val:.6f}, Actual={actual_val:.6f}, Diff={diff:.6f}") + + # Calculate max_diff + max_diff = np.max(np.abs(ref_arr - actual_arr)) + + # --- Report detailed deviation in the AssertionError --- + error_message = ( + f"{name} deviated too much for scenario '{scenario_key}' " + f"(WS: {current_world_size}, Rank: {current_rank}).\n" + f"Max Difference: {max_diff:.6f}, Allowed Tolerance: {atol:.6f}.\n" + f"Deviations found at {len(deviated_indices)} steps:\n" + "\n".join(deviation_details) + "\n" + f"Reference (first 10): {ref_list[:10]}...\n" + f"Actual (first 10): {actual_list[:10]}..." 
+ ) + assert False, error_message # Force the assertion to fail with the custom message + else: + # If all close, report success and max_diff for printing + max_diff = np.max(np.abs(ref_arr - actual_arr)) + print(f" ✅ {name} PASSED. Max Diff: {max_diff:.6f}") + + +def get_reference_metrics( + scenario_key, +): + reference_data = ref_data.REFERENCE_DATA.get(scenario_key) + if reference_data is None: + pytest.fail(f"Reference data for scenario '{scenario_key}' not found in REFERENCE_DATA.") + current_world_size = get_world_size() + current_rank = get_rank() + if current_world_size > 1: + rank_reference_data = reference_data.get("rank_data", {}).get(str(current_rank)) + if rank_reference_data is None: + pytest.fail(f"Reference data for rank {current_rank} not found in distributed scenario '{scenario_key}'.") + ref_train_losses = rank_reference_data["train_step_losses"] + ref_eval_losses = rank_reference_data["eval_step_losses"] + ref_train_metrics = rank_reference_data["train_step_metrics"] + ref_eval_metrics = rank_reference_data["eval_step_metrics"] + else: # Single device or world_size=1 + ref_train_losses = reference_data["train_step_losses"] + ref_eval_losses = reference_data["eval_step_losses"] + ref_train_metrics = reference_data["train_step_metrics"] + ref_eval_metrics = reference_data["eval_step_metrics"] + + all_ref_metrices = { + "ref_train_losses": ref_train_losses, + "ref_eval_losses": ref_eval_losses, + "ref_train_metrics": ref_train_metrics, + "ref_eval_metrics": ref_eval_metrics, + } + + all_config_spy = { + "current_world_size": current_world_size, + "current_rank": current_rank, + } + return all_ref_metrices, all_config_spy + + @dataclass class TestModelConfig: """Dataclass for test model configuration.""" @@ -83,9 +182,10 @@ class TestDatasetConfig: dataset_name: str hf_dataset_name: str hf_dataset_config: Optional[str] - prompt_template: str completion_template: str max_seq_length: int + prompt_template: Optional[str] = None + prompt_func: Optional[str] = None @dataclass @@ -121,7 +221,7 @@ class TestTrainingConfig: dataset_name="openai/gsm8k", hf_dataset_name=HF_DATASET_GSM8K, hf_dataset_config=HF_DATASET_GSM8K_CONFIG, - prompt_template="Question: {question}\nAnswer: ", + prompt_template="Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n", completion_template="{answer}", max_seq_length=TEST_MAX_SEQ_LENGTH_CAUSAL, ) @@ -129,8 +229,8 @@ class TestTrainingConfig: ALPACA_DATASET_CONFIG = TestDatasetConfig( dataset_name="yahma/alpaca-cleaned", hf_dataset_name=HF_DATASET_ALPACA, - hf_dataset_config=None, - prompt_template="Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", + hf_dataset_config=HF_DATASET_ALPACA_CONFIG, + prompt_func="QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt", completion_template="{output}", max_seq_length=TEST_MAX_SEQ_LENGTH_CAUSAL, ) @@ -181,7 +281,7 @@ def create_master_config( auto_class_name=auto_class_name, use_peft=model_config.use_peft, use_cache=False, - attn_implementation="eager", + attn_implementation="sdpa", device_map=None, peft_config=PeftConfig( lora_r=TEST_LORA_R, @@ -202,12 +302,13 @@ def create_master_config( max_seq_length=dataset_config.max_seq_length, train_batch_size=TEST_PER_DEVICE_BATCH_SIZE, eval_batch_size=TEST_PER_DEVICE_BATCH_SIZE, - prompt_template=dataset_config.prompt_template, + prompt_template=dataset_config.prompt_template if dataset_config.prompt_template else None, + prompt_func=dataset_config.prompt_func if dataset_config.prompt_func else None, completion_template=dataset_config.completion_template, - num_workers=1, - test_split="train", + test_split="test", config_name=dataset_config.hf_dataset_config, dataset_num_samples=TEST_DATASET_SUBSET_SIZE, + data_seed=TEST_SEED, ), optimizers=OptimizerConfig( optimizer_name="adamw", @@ -216,7 +317,6 @@ def create_master_config( ), scheduler=SchedulerConfig( scheduler_name="cosine", - warmup_steps=TEST_WARMUP_STEPS, ), training=TrainingConfig( type="sft", # Using the "type" field from TrainingConfig @@ -224,15 +324,18 @@ def create_master_config( num_train_epochs=TEST_NUM_TRAIN_EPOCHS, per_device_train_batch_size=TEST_PER_DEVICE_BATCH_SIZE, per_device_eval_batch_size=TEST_PER_DEVICE_BATCH_SIZE, + logging_strategy="steps", logging_steps=TEST_LOGGING_STEPS, - save_strategy="no", - eval_strategy="no", + gradient_accumulation_steps=1, + eval_strategy="steps", + max_steps=TEST_MAX_STEPS, + eval_steps=TEST_EVAL_STEPS, seed=TEST_SEED, ), ) -def run_training(trainer, config_name: str): +def run_training(trainer, config_name): """ Run training and return results. 
@@ -244,13 +347,15 @@ def run_training(trainer, config_name: str): Training result, Evaluation result """ logger.info(f"Starting training for {config_name}...") - train_result = trainer.train() + trainer.train() logger.info(f"Training completed for {config_name}!") - logger.info(f"Starting evaluation for {config_name}...") - eval_result = trainer.evaluate() - logger.info(f"Evaluation completed for {config_name}!") - - return train_result, eval_result + train_step_loss = [log["loss"] for log in trainer.state.log_history if "loss" in log] + eval_step_loss = [log["eval_loss"] for log in trainer.state.log_history if "eval_loss" in log] + train_step_metric = [math.exp(x) for x in train_step_loss] + eval_step_metric = [math.exp(x) for x in eval_step_loss] + final_train_loss = train_step_loss[-1] if train_step_loss else float("inf") + final_eval_loss = eval_step_loss[-1] if eval_step_loss else float("inf") + return final_eval_loss, final_train_loss, train_step_loss, eval_step_loss, train_step_metric, eval_step_metric def verify_training_results(train_result, eval_result): @@ -262,11 +367,10 @@ def verify_training_results(train_result, eval_result): eval_result: Evaluation result dictionary """ assert train_result is not None - assert hasattr(train_result, "training_loss") - assert "eval_loss" in eval_result - logger.info(f"Training loss: {train_result.training_loss:.4f}") - logger.info(f"Evaluation loss: {eval_result['eval_loss']:.4f}") - assert abs(train_result.training_loss - eval_result["eval_loss"]) < TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD + assert eval_result is not None + logger.info(f"Training loss: {train_result:.4f}") + logger.info(f"Evaluation loss: {eval_result:.4f}") + assert abs(train_result - eval_result) < TRAIN_EVAL_EPOCH_LOSS_DIFF_THRESHOLD def run_inference_causal_lm(model, tokenizer): @@ -314,21 +418,23 @@ def teardown_method(self): logger.warning(f"Warning: Failed to clean up {self.test_output_dir}: {e}") @pytest.mark.parametrize( - "dataset_config,config_name", + "dataset_config,config_name,scenario_key", [ pytest.param( GSM8K_DATASET_CONFIG, "llama_3.2_1B_gsm8k", + "llama_3.2_1B_config_gsm8k_single_device", id="llama_gsm8k", ), pytest.param( ALPACA_DATASET_CONFIG, "llama_3.2_1B_alpaca", + "llama_3.2_1B_config_alpaca_single_device", id="llama_alpaca", ), ], ) - def test_llama_causal_lm(self, dataset_config: TestDatasetConfig, config_name: str): + def test_llama_causal_lm(self, dataset_config: TestDatasetConfig, scenario_key: str, config_name: str): """ Test Llama model with different datasets for causal language modeling. 
@@ -343,9 +449,6 @@ def test_llama_causal_lm(self, dataset_config: TestDatasetConfig, config_name: s output_dir=self.test_output_dir, ) config_manager = ConfigManager(master_config) - model_config = config_manager.get_model_config() - # for fast testing - model_config["num_hidden_layers"] = TEST_NUM_HIDDEN_LAYERS pipeline = FineTuningPipeline(config_manager) model, tokenizer = pipeline.get_model_and_tokenizer() trainer = pipeline.get_trainer() @@ -359,10 +462,49 @@ def test_llama_causal_lm(self, dataset_config: TestDatasetConfig, config_name: s total_params = sum(p.numel() for p in model.parameters()) logger.info(f"Total parameters: {total_params:,}") # Run training - train_result, eval_result = run_training(trainer, config_name) - - # Verify training results - verify_training_results(train_result, eval_result) + final_eval_loss, final_train_loss, train_step_loss, eval_step_loss, train_step_metric, eval_step_metric = ( + run_training(trainer, config_name) + ) + all_ref_metrices, all_config_spy = get_reference_metrics(scenario_key) + verify_training_results(final_train_loss, final_eval_loss) + run_inference_causal_lm(model, tokenizer) # Test inference - run_inference_causal_lm(model, tokenizer) + # Assertions for step-level values using the helper function + assert_list_close( + all_ref_metrices["ref_train_losses"], + train_step_loss, + LOSS_ATOL, + "Train Step Losses", + scenario_key, + all_config_spy["current_world_size"], + all_config_spy["current_rank"], + ) + assert_list_close( + all_ref_metrices["ref_eval_losses"], + eval_step_loss, + LOSS_ATOL, + "Eval Step Losses", + scenario_key, + all_config_spy["current_world_size"], + all_config_spy["current_rank"], + ) + assert_list_close( + all_ref_metrices["ref_train_metrics"], + train_step_metric, + METRIC_ATOL, + "Train Step Metrics", + scenario_key, + all_config_spy["current_world_size"], + all_config_spy["current_rank"], + ) + assert_list_close( + all_ref_metrices["ref_eval_metrics"], + eval_step_metric, + METRIC_ATOL, + "Eval Step Metrics", + scenario_key, + all_config_spy["current_world_size"], + all_config_spy["current_rank"], + ) + clean_up("qaic-dumps") From e9e7a7f52b1d1f02646e4f6c8e5ee44582df316f Mon Sep 17 00:00:00 2001 From: Swati Allabadi Date: Tue, 31 Mar 2026 15:50:01 +0530 Subject: [PATCH 17/23] [QEff. 
Finetuning]: Updating PP documentation (#899) Updating PP CLI command as per latest changes in config manager In future, this command should also be updated if any changes are done in single SOC CLI command Signed-off-by: Swati Allabadi Co-authored-by: Swati Allabadi --- docs/source/hf_finetune.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md index b73cdc50f1..a2c74c060d 100644 --- a/docs/source/hf_finetune.md +++ b/docs/source/hf_finetune.md @@ -309,10 +309,12 @@ where user can configure `pp_degree: 2` under `training:` section for the input **PP only — single process, 2 stages (via CLI flags)** ```bash -python -m QEfficient.cloud.finetune_experimental \ - --model_name meta-llama/Llama-3.2-1B \ - --device qaic \ - --pp_degree 2 +QAIC_VISIBLE_DEVICES=0,1 python -m QEfficient.cloud.finetune_experimental \ +--device qaic --lora_r 16 --target_modules q_proj, v_proj \ +--gradient_checkpointing True --dataset_name "yahma/alpaca-cleaned" \ +--completion_template {output} \ +--prompt_func QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt \ +--pp_degree 2 ``` From ee92e08fb6f6877bd1ff879cff758c23fa93fa40 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Tue, 31 Mar 2026 08:57:08 +0000 Subject: [PATCH 18/23] Initial commit: Adding TP+DDP support in hf trainer stack Signed-off-by: Sharvari Medhe --- QEfficient/cloud/finetune_experimental.py | 53 +++++++- .../experimental/core/component_registry.py | 2 +- .../experimental/core/config_manager.py | 24 ++-- .../experimental/core/utils/tp_peft_utils.py | 119 ++++++++++++++++++ .../core/utils/training_config_utils.py | 13 ++ QEfficient/transformers/cache_utils.py | 2 + .../transformers/quantizers/quantizer_awq.py | 2 + 7 files changed, 202 insertions(+), 13 deletions(-) create mode 100644 QEfficient/finetune/experimental/core/utils/tp_peft_utils.py diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index ce024828c8..45b7feedea 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -14,6 +14,9 @@ from pathlib import Path from typing import Any, Dict, List, Tuple +import torch +from accelerate.utils import ParallelismConfig + from QEfficient.finetune.experimental.core.callbacks import replace_progress_callback from QEfficient.finetune.experimental.core.component_registry import ComponentFactory from QEfficient.finetune.experimental.core.config_manager import ( @@ -26,6 +29,7 @@ from QEfficient.finetune.experimental.core.trainer import sft_trainer # noqa: F401 from QEfficient.finetune.experimental.core.utils.device_map_utils import get_device_map from QEfficient.finetune.experimental.core.utils.peft_utils import convert_peft_config_to_lora_config +from QEfficient.finetune.experimental.core.utils.tp_peft_utils import apply_peft_to_model from QEfficient.finetune.experimental.core.utils.training_config_utils import prepare_training_config logger = Logger(__name__) @@ -161,8 +165,11 @@ def _create_model(self) -> Any: model_name = model_config.pop("model_name") # Get training config for PP settings - training_config = self.config.training - pp_degree = training_config.get("pp_degree", 1) + # training_config = self.config.training + training_config = self.training_config + + pp_degree = self.training_config.get("pp_degree", 1) + device = training_config.get("device", "qaic") # Generate device_map for pipeline parallelism if pp_degree > 1 @@ -176,11 
+183,39 @@ def _create_model(self) -> Any: model_config["device_map"] = device_map logger.log_rank_zero(f"Pipeline Parallelism enabled: Using device_map for {pp_degree} stages") + tp_degree = training_config.pop("tp_degree", 1) + + if tp_degree > 1: + pc = training_config.get("parallelism_config") + if not isinstance(pc, ParallelismConfig): + raise TypeError(f"Expected ParallelismConfig, got {type(pc).__name__}") + device_mesh = pc.build_device_mesh(device) + tp_mesh = device_mesh["tp"] + model_config["tp_plan"] = "auto" + model_config["tp_size"] = tp_degree + model_config["device_mesh"] = tp_mesh + # Filter out PEFT-related fields, these shouldn't be passed to model creation excluded_keys = {"use_peft", "peft_config"} model_config_kwargs = {k: v for k, v in model_config.items() if k not in excluded_keys} model_instance = ComponentFactory.create_model(model_type, model_name, **model_config_kwargs) + + if tp_degree > 1: + # Need to explicitly untie the embedding weights here to consider + # this as separate params in further TP processing + model_instance.model.lm_head.weight = torch.nn.Parameter(model_instance.model.lm_head.weight.clone()) + peft_config = None + if model_config.get("use_peft", False): + peft_config_dataclass = model_config.get("peft_config") + if peft_config_dataclass is not None: + peft_config = convert_peft_config_to_lora_config(peft_config_dataclass) + # Apply PEFT to the model and include PEFT layers in TP plan + + model_instance.model = apply_peft_to_model( + model_instance.model, tp_mesh=tp_mesh, peft_config=peft_config + ) + return model_instance def _create_optimizer(self) -> Tuple[Any, Dict[str, Any]]: @@ -245,15 +280,18 @@ def _create_trainer( # Get PEFT config if enabled model_config_dict = self.config_manager.get_model_config() peft_config = None - if model_config_dict.get("use_peft", False): + if model_config_dict.get("use_peft", False) and not ( + self.config_manager.config.training.get("tp_degree", 1) > 1 + ): peft_config_dataclass = model_config_dict.get("peft_config") if peft_config_dataclass is not None: peft_config = convert_peft_config_to_lora_config(peft_config_dataclass) # Build dependencies for trainer configuration dependencies = {} - if peft_config is not None: + if peft_config is not None and not (self.config_manager.config.training.get("tp_degree", 1) > 1): dependencies["peft_config"] = peft_config + trainer_cls, args_cls, additional_kwargs = ComponentFactory.create_trainer_config(trainer_type, **dependencies) # Clean up training config: remove fields that shouldn't be passed to TrainingArguments @@ -264,6 +302,13 @@ def _create_trainer( # Remove PP-specific fields as they're handled via device_map in model loading training_config.pop("pp_degree", None) + training_config.pop("tp_degree", None) + training_config.pop("ddp_degree", None) + + # Before constructing SFTConfig/TrainingArguments + if training_config.get("report_to") is None: + training_config["report_to"] = "tensorboard" + # Create trainer arguments instance args = args_cls(**training_config) dataset_config_dict = self.config_manager.get_dataset_config() diff --git a/QEfficient/finetune/experimental/core/component_registry.py b/QEfficient/finetune/experimental/core/component_registry.py index 59bd3598dd..5a97c9df67 100644 --- a/QEfficient/finetune/experimental/core/component_registry.py +++ b/QEfficient/finetune/experimental/core/component_registry.py @@ -225,7 +225,7 @@ def create_trainer_config(name: str, **dependencies) -> tuple: for kwarg, default in 
config["required_kwargs"].items(): if kwarg in dependencies: additional_kwargs[kwarg] = dependencies[kwarg] - elif default != "REQUIRED": + elif default != "REQUIRED" and not isinstance(default, type): additional_kwargs[kwarg] = default # Check for missing required arguments diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 9846c91944..58adba82d9 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -242,10 +242,10 @@ class ModelConfig: default="AutoModelForCausalLM", metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."}, ) - load_in_4bit: bool = field( - default=False, - metadata={"help": "Whether to load the model in 4-bit quantization."}, - ) + # load_in_4bit: bool = field( + # default=False, + # metadata={"help": "Whether to load the model in 4-bit quantization."}, + # ) use_peft: bool = field( default=True, metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."}, @@ -330,10 +330,10 @@ class TrainingConfig: default="./training_results", metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, ) - overwrite_output_dir: bool = field( - default=False, - metadata={"help": "Whether to overwrite the output directory."}, - ) + # overwrite_output_dir: bool = field( + # default=False, + # metadata={"help": "Whether to overwrite the output directory."}, + # ) seed: int = field( default=42, metadata={"help": "Random seed for reproducibility."}, @@ -476,6 +476,14 @@ class TrainingConfig: default=1, metadata={"help": "Pipeline parallelism degree (number of pipeline stages). Set > 1 to enable PP."}, ) + tp_degree: int = field( + default=1, + metadata={"help": "Tensor parallelism degree (number of pipeline stages). Set > 1 to enable TP."}, + ) + ddp_degree: int = field( + default=1, + metadata={"help": "Data parallelism degree (number of pipeline stages). Set > 1 to enable DDP."}, + ) @dataclass diff --git a/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py b/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py new file mode 100644 index 0000000000..139fa4fee5 --- /dev/null +++ b/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py @@ -0,0 +1,119 @@ +import torch +from peft import get_peft_model +from transformers.integrations.tensor_parallel import ( + ALL_PARALLEL_STYLES, + distribute_model, + replace_layer_number_by_wildcard, +) + + +def print_trainable_parameters(model) -> None: + """ + Print the number of trainable parameters, all params and percentage of trainablke params. + Args: + model: The PyTorch model. + """ + trainable_params, all_param = model.get_nb_trainable_parameters() + print( + f"Trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) + + +def is_rowwise_parallel(param: torch.distributed.tensor.DTensor) -> bool: + """Check if a DTensor is row-wise parallel.""" + if not isinstance(param, torch.distributed.tensor.DTensor): + return False + placements = param.placements + if len(placements) != 1: + # Till now only TP is applied. If DP is also applied, then the placements might be of len 2. 
+ return False + tp_placement = placements[0] + return tp_placement.is_shard() and tp_placement.dim == 0 # Row-wise sharding + + +def is_colwise_parallel(param: torch.distributed.tensor.DTensor) -> bool: + """Check if a DTensor is column-wise parallel.""" + if not isinstance(param, torch.distributed.tensor.DTensor): + return False + placements = param.placements + if len(placements) != 1: + # Till now only TP is applied. If DP is also applied, then the placements might be of len 2. + return False + tp_placement = placements[0] + return tp_placement.is_shard() and tp_placement.dim == 1 # Column-wise sharding + + +def update_peft_tp_plan(model): + # If original layer has colwise then Lora-A --> colwise and Lora-B --> rowwise + # If original layer has rowwise then Lora-A --> rowwise and Lora-B --> colwise + peft_tp_plan = {} + for name, schema in model.tp_plan.items(): + lora_a_name = "base_model.model." + name + ".lora_A.default" + lora_b_name = "base_model.model." + name + ".lora_B.default" + if schema == "rowwise": + peft_tp_plan[lora_a_name] = "rowwise" + peft_tp_plan[lora_b_name] = "colwise" + elif schema == "colwise": + peft_tp_plan[lora_a_name] = "colwise" + peft_tp_plan[lora_b_name] = "lora_rowwise" + model.tp_plan.update(peft_tp_plan) + + +def apply_tp_modification_for_peft(model, tp_mesh=None): + if tp_mesh is None: + return + + state_dict = model.state_dict() + for name, param in state_dict.items(): + if ("lora_A.default" in name) or ("lora_B.default" in name): + name_for_tp = name.replace(".weight", "") + name_for_tp = replace_layer_number_by_wildcard(name_for_tp) + if name_for_tp not in model.tp_plan: + raise RuntimeError(f"{name_for_tp} not found in model.tp_plan. Please include PEFT layers in tp_plan.") + lora_plan = model.tp_plan[name_for_tp] + + empty_param = param.clone().to(device="meta") + tp_layer_cls = ALL_PARALLEL_STYLES[lora_plan].__class__ + tp_layer = tp_layer_cls( + device_mesh=tp_mesh, + rank=tp_mesh.get_local_rank(), + empty_param=empty_param.clone(), + ) + module_path, _, param_name = name.rpartition(".") + module_obj = model.get_submodule(module_path) + + # prepare_module_tp does same thing as distribute_model. Hence commented out. + # Ideal order of opeartion would be prepare_module_tp followed by shard_tensor based on what HF's tensor parallel code. + # tp_layer.prepare_module_tp(module_obj, tp_mesh) + + # Shard the param + tp_layer.shard_tensor(param, tensor_idx=None, dtype=empty_param.dtype) + setattr(getattr(module_obj, param_name), "data", param) + + +def apply_peft_to_model(model, tp_mesh=None, peft_config=None): + peft_config = peft_config + # Add PEFT adapters to the model + model = get_peft_model(model, peft_config) + print_trainable_parameters(model) + + if tp_mesh is None: + return + + # Include PEFT parameters in TP plan and update model.tp_plan inplace. + update_peft_tp_plan(model) + + # Register pre-forward and post-forward hooks to convert input/output DTensor + # to tensor and vice-versa. 
+ distribute_model( + model, + tp_plan=model.tp_plan, + distributed_config=None, + device_mesh=tp_mesh, + tp_size=tp_mesh.size(), + ) + + # Convert PEFT weights from torch.Tensor to DTensor and apply TP modifications + apply_tp_modification_for_peft(model, tp_mesh) + + return model diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py index 1cd6704e44..b13a0692c2 100644 --- a/QEfficient/finetune/experimental/core/utils/training_config_utils.py +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -11,6 +11,8 @@ from typing import Any, Dict +from accelerate.utils import ParallelismConfig + from QEfficient.finetune.experimental.core.config_manager import ConfigManager @@ -34,6 +36,17 @@ def prepare_training_config( # Handle dtype conversion # To do: (For Tanisha) Check if torch_dtype should rather be added directly in model_config only in config_manager.py + # TODO: Add PC here + parallelism_config = {} + if training_config.get("tp_degree", 1) > 1: + parallelism_config["tp_size"] = training_config["tp_degree"] + if training_config.get("ddp_degree", 1) > 1: + parallelism_config["dp_replicate_size"] = training_config["ddp_degree"] + + if parallelism_config: # Only inject if at least one parallelism dimension is active + pc = ParallelismConfig(**parallelism_config) + training_config["parallelism_config"] = pc + torch_dtype = training_config.pop("torch_dtype", None) if torch_dtype is None: raise ValueError("'torch_dtype' field is required in training configuration. Expected one of: ['fp16', 'bf16']") diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 0e1118407a..faba40ac8d 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,6 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple import torch + +# from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( diff --git a/QEfficient/transformers/quantizers/quantizer_awq.py b/QEfficient/transformers/quantizers/quantizer_awq.py index ef8a03521f..5cb4ef8438 100644 --- a/QEfficient/transformers/quantizers/quantizer_awq.py +++ b/QEfficient/transformers/quantizers/quantizer_awq.py @@ -7,6 +7,8 @@ import torch from transformers.quantizers.quantizer_awq import AwqQuantizer + +# from transformers.utils.quantization_config import AwqConfig from transformers.utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion from QEfficient.transformers.quantizers.awq import WQLinear_GEMM From 7c9965dbfbb48a1a95031ef6f6f43b6cbc25f618 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 6 Apr 2026 08:21:09 +0000 Subject: [PATCH 19/23] adding back the AOT stack imports Signed-off-by: Sharvari Medhe --- QEfficient/cloud/finetune_experimental.py | 1 - .../finetune/experimental/core/utils/training_config_utils.py | 1 - QEfficient/transformers/cache_utils.py | 2 -- QEfficient/transformers/quantizers/quantizer_awq.py | 2 -- 4 files changed, 6 deletions(-) diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 45b7feedea..1d2c88fdc8 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -211,7 +211,6 @@ def _create_model(self) -> Any: 
if peft_config_dataclass is not None: peft_config = convert_peft_config_to_lora_config(peft_config_dataclass) # Apply PEFT to the model and include PEFT layers in TP plan - model_instance.model = apply_peft_to_model( model_instance.model, tp_mesh=tp_mesh, peft_config=peft_config ) diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py index b13a0692c2..ab71925901 100644 --- a/QEfficient/finetune/experimental/core/utils/training_config_utils.py +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -35,7 +35,6 @@ def prepare_training_config( # Handle dtype conversion # To do: (For Tanisha) Check if torch_dtype should rather be added directly in model_config only in config_manager.py - # TODO: Add PC here parallelism_config = {} if training_config.get("tp_degree", 1) > 1: diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index faba40ac8d..0e1118407a 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -10,8 +10,6 @@ from typing import Any, Dict, List, Optional, Tuple import torch - -# from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache from transformers.cache_utils import DynamicCache, DynamicLayer, EncoderDecoderCache, HybridCache, HybridChunkedCache from QEfficient.customop import ( diff --git a/QEfficient/transformers/quantizers/quantizer_awq.py b/QEfficient/transformers/quantizers/quantizer_awq.py index 5cb4ef8438..ef8a03521f 100644 --- a/QEfficient/transformers/quantizers/quantizer_awq.py +++ b/QEfficient/transformers/quantizers/quantizer_awq.py @@ -7,8 +7,6 @@ import torch from transformers.quantizers.quantizer_awq import AwqQuantizer - -# from transformers.utils.quantization_config import AwqConfig from transformers.utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion from QEfficient.transformers.quantizers.awq import WQLinear_GEMM From 96b71de3096073f62670bf43b8a66289399c32c4 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 6 Apr 2026 10:33:18 +0000 Subject: [PATCH 20/23] adding readme and config files for tp+ddp Signed-off-by: Sharvari Medhe --- QEfficient/cloud/finetune_experimental.py | 20 ++- .../configs/sft_tp_ddp_gsm8k_config.yaml | 52 ++++++++ .../configs/sft_tp_gsm8k_config.yaml | 52 ++++++++ docs/source/hf_finetune.md | 124 ++++++++++++++++++ 4 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml create mode 100644 QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 1d2c88fdc8..0a89554069 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -15,6 +15,7 @@ from typing import Any, Dict, List, Tuple import torch +import torch.distributed as dist from accelerate.utils import ParallelismConfig from QEfficient.finetune.experimental.core.callbacks import replace_progress_callback @@ -63,7 +64,9 @@ def __init__(self, config_manager: ConfigManager): # Prepare training configuration self.training_config = prepare_training_config(config_manager=self.config_manager) - + self.tp_enabled = self.training_config["tp_degree"] > 1 + if self.tp_enabled: + self._initialize_dist_tp() # Create datasets logger.log_rank_zero("Creating datasets...") 
self.train_dataset, self.eval_dataset = self._create_datasets() @@ -115,6 +118,21 @@ def _setup_environment(self) -> None: os.environ["TRACKIO_DIR"] = str(self.output_dir / "trackio_logs") os.environ["TENSORBOARD_LOGGING_DIR"] = str(self.output_dir) + def _initialize_dist_tp(self): + WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1")) + LOCAL_RANK = int(os.getenv("LOCAL_RANK", "0")) + + if self.training_config["device"] == "cuda": + backend = "nccl" + else: + backend = "cpu:gloo,qaic:qccl" + + dist.init_process_group( + backend=backend, # "nccl" for GPUs, "gloo" for CPUs + world_size=WORLD_SIZE, # total number of processes + rank=LOCAL_RANK, # unique ID for this process + ) + def _create_datasets(self) -> Tuple[Any, Any]: """ Create training and evaluation datasets. diff --git a/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml new file mode 100644 index 0000000000..215c7d5785 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml @@ -0,0 +1,52 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 # LoRA rank + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub + prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields + completion_template: "{answer}" # Model will be trained on this part. + config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling + + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + tp_degree: 2 + ddp_degree: 2 + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 1e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml new file mode 100644 index 0000000000..78d5caf1d9 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml @@ -0,0 +1,52 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 # LoRA rank + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "openai/gsm8k" # Dataset name from Hugging Face Hub + prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields + completion_template: "{answer}" # Model will be trained on this part. + config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling + + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + tp_degree: 2 + ddp_degree: 1 + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 1e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md index a2c74c060d..e5aadb3a7f 100644 --- a/docs/source/hf_finetune.md +++ b/docs/source/hf_finetune.md @@ -324,6 +324,130 @@ QAIC_VISIBLE_DEVICES=0,1 python -m QEfficient.cloud.finetune_experimental \ *** +### Tensor Parallelism (TP) *(Experimental)* + +Tensor Parallelism splits individual layers (e.g., weight matrices) across multiple devices, allowing large models to be trained by distributing the computation within each layer rather than across layers. + +#### How it works + +TP is controlled by a single parameter: **`tp_degree`**. + +| `tp_degree` value | Behaviour | +|---|---| +| `1` (default) | TP disabled — standard single-device training | +| `> 1` | Each layer is split across `tp_degree` devices | + +When `tp_degree > 1` the framework distributes tensor operations (e.g., matrix multiplications in attention and MLP blocks) across devices, with each device holding a shard of the weights. + +#### Configuration parameter + +Add `tp_degree` under the `training` section of your YAML config or pass it as a CLI flag. + +```yaml +# training section of your config YAML +training: + device: "qaic" # or "cuda" + tp_degree: 2 # split tensors across 2 devices +``` + +#### Installation Setup + +> **Note:** TP requires a specific set of dependencies. Follow the steps below carefully. + +```bash +python -m venv finetune_env +source finetune_env/bin/activate + +git clone https://github.com/quic/efficient-transformers.git +cd efficient-transformers +git checkout ft_experimental # Can remove this once merged to mainline +pip install -e . 
+
+pip install \
+    --index-url https://download.pytorch.org/whl/cpu \
+    --extra-index-url https://devpi.qualcomm.com/qcom/dev/+simple \
+    --trusted-host devpi.qualcomm.com \
+    "torch==2.9.1+cpu" \
+    "torchvision==0.24.1+cpu" \
+    "torchaudio==2.9.1+cpu"
+
+cd .. && git clone https://github.com/smedhe/transformers.git
+cd transformers
+git checkout v5.1.0_release_hf_stack_tp_ddp && pip install -e .
+
+cd .. && git clone https://github.com/smedhe/accelerate.git
+cd accelerate
+git checkout qaic_support_accel_23_02 && pip install -e .
+
+pip install trl==0.22.0
+pip install datasets==4.5.0
+pip install -U fsspec
+cd .. && cd efficient-transformers
+```
+
+#### Launch command
+
+**TP only — 2 devices, 2 processes (via YAML)**
+
+```bash
+QAIC_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node=2 QEfficient/cloud/finetune_experimental.py \
+    QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml
+```
+
+> **Note:** `tp_degree` must be ≤ the number of locally available devices. Each process handles one TP shard, so `nproc-per-node` must equal `tp_degree`.
+
+---
+
+### Tensor Parallelism + Distributed Data Parallel (TP + DDP) *(Experimental)*
+
+TP + DDP combines tensor parallelism with data parallelism. Each TP group holds one sharded copy of the model, while DDP replicates those TP groups across separate process groups to train on larger effective batches simultaneously.
+
+#### How it works
+
+TP + DDP is controlled by two parameters: **`tp_degree`** and **`ddp_degree`** (the number of DDP replicas).
+
+| Configuration | Behaviour |
+|---|---|
+| `tp_degree = 1`, `ddp_degree > 1` | Pure DDP |
+| `tp_degree > 1`, `ddp_degree = 1` | Pure TP |
+| `tp_degree > 1`, `ddp_degree > 1` | TP + DDP |
+
+- Each process drives one device, so the total devices consumed per node is `tp_degree × ddp_degree`, which must equal `LOCAL_WORLD_SIZE` (the number of processes per node).
+- For example, with `tp_degree=2` and `nproc-per-node=4`: 2 TP shards × 2 DDP replicas = 4 devices total.
+
+#### Configuration parameter
+
+```yaml
+# training section of your config YAML
+training:
+  device: "qaic"   # or "cuda"
+  tp_degree: 2     # tensor parallel degree
+  ddp_degree: 2    # number of data-parallel replicas
+```
+
+#### Installation Setup
+
+> **Note:** TP + DDP uses the same installation setup as TP. Please follow the [TP Installation Setup](#installation-setup) steps before proceeding.
+
+#### Launch command
+
+**TP + DDP — 4 devices, 4 processes (via YAML)**
+
+```bash
+QAIC_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc-per-node=4 QEfficient/cloud/finetune_experimental.py \
+    QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml
+```
+
+> **Note:** `nproc-per-node` must equal `tp_degree × ddp_degree`.
+
+---
+
+#### Notes
+
+- TP and TP + DDP are currently **experimental features** and are primarily verified for **Llama-family** models.
+- Other architectures with different layer naming conventions may require adjustments.
+- These features require the custom `transformers` and `accelerate` forks listed in the installation setup above.
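The degree arithmetic above can be sanity-checked in a few lines. A minimal sketch, assuming the `ParallelismConfig` API from the `accelerate` fork used in this series (`tp_size`, `dp_replicate_size`, `build_device_mesh`), mirroring how `prepare_training_config` and `_create_model` wire it up; it is meant to run under `torchrun --nproc-per-node=4`:

```python
import os

from accelerate.utils import ParallelismConfig

tp_degree, ddp_degree = 2, 2

# One device per process: torchrun must be launched with
# --nproc-per-node == tp_degree * ddp_degree.
world_size = int(os.getenv("WORLD_SIZE", "1"))
assert world_size == tp_degree * ddp_degree, (
    f"WORLD_SIZE={world_size} must equal tp_degree * ddp_degree = {tp_degree * ddp_degree}"
)

# Build the 2-D device mesh and pull out the TP sub-mesh, as _create_model does.
pc = ParallelismConfig(tp_size=tp_degree, dp_replicate_size=ddp_degree)
device_mesh = pc.build_device_mesh("qaic")  # or "cuda"
tp_mesh = device_mesh["tp"]  # handed to the model loader via model_config["device_mesh"]
```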
+ +*** ## To run the Finetune project tests Install following plugins: From 0e02eea38f16771086ed904fa5af384259c3ee2d Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Thu, 16 Apr 2026 14:03:50 +0530 Subject: [PATCH 21/23] adding test cases for tp and ddp, along with other fixes Signed-off-by: Sharvari Medhe --- QEfficient/cloud/finetune_experimental.py | 1 + .../experimental/configs/sft_ddp_config.yaml | 2 + .../sft_single_device_gsm8k_config.yaml | 2 + .../configs/sft_tp_ddp_gsm8k_config.yaml | 15 +- .../experimental/core/config_manager.py | 84 ++++++- .../experimental/core/utils/tp_peft_utils.py | 51 ++-- .../core/utils/training_config_utils.py | 2 + .../experimental/tests/test_config_manager.py | 76 ++++++ .../tests/test_tensor_parallel.py | 218 ++++++++++++++++++ 9 files changed, 432 insertions(+), 19 deletions(-) create mode 100644 QEfficient/finetune/experimental/tests/test_tensor_parallel.py diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 0a89554069..c07a14ff01 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -77,6 +77,7 @@ def __init__(self, config_manager: ConfigManager): model_instance = self._create_model() self.model = model_instance.model self.tokenizer = model_instance.tokenizer + except Exception as e: logger.log_rank_zero(f"Failed to load model: {e}", level=logging.ERROR) # Cleanup datasets if already created diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml index f4059aa1e4..18af7d8916 100644 --- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml @@ -27,6 +27,7 @@ dataset: completion_template: "{answer}" # Model will be trained on this part. config_name: "main" # Config name for the dataset data_seed: 42 # Random seed for dataset shuffling, for deterministic shuffling and reproducibility + dataset_num_samples: 1000 @@ -36,6 +37,7 @@ training: gradient_accumulation_steps: 1 # Number of steps to accumulate gradients per_device_train_batch_size: 1 # Batch size per device during training torch_compile: False # Whether to use torch.compile + ddp_degree: 4 ddp_config: # DDP configuration ddp_backend: "qccl" ddp_find_unused_parameters: False diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml index 9391fb0bd6..20caa15b83 100644 --- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml @@ -26,11 +26,13 @@ dataset: completion_template: "{answer}" # Model will be trained on this part. 
config_name: "main" # Config name for the dataset data_seed: 42 # Random seed for dataset shuffling + dataset_num_samples: 1000 # Training configuration training: type: "sft" + output_dir: "./training_result_single_device" gradient_accumulation_steps: 1 # Number of steps to accumulate gradients per_device_train_batch_size: 1 # Batch size per device during training num_train_epochs: 1 diff --git a/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml index 215c7d5785..e55fa5a4fd 100644 --- a/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_tp_ddp_gsm8k_config.yaml @@ -8,13 +8,13 @@ model: model_type: "hf" # Hugging Face model auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with - model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + model_name: "meta-llama/Llama-3.2-1b" # Pretrained model name use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) peft_config: lora_r: 8 # LoRA rank lora_alpha: 16 lora_dropout: 0 - target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj"] # Target modules for LoRA + target_modules: ["q_proj", "v_proj", "k_proj"] # Target modules for LoRA task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. peft_type: "LORA" # Options: LORA, IA3, etc. @@ -26,6 +26,7 @@ dataset: completion_template: "{answer}" # Model will be trained on this part. config_name: "main" # Config name for the dataset data_seed: 42 # Random seed for dataset shuffling + dataset_num_samples: 100 # Training configuration @@ -37,11 +38,21 @@ training: torch_compile: False # Whether to use torch.compile tp_degree: 2 ddp_degree: 2 + pp_degree: 1 + device: "qaic" + ddp_config: # DDP configuration + ddp_backend: "qccl" + ddp_find_unused_parameters: False + ddp_bucket_cap_mb: 25 + ddp_broadcast_buffers: True + ddp_timeout: 1800 + # Optimizer configuration optimizers: optimizer_name: "adamw" lr: 1e-4 + scheduler: scheduler_name: "cosine" diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 58adba82d9..713734fade 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -805,13 +805,95 @@ def validate_config(self) -> None: self._push(errors, training.get("logging_steps", 0) < 0, "training.logging_steps must be >= 0.") self._push(errors, training.get("save_total_limit", 0) < 0, "training.save_total_limit must be >= 0.") - # Pipeline Parallelism (PP) config + # Pipeline / Tensor / Data parallelism config pp_degree = training.get("pp_degree", 1) + tp_degree = training.get("tp_degree", 1) + ddp_degree = training.get("ddp_degree", 1) + self._push( errors, not isinstance(pp_degree, int) or pp_degree < 1, "training.pp_degree must be a positive integer (default 1 = no PP; > 1 enables PP).", ) + self._push( + errors, + not isinstance(tp_degree, int) or tp_degree < 1, + "training.tp_degree must be a positive integer (default 1 = no TP; > 1 enables TP).", + ) + self._push( + errors, + not isinstance(ddp_degree, int) or ddp_degree < 1, + "training.ddp_degree must be a positive integer (default 1 = no DDP; > 1 enables DDP).", + ) + + # Supported modes: + # - PP only + # - DDP only (single-server / multi-server) + # - TP only (single-server) + # - TP + DDP (single-server) + if isinstance(pp_degree, int) and isinstance(tp_degree, int) and 
isinstance(ddp_degree, int): + self._push( + errors, + pp_degree > 1 and tp_degree > 1, + "Unsupported parallelism combination: TP cannot be combined with PP. " + "Supported modes are PP only, DDP only, TP only, or TP+DDP (single-server).", + ) + self._push( + errors, + pp_degree > 1 and ddp_degree > 1, + "Unsupported parallelism combination: DDP cannot be combined with PP. " + "Supported modes are PP only, DDP only, TP only, or TP+DDP (single-server).", + ) + + # WORLD_SIZE consistency checks (when launched in distributed mode) + if "WORLD_SIZE" in os.environ: + try: + world_size = int(os.environ["WORLD_SIZE"]) + except ValueError: + world_size = -1 + + self._push( + errors, + world_size < 1, + f"Invalid WORLD_SIZE={os.environ.get('WORLD_SIZE')!r}; expected a positive integer.", + ) + + if ( + world_size > 0 + and isinstance(pp_degree, int) + and isinstance(tp_degree, int) + and isinstance(ddp_degree, int) + ): + expected_world_size = pp_degree * tp_degree * ddp_degree + self._push( + errors, + expected_world_size != world_size, + "Parallelism degree mismatch: pp_degree * tp_degree * ddp_degree " + f"must equal WORLD_SIZE ({pp_degree} * {tp_degree} * {ddp_degree} = {expected_world_size}, " + f"WORLD_SIZE={world_size}).", + ) + + local_world_size_raw = os.environ.get("LOCAL_WORLD_SIZE") + if local_world_size_raw is not None: + try: + local_world_size = int(local_world_size_raw) + except ValueError: + local_world_size = -1 + + self._push( + errors, + local_world_size < 1, + f"Invalid LOCAL_WORLD_SIZE={local_world_size_raw!r}; expected a positive integer.", + ) + + if local_world_size > 0 and world_size > 0: + multi_server = world_size > local_world_size + self._push( + errors, + multi_server and tp_degree > 1, + "Unsupported parallelism combination: TP and TP+DDP are supported only on a single server. " + "Detected multi-server launch from WORLD_SIZE > LOCAL_WORLD_SIZE.", + ) # DDP config ddp = training.get("ddp_config", {}) diff --git a/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py b/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py index 139fa4fee5..a3be162bec 100644 --- a/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py +++ b/QEfficient/finetune/experimental/core/utils/tp_peft_utils.py @@ -1,5 +1,4 @@ import torch -from peft import get_peft_model from transformers.integrations.tensor_parallel import ( ALL_PARALLEL_STYLES, distribute_model, @@ -8,12 +7,8 @@ def print_trainable_parameters(model) -> None: - """ - Print the number of trainable parameters, all params and percentage of trainablke params. - Args: - model: The PyTorch model. - """ - trainable_params, all_param = model.get_nb_trainable_parameters() + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + all_param = sum(p.numel() for p in model.parameters()) print( f"Trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" ) @@ -44,18 +39,37 @@ def is_colwise_parallel(param: torch.distributed.tensor.DTensor) -> bool: def update_peft_tp_plan(model): - # If original layer has colwise then Lora-A --> colwise and Lora-B --> rowwise - # If original layer has rowwise then Lora-A --> rowwise and Lora-B --> colwise peft_tp_plan = {} - for name, schema in model.tp_plan.items(): - lora_a_name = "base_model.model." + name + ".lora_A.default" - lora_b_name = "base_model.model." 
+ name + ".lora_B.default" + state_dict_keys = set(model.state_dict().keys()) + + for name, schema in list(model.tp_plan.items()): + lora_a_default_name = name + ".lora_A.default" + lora_b_default_name = name + ".lora_B.default" + # Save-time PEFT adapter keys are emitted without `.default` suffix + # (e.g. `...lora_B.weight`), so keep aliases for gather_state_dict_for_save lookup. + lora_a_name = name + ".lora_A" + lora_b_name = name + ".lora_B" + + # Only add entries for layers that actually have LoRA adapters + has_lora = any( + replace_layer_number_by_wildcard(k.replace(".weight", "")) == lora_a_default_name + for k in state_dict_keys + if "lora_A.default" in k + ) + if not has_lora: + continue + if schema == "rowwise": + peft_tp_plan[lora_a_default_name] = "rowwise" + peft_tp_plan[lora_b_default_name] = "colwise_gather_output" peft_tp_plan[lora_a_name] = "rowwise" - peft_tp_plan[lora_b_name] = "colwise" + peft_tp_plan[lora_b_name] = "colwise_gather_output" elif schema == "colwise": + peft_tp_plan[lora_a_default_name] = "colwise" + peft_tp_plan[lora_b_default_name] = "lora_rowwise" peft_tp_plan[lora_a_name] = "colwise" peft_tp_plan[lora_b_name] = "lora_rowwise" + model.tp_plan.update(peft_tp_plan) @@ -87,14 +101,19 @@ def apply_tp_modification_for_peft(model, tp_mesh=None): # tp_layer.prepare_module_tp(module_obj, tp_mesh) # Shard the param - tp_layer.shard_tensor(param, tensor_idx=None, dtype=empty_param.dtype) - setattr(getattr(module_obj, param_name), "data", param) + + sharded = tp_layer.shard_tensor(param, tensor_idx=None, dtype=empty_param.dtype) + if sharded is not None: + if not isinstance(sharded, torch.nn.Parameter): + sharded = torch.nn.Parameter(sharded, requires_grad=empty_param.is_floating_point()) + setattr(module_obj, param_name, sharded) # replaces Parameter, not just .data def apply_peft_to_model(model, tp_mesh=None, peft_config=None): peft_config = peft_config # Add PEFT adapters to the model - model = get_peft_model(model, peft_config) + model.add_adapter(peft_config, "default") # sets _hf_peft_config_loaded = True + model.enable_adapters() # activates the adapter print_trainable_parameters(model) if tp_mesh is None: diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py index ab71925901..b54b5d7197 100644 --- a/QEfficient/finetune/experimental/core/utils/training_config_utils.py +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -49,7 +49,9 @@ def prepare_training_config( torch_dtype = training_config.pop("torch_dtype", None) if torch_dtype is None: raise ValueError("'torch_dtype' field is required in training configuration. 
Expected one of: ['fp16', 'bf16']") + training_config[torch_dtype] = True + training_config["data_seed"] = training_config.get("seed") # Restoring the "torch_dtype" after torch_dtype conversion using the saved value diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py index 69d2db92af..9052743cb0 100644 --- a/QEfficient/finetune/experimental/tests/test_config_manager.py +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -182,3 +182,79 @@ def test_torch_dtype_invalid(): config_manager.validate_config() assert "torch_dtype must be one of" in str(exc_info.value) + + +def test_parallelism_rejects_tp_plus_pp_combo(): + """TP cannot be combined with PP in supported mode matrix.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + training_config = TrainingConfig(tp_degree=2, pp_degree=2, ddp_degree=1) + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + with pytest.raises(ValueError) as exc_info: + config_manager.validate_config() + + assert "TP cannot be combined with PP" in str(exc_info.value) + + +def test_parallelism_rejects_ddp_plus_pp_combo(): + """DDP cannot be combined with PP in supported mode matrix.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + training_config = TrainingConfig(tp_degree=1, pp_degree=2, ddp_degree=2) + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + with pytest.raises(ValueError) as exc_info: + config_manager.validate_config() + + assert "DDP cannot be combined with PP" in str(exc_info.value) + + +def test_parallelism_world_size_product_mismatch(monkeypatch): + """WORLD_SIZE must match pp*tp*ddp when distributed env is set.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + monkeypatch.setenv("WORLD_SIZE", "8") + + training_config = TrainingConfig(tp_degree=2, pp_degree=1, ddp_degree=2) + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + with pytest.raises(ValueError) as exc_info: + config_manager.validate_config() + + assert "must equal WORLD_SIZE" in str(exc_info.value) + + +def test_parallelism_multi_server_rejects_tp(monkeypatch): + """TP and TP+DDP are rejected for multi-server launch.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + monkeypatch.setenv("WORLD_SIZE", "8") + monkeypatch.setenv("LOCAL_WORLD_SIZE", "4") + + training_config = TrainingConfig(tp_degree=2, pp_degree=1, ddp_degree=4) + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + with pytest.raises(ValueError) as exc_info: + config_manager.validate_config() + + assert "TP and TP+DDP are supported only on a single server" in str(exc_info.value) + + +def test_parallelism_valid_tp_ddp_single_server(monkeypatch): + """TP+DDP single-server should pass when WORLD_SIZE matches degree product.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + monkeypatch.setenv("WORLD_SIZE", "4") + monkeypatch.setenv("LOCAL_WORLD_SIZE", "4") + + training_config = TrainingConfig(tp_degree=2, pp_degree=1, ddp_degree=2) + master_config = MasterConfig(training=training_config) + config_manager = 
ConfigManager(config=master_config) + + # Should not raise + config_manager.validate_config() diff --git a/QEfficient/finetune/experimental/tests/test_tensor_parallel.py b/QEfficient/finetune/experimental/tests/test_tensor_parallel.py new file mode 100644 index 0000000000..4d411777ed --- /dev/null +++ b/QEfficient/finetune/experimental/tests/test_tensor_parallel.py @@ -0,0 +1,218 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +""" +Tensor Parallelism (TP) tests for experimental finetuning pipeline. +Covers TP-only and TP+DDP enablement paths for non-MoE models. +""" + +from unittest.mock import patch + +import pytest +import torch +from accelerate.utils import ParallelismConfig + +MODULE = "QEfficient.cloud.finetune_experimental" + +FineTuningPipeline = __import__(MODULE, fromlist=["FineTuningPipeline"]).FineTuningPipeline + + +# --------------------------------------------------------------------------- +# 1. Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tmp_outdir(tmp_path): + return tmp_path / "out" + + +@pytest.fixture +def mock_config_manager(mocker, tmp_outdir): + cm = mocker.MagicMock(name="ConfigManager") + cm.config = mocker.MagicMock() + cm.config.training = {"output_dir": str(tmp_outdir)} + return cm + + +@pytest.fixture +def model_bundle(mocker): + bundle = mocker.MagicMock(name="ModelBundle") + bundle.model = mocker.MagicMock(name="model") + bundle.tokenizer = mocker.MagicMock(name="tokenizer") + return bundle + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_tp_training_cfg(ddp_degree: int) -> dict: + """Create a minimal training config for TP-only or TP+DDP scenarios.""" + return { + "tp_degree": 2, + "ddp_degree": ddp_degree, + "device": "qaic", + "torch_dtype": "bf16", + "parallelism_config": ParallelismConfig(tp_size=2, dp_replicate_size=ddp_degree), + "type": "sft", + } + + +# --------------------------------------------------------------------------- +# 2. FineTuningPipeline integration – TP enablement and flow +# --------------------------------------------------------------------------- + + +@pytest.mark.finetune +@pytest.mark.parametrize( + "scenario_name,ddp_degree", + [ + ("tp_only", 1), + ("tp_ddp", 2), + ], +) +def test_tp_non_moe_enablement_functionality(mocker, mock_config_manager, model_bundle, scenario_name, ddp_degree): + """ + Functional test: TP enablement path for non-MoE models + in both TP-only and TP+DDP training configurations. 
+ """ + training_cfg = _make_tp_training_cfg(ddp_degree) + + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value=training_cfg) + mock_init_dist = mocker.patch.object(FineTuningPipeline, "_initialize_dist_tp", autospec=True) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, None)) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=model_bundle) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=mocker.MagicMock()) + + pipe = FineTuningPipeline(mock_config_manager) + + assert pipe.tp_enabled is True, f"TP should be enabled for scenario '{scenario_name}'" + mock_init_dist.assert_called_once_with(pipe) + + +@pytest.mark.finetune +@pytest.mark.parametrize( + "scenario_name,ddp_degree", + [ + ("tp_only", 1), + ("tp_ddp", 2), + ], +) +def test_tp_non_moe_full_pipeline_check(mocker, mock_config_manager, model_bundle, scenario_name, ddp_degree): + """ + Full pipeline check: constructor + run() flow + for TP-only and TP+DDP with non-MoE model path. + """ + training_cfg = _make_tp_training_cfg(ddp_degree) + + mocker.patch(f"{MODULE}.prepare_training_config", autospec=True, return_value=training_cfg) + mock_init_dist = mocker.patch.object(FineTuningPipeline, "_initialize_dist_tp", autospec=True) + mocker.patch.object(FineTuningPipeline, "_setup_environment", autospec=True) + mocker.patch.object(FineTuningPipeline, "_create_datasets", autospec=True, return_value=(None, None)) + mocker.patch.object(FineTuningPipeline, "_create_model", autospec=True, return_value=model_bundle) + mocker.patch.object(FineTuningPipeline, "_create_optimizer", autospec=True, return_value=(None, {})) + mocker.patch.object(FineTuningPipeline, "_create_callbacks", autospec=True, return_value=[]) + + trainer_obj = mocker.MagicMock(name=f"trainer_{scenario_name}") + mocker.patch.object(FineTuningPipeline, "_create_trainer", autospec=True, return_value=trainer_obj) + + pipe = FineTuningPipeline(mock_config_manager) + pipe.run() + + mock_init_dist.assert_called_once_with(pipe) + trainer_obj.train.assert_called_once() + + +# --------------------------------------------------------------------------- +# 3. Unit tests – model TP kwargs injection for non-MoE +# --------------------------------------------------------------------------- + + +@pytest.mark.finetune +def test_create_model_non_moe_tp_only_injects_tp_plan(mocker): + """ + Unit test: non-MoE model creation under TP-only configuration + injects TP arguments and does not apply PEFT TP hooks when use_peft=False. 
+ """ + pipe = FineTuningPipeline.__new__(FineTuningPipeline) + pipe.config_manager = mocker.MagicMock(name="ConfigManager") + + pc = ParallelismConfig(tp_size=2) + mocker.patch.object(pc, "build_device_mesh", autospec=True, return_value={"tp": "tp_mesh"}) + + pipe.training_config = { + "tp_degree": 2, + "device": "cpu", + "parallelism_config": pc, + } + pipe.config_manager.get_model_config.return_value = { + "model_type": "hf", + "model_name": "non-moe-model", + "use_peft": False, + } + + model_instance = mocker.MagicMock(name="ModelInstance") + original_weight = torch.nn.Parameter(torch.randn(4, 4)) + model_instance.model.lm_head.weight = original_weight + + with patch(f"{MODULE}.ComponentFactory") as mock_factory: + mock_factory.create_model.return_value = model_instance + mock_apply_peft = mocker.patch(f"{MODULE}.apply_peft_to_model", autospec=True) + + returned = pipe._create_model() + + assert returned is model_instance + mock_apply_peft.assert_not_called() + assert model_instance.model.lm_head.weight is not original_weight + assert isinstance(model_instance.model.lm_head.weight, torch.nn.Parameter) + + kwargs = mock_factory.create_model.call_args.kwargs + assert kwargs["tp_plan"] == "auto" + assert kwargs["tp_size"] == 2 + assert kwargs["device_mesh"] == "tp_mesh" + + +@pytest.mark.finetune +def test_create_model_non_moe_tp_ddp_injects_tp_plan(mocker): + """ + Unit test: non-MoE model creation under TP+DDP still uses + TP mesh injection path for model construction. + """ + pipe = FineTuningPipeline.__new__(FineTuningPipeline) + pipe.config_manager = mocker.MagicMock(name="ConfigManager") + + pc = ParallelismConfig(tp_size=2, dp_replicate_size=2) + mocker.patch.object(pc, "build_device_mesh", autospec=True, return_value={"tp": "tp_mesh", "dp": "dp_mesh"}) + + pipe.training_config = { + "tp_degree": 2, + "ddp_degree": 2, + "device": "cpu", + "parallelism_config": pc, + } + pipe.config_manager.get_model_config.return_value = { + "model_type": "hf", + "model_name": "non-moe-model", + "use_peft": False, + } + + model_instance = mocker.MagicMock(name="ModelInstance") + model_instance.model.lm_head.weight = torch.nn.Parameter(torch.randn(2, 2)) + + with patch(f"{MODULE}.ComponentFactory") as mock_factory: + mock_factory.create_model.return_value = model_instance + pipe._create_model() + + kwargs = mock_factory.create_model.call_args.kwargs + assert kwargs["tp_plan"] == "auto" + assert kwargs["tp_size"] == 2 + assert kwargs["device_mesh"] == "tp_mesh" From 6a73c0f6cf55ce92f977e2aaf0b436adff5337c7 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Thu, 16 Apr 2026 16:39:42 +0530 Subject: [PATCH 22/23] Adding local world size checks Signed-off-by: Sharvari Medhe --- .../experimental/core/config_manager.py | 18 +++++++++--------- .../core/utils/training_config_utils.py | 18 +++++++++++++++++- .../experimental/tests/test_config_manager.py | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 713734fade..697b373a00 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -845,21 +845,21 @@ def validate_config(self) -> None: "Supported modes are PP only, DDP only, TP only, or TP+DDP (single-server).", ) - # WORLD_SIZE consistency checks (when launched in distributed mode) - if "WORLD_SIZE" in os.environ: + # LOCAL_WORLD_SIZE consistency checks (when launched in distributed mode) + if 
"LOCAL_WORLD_SIZE" in os.environ: try: - world_size = int(os.environ["WORLD_SIZE"]) + local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) except ValueError: - world_size = -1 + local_world_size = -1 self._push( errors, - world_size < 1, - f"Invalid WORLD_SIZE={os.environ.get('WORLD_SIZE')!r}; expected a positive integer.", + local_world_size < 1, + f"Invalid LOCAL_WORLD_SIZE={os.environ.get('LOCAL_WORLD_SIZE')!r}; expected a positive integer.", ) if ( - world_size > 0 + local_world_size > 0 and isinstance(pp_degree, int) and isinstance(tp_degree, int) and isinstance(ddp_degree, int) @@ -867,10 +867,10 @@ def validate_config(self) -> None: expected_world_size = pp_degree * tp_degree * ddp_degree self._push( errors, - expected_world_size != world_size, + expected_world_size != local_world_size, "Parallelism degree mismatch: pp_degree * tp_degree * ddp_degree " f"must equal WORLD_SIZE ({pp_degree} * {tp_degree} * {ddp_degree} = {expected_world_size}, " - f"WORLD_SIZE={world_size}).", + f"LOCAL_WORLD_SIZE={local_world_size}).", ) local_world_size_raw = os.environ.get("LOCAL_WORLD_SIZE") diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py index b54b5d7197..7e74cc23df 100644 --- a/QEfficient/finetune/experimental/core/utils/training_config_utils.py +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -50,7 +50,23 @@ def prepare_training_config( if torch_dtype is None: raise ValueError("'torch_dtype' field is required in training configuration. Expected one of: ['fp16', 'bf16']") - training_config[torch_dtype] = True + # Normalize precision flags before mapping torch_dtype. + # This avoids contradictory user-provided combinations such as + # torch_dtype="fp16" with fp16=False. + training_config.pop("fp16", None) + training_config.pop("bf16", None) + + device = training_config.get("device", "qaic") + if device == "qaic": + # For QAIC, avoid setting HF's fp16/bf16 TrainingArguments flags: + # - bf16=True triggers a GPU-only capability check in TrainingArguments. + # - fp16=True routes through QAIC GradScaler unscale path that can fail in TP+DDP. + # Keep precision encoded via model torch_dtype instead. 
+        training_config["fp16"] = False
+        training_config["bf16"] = False
+    else:
+        if torch_dtype in ("fp16", "bf16"):
+            training_config[torch_dtype] = True
 
     training_config["data_seed"] = training_config.get("seed")
 
diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py
index 9052743cb0..c99f978d29 100644
--- a/QEfficient/finetune/experimental/tests/test_config_manager.py
+++ b/QEfficient/finetune/experimental/tests/test_config_manager.py
@@ -216,7 +216,7 @@ def test_parallelism_world_size_product_mismatch(monkeypatch):
     """WORLD_SIZE must match pp*tp*ddp when distributed env is set."""
     from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig
 
-    monkeypatch.setenv("WORLD_SIZE", "8")
+    monkeypatch.setenv("LOCAL_WORLD_SIZE", "4")
 
     training_config = TrainingConfig(tp_degree=2, pp_degree=1, ddp_degree=2)
     master_config = MasterConfig(training=training_config)

From 79541aa82d531182c84c8180b904684519263dfc Mon Sep 17 00:00:00 2001
From: Sharvari Medhe
Date: Mon, 20 Apr 2026 15:30:54 +0530
Subject: [PATCH 23/23] Adding TP+DDP related config changes

Signed-off-by: Sharvari Medhe
---
 QEfficient/cloud/finetune_experimental.py     | 32 +++++++++---
 .../experimental/core/config_manager.py       | 51 +++++++++----------
 .../core/utils/training_config_utils.py       | 17 +++++--
 .../experimental/tests/test_config_manager.py | 19 ++++++-
 4 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index c07a14ff01..f9e26c8ff0 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -9,6 +9,7 @@
 Main entry point for fine-tuning LLMs using the experimental finetune framework.
 """
 
+import inspect
 import logging
 import os
 from pathlib import Path
@@ -128,11 +129,14 @@ def _initialize_dist_tp(self):
         else:
             backend = "cpu:gloo,qaic:qccl"
 
-        dist.init_process_group(
-            backend=backend,  # "nccl" for GPUs, "gloo" for CPUs
-            world_size=WORLD_SIZE,  # total number of processes
-            rank=LOCAL_RANK,  # unique ID for this process
-        )
+        # Explicit initialization is required for TP/TP+DDP so that the process
+        # group backend is set correctly on QAIC.
+        if not dist.is_initialized():
+            dist.init_process_group(
+                backend=backend,  # composite backend, e.g. "cpu:gloo,qaic:qccl" on QAIC
+                world_size=WORLD_SIZE,  # total number of processes
+                rank=LOCAL_RANK,  # local rank; equals the global rank for single-server TP
+            )
 
     def _create_datasets(self) -> Tuple[Any, Any]:
         """
@@ -327,8 +331,27 @@ def _create_trainer(
         if training_config.get("report_to") is None:
             training_config["report_to"] = "tensorboard"
 
+        # Filter out keys not accepted by the concrete args class (e.g. trl.SFTConfig
+        # variants that do not support group_by_length).
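+        # Example with hypothetical keys: if args_cls accepts "output_dir" but not
+        # "custom_flag", a config of {"output_dir": "out", "custom_flag": 1} is
+        # filtered down to {"output_dir": "out"} and the drop is logged on rank 0.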
+ args_signature = inspect.signature(args_cls.__init__) + args_param_names = { + name for name, param in args_signature.parameters.items() if name != "self" and param.kind != param.VAR_KEYWORD + } + filtered_training_config = {k: v for k, v in training_config.items() if k in args_param_names} + + removed_keys = sorted(set(training_config) - set(filtered_training_config)) + if removed_keys: + logger.log_rank_zero( + "Dropping unsupported trainer args for " + f"{args_cls.__name__}: {', '.join(removed_keys)}", + level=logging.WARNING, + ) + # Create trainer arguments instance - args = args_cls(**training_config) + args = args_cls(**filtered_training_config) dataset_config_dict = self.config_manager.get_dataset_config() split_ratio = dataset_config_dict.get("split_ratio", 0.8) num_samples = dataset_config_dict.get("dataset_num_samples", -1) diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index 697b373a00..3db4da9281 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -242,10 +242,6 @@ class ModelConfig: default="AutoModelForCausalLM", metadata={"help": "The AutoClass name to load the model (e.g., 'AutoModelForCausalLM')."}, ) - # load_in_4bit: bool = field( - # default=False, - # metadata={"help": "Whether to load the model in 4-bit quantization."}, - # ) use_peft: bool = field( default=True, metadata={"help": "Whether to use PEFT (Parameter-Efficient Fine-Tuning)."}, @@ -330,10 +326,6 @@ class TrainingConfig: default="./training_results", metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, ) - # overwrite_output_dir: bool = field( - # default=False, - # metadata={"help": "Whether to overwrite the output directory."}, - # ) seed: int = field( default=42, metadata={"help": "Random seed for reproducibility."}, @@ -873,27 +865,30 @@ def validate_config(self) -> None: f"LOCAL_WORLD_SIZE={local_world_size}).", ) - local_world_size_raw = os.environ.get("LOCAL_WORLD_SIZE") - if local_world_size_raw is not None: - try: - local_world_size = int(local_world_size_raw) - except ValueError: - local_world_size = -1 - - self._push( - errors, - local_world_size < 1, - f"Invalid LOCAL_WORLD_SIZE={local_world_size_raw!r}; expected a positive integer.", - ) + if ( + world_size > 0 + and isinstance(pp_degree, int) + and isinstance(tp_degree, int) + and isinstance(ddp_degree, int) + and (pp_degree > 1 or tp_degree > 1) + ): + expected_world_size = pp_degree * tp_degree * ddp_degree + self._push( + errors, + expected_world_size != world_size, + "Parallelism degree mismatch for TP/PP modes: pp_degree * tp_degree * ddp_degree " + f"must equal WORLD_SIZE ({pp_degree} * {tp_degree} * {ddp_degree} = {expected_world_size}, " + f"WORLD_SIZE={world_size}).", + ) - if local_world_size > 0 and world_size > 0: - multi_server = world_size > local_world_size - self._push( - errors, - multi_server and tp_degree > 1, - "Unsupported parallelism combination: TP and TP+DDP are supported only on a single server. " - "Detected multi-server launch from WORLD_SIZE > LOCAL_WORLD_SIZE.", - ) + if local_world_size > 0 and world_size > 0: + multi_server = world_size > local_world_size + self._push( + errors, + multi_server and tp_degree > 1, + "Unsupported parallelism combination: TP and TP+DDP are supported only on a single server. 
" + "Detected multi-server launch from WORLD_SIZE > LOCAL_WORLD_SIZE.", + ) # DDP config ddp = training.get("ddp_config", {}) diff --git a/QEfficient/finetune/experimental/core/utils/training_config_utils.py b/QEfficient/finetune/experimental/core/utils/training_config_utils.py index 7e74cc23df..3395c48e78 100644 --- a/QEfficient/finetune/experimental/core/utils/training_config_utils.py +++ b/QEfficient/finetune/experimental/core/utils/training_config_utils.py @@ -35,12 +35,19 @@ def prepare_training_config( # Handle dtype conversion # To do: (For Tanisha) Check if torch_dtype should rather be added directly in model_config only in config_manager.py - # TODO: Add PC here parallelism_config = {} - if training_config.get("tp_degree", 1) > 1: - parallelism_config["tp_size"] = training_config["tp_degree"] - if training_config.get("ddp_degree", 1) > 1: - parallelism_config["dp_replicate_size"] = training_config["ddp_degree"] + tp_degree = training_config.get("tp_degree", 1) + pp_degree = training_config.get("pp_degree", 1) + ddp_degree = training_config.get("ddp_degree", 1) + + if tp_degree > 1: + parallelism_config["tp_size"] = tp_degree + + # ddp_degree is a TP+DDP shaping hint and should not force data-parallel sizing + # for pure DDP (single-node or multi-node), where Accelerate derives world size + # from launcher environment (RANK/WORLD_SIZE/LOCAL_WORLD_SIZE). + if ddp_degree > 1 and (tp_degree > 1 or pp_degree > 1): + parallelism_config["dp_replicate_size"] = ddp_degree if parallelism_config: # Only inject if at least one parallelism dimension is active pc = ParallelismConfig(**parallelism_config) diff --git a/QEfficient/finetune/experimental/tests/test_config_manager.py b/QEfficient/finetune/experimental/tests/test_config_manager.py index c99f978d29..8c980e063f 100644 --- a/QEfficient/finetune/experimental/tests/test_config_manager.py +++ b/QEfficient/finetune/experimental/tests/test_config_manager.py @@ -213,7 +213,7 @@ def test_parallelism_rejects_ddp_plus_pp_combo(): def test_parallelism_world_size_product_mismatch(monkeypatch): - """WORLD_SIZE must match pp*tp*ddp when distributed env is set.""" + """WORLD_SIZE must match pp*tp*ddp when TP/PP mode is enabled.""" from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig monkeypatch.setenv("LOCAL_WORLD_SIZE", "4") @@ -228,6 +228,23 @@ def test_parallelism_world_size_product_mismatch(monkeypatch): assert "must equal WORLD_SIZE" in str(exc_info.value) +def test_parallelism_multi_node_ddp_does_not_require_ddp_degree_world_size_match(monkeypatch): + """Pure multi-node DDP should not enforce pp*tp*ddp == WORLD_SIZE.""" + from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig + + monkeypatch.setenv("WORLD_SIZE", "8") + monkeypatch.setenv("LOCAL_WORLD_SIZE", "4") + + # Pure DDP mode (pp=1, tp=1). ddp_degree is treated as TP+DDP shaping hint, + # so WORLD_SIZE mismatch should not fail validation in pure DDP launches. + training_config = TrainingConfig(tp_degree=1, pp_degree=1, ddp_degree=4) + master_config = MasterConfig(training=training_config) + config_manager = ConfigManager(config=master_config) + + # Should not raise + config_manager.validate_config() + + def test_parallelism_multi_server_rejects_tp(monkeypatch): """TP and TP+DDP are rejected for multi-server launch.""" from QEfficient.finetune.experimental.core.config_manager import MasterConfig, TrainingConfig