From 40d4e2540e0e0a459738e4f1096b943ecf72908d Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 04:04:35 +0000
Subject: [PATCH 1/4] Updating logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py    |  2 +-
 .../experimental/core/config_manager.py      |  1 -
 .../finetune/experimental/core/dataset.py    |  7 ++--
 .../finetune/experimental/core/logger.py     | 41 ++++++++++++++++---
 .../finetune/experimental/core/model.py      |  1 -
 .../experimental/core/utils/dist_utils.py    | 17 ++++++++
 .../experimental/tests/test_logger.py        | 22 ++++++----
 QEfficient/utils/device_utils.py             |  2 -
 docs/source/hf_finetune.md                   | 30 +++++++++++---
 9 files changed, 96 insertions(+), 27 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 9828ea81ec..de45786d39 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,7 +265,7 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            logger.warning("Using fewer samples may impact finetuning quality.")
+            logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index 256904d225..a3e0a3cd2f 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -25,7 +25,6 @@
 from QEfficient.utils.device_utils import is_nsp_free
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 @dataclass
diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 766d851457..22594cb81b 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -26,7 +26,6 @@
 )
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 class BaseDataset(Dataset, ABC):
@@ -102,9 +101,11 @@ def __init__(
         if not os.path.isfile(self.json_file_path):
             raise FileNotFoundError(f"JSON file not found or invalid: '{self.json_file_path}'")
         if self.prompt_template and self.prompt_func_path:
-            logger.info("Both prompt_template and prompt_func are provided. Using prompt_template for preprocessing.")
+            logger.log_rank_zero(
+                "Both prompt_template and prompt_func are provided. Using prompt_template for preprocessing."
+            )
         if self.completion_template and self.completion_func_path:
-            logger.info(
+            logger.log_rank_zero(
                 "Both completion_template and completion_func are provided. Using completion_template for preprocessing."
             )
         if self.prompt_template is None and self.prompt_func_path is None:
diff --git a/QEfficient/finetune/experimental/core/logger.py b/QEfficient/finetune/experimental/core/logger.py
index a1b9c771f6..c4f5b47bd8 100644
--- a/QEfficient/finetune/experimental/core/logger.py
+++ b/QEfficient/finetune/experimental/core/logger.py
@@ -7,13 +7,13 @@
 
 import logging
-import sys
 from pathlib import Path
 from typing import Optional
 
 from transformers.utils.logging import get_logger as hf_get_logger
 
-from QEfficient.finetune.experimental.core.utils.dist_utils import get_local_rank
+from QEfficient.finetune.experimental.core.utils.dist_utils import is_global_rank_zero
+
 
 
 # -----------------------------------------------------------------------------
 # Logger usage:
@@ -27,6 +27,34 @@
 # Attach file handler later if needed:
 # logger.prepare_for_logs(output_dir="logs", log_level="DEBUG")
 # -----------------------------------------------------------------------------
+class QEffFormatter(logging.Formatter):
+    """
+    Formatter that sets colors for printing messages of different logging levels on the console.
+    """
+
+    cyan: str = "\x1b[38;5;14m"
+    yellow: str = "\x1b[33;20m"
+    red: str = "\x1b[31;20m"
+    bold_red: str = "\x1b[31;1m"
+    reset: str = "\x1b[0m"
+    common_format: str = "%(levelname)s - %(name)s - %(message)s"  # type: ignore
+    format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)"  # type: ignore
+
+    FORMATS = {
+        logging.DEBUG: cyan + format_with_line_info + reset,
+        logging.INFO: cyan + common_format + reset,
+        logging.WARNING: yellow + common_format + reset,
+        logging.ERROR: red + format_with_line_info + reset,
+        logging.CRITICAL: bold_red + format_with_line_info + reset,
+    }
+
+    def format(self, record):
+        """
+        Overrides the base class method to choose the format based on the log level.
+        """
+        log_fmt = self.FORMATS.get(record.levelno)
+        formatter = logging.Formatter(log_fmt)
+        return formatter.format(record)
 
 
 class Logger:
@@ -48,7 +76,7 @@ def __init__(
         """
         self.logger = hf_get_logger(name)
         self.logger.setLevel(level)
-
+        self.logger.propagate = False
         # Clear any existing handlers
         self.logger.handlers.clear()
 
@@ -56,9 +84,9 @@ def __init__(
         self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
         # Console handler
-        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler = logging.StreamHandler()
         console_handler.setLevel(level)
-        console_handler.setFormatter(self.formatter)
+        console_handler.setFormatter(QEffFormatter())
         self.logger.addHandler(console_handler)
 
         # File handler (if log_file is provided)
@@ -100,7 +128,7 @@ def log_rank_zero(self, message: str, level: int = logging.INFO) -> None:
             message: Message to log
             level: Logging level
         """
-        if get_local_rank() == 0:
+        if is_global_rank_zero():
             self.logger.log(level, message)
 
     def log_exception(self, message: str, exception: Exception, raise_exception: bool = True) -> None:
@@ -130,6 +158,7 @@ def prepare_for_logs(self, output_dir: Optional[str] = None, log_level: str = "I
         # Convert string log level to logging constant
         level = getattr(logging, log_level.upper(), logging.INFO)
         self.logger.setLevel(level)
+        self.logger.propagate = False
 
         # Update existing handlers' levels
         for handler in self.logger.handlers:
diff --git a/QEfficient/finetune/experimental/core/model.py b/QEfficient/finetune/experimental/core/model.py
index f9a4d2fab1..0f087e6653 100644
--- a/QEfficient/finetune/experimental/core/model.py
+++ b/QEfficient/finetune/experimental/core/model.py
@@ -18,7 +18,6 @@
 from QEfficient.finetune.experimental.core.utils.dataset_utils import insert_pad_token
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 class BaseModel(nn.Module, ABC):
diff --git a/QEfficient/finetune/experimental/core/utils/dist_utils.py b/QEfficient/finetune/experimental/core/utils/dist_utils.py
index aed88862d8..069d91445a 100644
--- a/QEfficient/finetune/experimental/core/utils/dist_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/dist_utils.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+import os
 
 import torch.distributed as dist
 
@@ -37,3 +38,19 @@ def get_world_size() -> int:
 def is_main_process() -> bool:
     """Check if the current process is the main process (rank 0)."""
     return get_rank() == 0
+
+
+def get_global_rank() -> int:
+    """Return global rank if available (torchrun/deepspeed), else fall back to local rank."""
+    r = os.environ.get("RANK")
+    if r is not None:
+        try:
+            return int(r)
+        except ValueError:
+            return 0
+    # Fallback to local rank
+    return int(get_local_rank())
+
+
+def is_global_rank_zero() -> bool:
+    return get_global_rank() == 0
diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py
index 0af0c8b512..3c0506881a 100644
--- a/QEfficient/finetune/experimental/tests/test_logger.py
+++ b/QEfficient/finetune/experimental/tests/test_logger.py
@@ -48,6 +48,7 @@ def test_init_with_file(self, tmp_path):
     def test_log_levels(self, caplog):
         """Test all log levels work correctly"""
         logger = Logger("level_test_logger", level=logging.DEBUG)
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.DEBUG):
             logger.debug("Debug message")
@@ -63,22 +64,24 @@ def test_log_levels(self, caplog):
         assert "Error message" in caplog.text
         assert "Critical message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_positive_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_positive_case(self, mock_get_global_rank, caplog):
         """Test rank zero logging functionality"""
-        mock_get_local_rank.return_value = 0
+        mock_get_global_rank.return_value = True
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Rank zero message")
 
         assert "Rank zero message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_negative_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_negative_case(self, mock_get_global_rank, caplog):
         """Test to verify that only rank‑zero messages are logged"""
-        mock_get_local_rank.return_value = 1
+        mock_get_global_rank.return_value = False
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Should not appear")
@@ -88,6 +91,7 @@ def test_log_exception_raise(self, caplog):
         """Test exception logging with raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with pytest.raises(ValueError), caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=True)
@@ -99,6 +103,7 @@ def test_log_exception_no_raise(self, caplog):
         """Test exception logging without raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=False)
@@ -188,6 +193,7 @@ def test_complete_workflow(self, tmp_path, caplog):
         # Setup
         log_file = tmp_path / "workflow.log"
         logger = Logger("workflow_test", str(log_file), logging.DEBUG)
+        logger.logger.propagate = True
 
         # Test all methods
         logger.debug("Debug test")
@@ -203,8 +209,8 @@ def test_complete_workflow(self, tmp_path, caplog):
             logger.log_exception("Caught exception", e, raise_exception=False)
 
         # Test rank zero logging
-        with patch("QEfficient.finetune.experimental.core.logger.get_local_rank") as mock_rank:
-            mock_rank.return_value = 0
+        with patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero") as mock_rank:
+            mock_rank.return_value = True
             logger.log_rank_zero("Rank zero test")
 
         # Verify all messages were logged
diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py
index 15bcfa2983..149b12a8a0 100644
--- a/QEfficient/utils/device_utils.py
+++ b/QEfficient/utils/device_utils.py
@@ -42,8 +42,6 @@ def is_nsp_free():
             # Check if NSP free is eqaul to total nsp
             if nsp_free != nsp_total:
                 raise RuntimeError(f"QAIC device {qid_idx} does not have {nsp_total} NSP free")
-            else:
-                logger.info(f"QAIC device {qid_idx} has {nsp_free} NSP free")
         else:
             logger.warning("Failed to parse NSP free information from qaic-util output")
 
diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index f966dff58a..62c43597dc 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -50,31 +50,51 @@
 export QAIC_DEVICE_LOG_LEVEL=0  # Device-level logs
 export QAIC_DEBUG=1             # Show CPU fallback ops, etc.
 
 # Set temp directory
-export TMPDIR = $HOME/tmp
+export TMPDIR=$HOME/tmp
 ```
 
 ### Step-by-Step Guide to run a fine-tuning job
 
+### For QAIC Training
 For Docker-based environments, use the provided `torch-qaic-env` environment.
 
 ```bash
-source /opt/torch-qaic-env/bin/activate
+python -m venv finetune_env
+source finetune_env/bin/activate
 git clone https://github.com/quic/efficient-transformers.git
-git checkout ft_experimental
 cd efficient-transformers
+git checkout ft_experimental
 pip install -e .
 pip install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://devpi.qualcomm.com/qcom/dev/+simple --trusted-host devpi.qualcomm.com "torch==2.9.1+cpu" "torchvision==0.24.1+cpu" "torchaudio==2.9.1+cpu"
 pip install trl==0.22.0
-git clone https://github.com/quic-swatia/transformers.git
+cd .. && git clone https://github.com/quic-swatia/transformers.git
 cd transformers
 git checkout version-4.55.0 && pip install -e .
-cd .. && QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+cd .. && cd efficient-transformers
+QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
 ```
 
 > **Note**
 > If you’re using the `torch-qaic-env` Docker environment, `torch_qaic` and `accelerate` may already be installed.
 
+### For CUDA Training
+
+```bash
+python -m venv finetune_env
+source finetune_env/bin/activate
+git clone https://github.com/quic/efficient-transformers.git
+cd efficient-transformers
+git checkout ft_experimental
+pip install -e .
+pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
+pip install trl==0.22.0
+cd .. && git clone https://github.com/quic-swatia/transformers.git
+cd transformers
+git checkout version-4.55.0 && pip install -e .
+cd .. && cd efficient-transformers
+CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 -m QEfficient.cloud.finetune_experimental --device cuda --num_epochs 1 --model_name meta-llama/Llama-3.2-3B --dataset_name yahma/alpaca-cleaned --train_batch_size 1 --gradient_accumulation_steps 768 --prompt_func QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt --completion_template {output}
+```
 ***
 
 ## Finetuning

From b29c9eba17d6bec56629e3d65bc725d96f260e68 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 06:05:37 +0000
Subject: [PATCH 2/4] updated test_logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py             | 3 ++-
 QEfficient/finetune/experimental/tests/test_logger.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index de45786d39..7f6a2b6f7f 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,7 +265,8 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
+            if (num_samples*split_ratio)/len(train_dataset) <=0.05:
+                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)
diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py
index 3c0506881a..d976dc5c0a 100644
--- a/QEfficient/finetune/experimental/tests/test_logger.py
+++ b/QEfficient/finetune/experimental/tests/test_logger.py
@@ -173,7 +173,7 @@ def test_get_logger_with_file(self, tmp_path):
         # Check that we have 2 handlers (console + file)
         assert len(logger.logger.handlers) == 2  # Console + file
-        assert isinstance(logger.logger.handlers[1], logging.FileHandler)
+        assert any(isinstance(h, logging.FileHandler) for h in logger.logger.handlers)
 
         # Check file exists
         assert log_file.exists()
 
From 346bd74e4293b3ba9fbe82790f5d8f831a2a4722 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 06:07:54 +0000
Subject: [PATCH 3/4] updated test_logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 7f6a2b6f7f..08ea8f5e5b 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,8 +265,8 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            if (num_samples*split_ratio)/len(train_dataset) <=0.05:
-                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
+            if (num_samples * split_ratio) / len(train_dataset) <= 0.05:
+                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)

From 2500754f110342009476bddc8b0e7f53bd094a0c Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:11:44 +0000
Subject: [PATCH 4/4] Removing tensorboard from callback

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/configs/sft_ddp_config.yaml  | 1 -
 .../experimental/configs/sft_single_device_alpaca_config.yaml | 1 -
 .../experimental/configs/sft_single_device_gsm8k_config.yaml  | 1 -
 3 files changed, 3 deletions(-)

diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
index 242a81ef89..f7a0f6b1a9 100644
--- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -53,4 +53,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
index 6dcd25ced4..dfc5bd09c3 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
@@ -46,4 +46,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
index cd295e06f8..f8627f6dad 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
@@ -47,4 +47,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard: