From 40d4e2540e0e0a459738e4f1096b943ecf72908d Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 04:04:35 +0000
Subject: [PATCH 1/4] Updating logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py    |  2 +-
 .../experimental/core/config_manager.py      |  1 -
 .../finetune/experimental/core/dataset.py    |  7 ++--
 .../finetune/experimental/core/logger.py     | 41 ++++++++++++++++---
 .../finetune/experimental/core/model.py      |  1 -
 .../experimental/core/utils/dist_utils.py    | 17 ++++++++
 .../experimental/tests/test_logger.py        | 22 ++++++----
 QEfficient/utils/device_utils.py             |  2 -
 docs/source/hf_finetune.md                   | 30 +++++++++++---
 9 files changed, 96 insertions(+), 27 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 9828ea81ec..de45786d39 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,7 +265,7 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            logger.warning("Using fewer samples may impact finetuning quality.")
+            logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index 256904d225..a3e0a3cd2f 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -25,7 +25,6 @@
 from QEfficient.utils.device_utils import is_nsp_free
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 @dataclass
diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 766d851457..22594cb81b 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -26,7 +26,6 @@
 )
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 class BaseDataset(Dataset, ABC):
@@ -102,9 +101,11 @@ def __init__(
         if not os.path.isfile(self.json_file_path):
             raise FileNotFoundError(f"JSON file not found or invalid: '{self.json_file_path}'")
         if self.prompt_template and self.prompt_func_path:
-            logger.info("Both prompt_template and prompt_func are provided. Using prompt_template for preprocessing.")
+            logger.log_rank_zero(
+                "Both prompt_template and prompt_func are provided. Using prompt_template for preprocessing."
+            )
         if self.completion_template and self.completion_func_path:
-            logger.info(
+            logger.log_rank_zero(
                 "Both completion_template and completion_func are provided. Using completion_template for preprocessing."
             )
         if self.prompt_template is None and self.prompt_func_path is None:
diff --git a/QEfficient/finetune/experimental/core/logger.py b/QEfficient/finetune/experimental/core/logger.py
index a1b9c771f6..c4f5b47bd8 100644
--- a/QEfficient/finetune/experimental/core/logger.py
+++ b/QEfficient/finetune/experimental/core/logger.py
@@ -7,13 +7,13 @@
 
 import logging
-import sys
 from pathlib import Path
 from typing import Optional
 
 from transformers.utils.logging import get_logger as hf_get_logger
 
-from QEfficient.finetune.experimental.core.utils.dist_utils import get_local_rank
+from QEfficient.finetune.experimental.core.utils.dist_utils import is_global_rank_zero
+
 
 
 # -----------------------------------------------------------------------------
 # Logger usage:
@@ -27,6 +27,34 @@
 # Attach file handler later if needed:
 # logger.prepare_for_logs(output_dir="logs", log_level="DEBUG")
 # -----------------------------------------------------------------------------
+class QEffFormatter(logging.Formatter):
+    """
+    Formatter that sets colors for printing messages of different logging levels on the console.
+    """
+
+    cyan: str = "\x1b[38;5;14m"
+    yellow: str = "\x1b[33;20m"
+    red: str = "\x1b[31;20m"
+    bold_red: str = "\x1b[31;1m"
+    reset: str = "\x1b[0m"
+    common_format: str = "%(levelname)s - %(name)s - %(message)s"  # type: ignore
+    format_with_line_info = "%(levelname)s - %(name)s - %(message)s (%(filename)s:%(lineno)d)"  # type: ignore
+
+    FORMATS = {
+        logging.DEBUG: cyan + format_with_line_info + reset,
+        logging.INFO: cyan + common_format + reset,
+        logging.WARNING: yellow + common_format + reset,
+        logging.ERROR: red + format_with_line_info + reset,
+        logging.CRITICAL: bold_red + format_with_line_info + reset,
+    }
+
+    def format(self, record):
+        """
+        Overrides the base class method to choose the format based on the log level.
+        """
+        log_fmt = self.FORMATS.get(record.levelno)
+        formatter = logging.Formatter(log_fmt)
+        return formatter.format(record)
 
 
 class Logger:
@@ -48,7 +76,7 @@ def __init__(
         """
         self.logger = hf_get_logger(name)
         self.logger.setLevel(level)
-
+        self.logger.propagate = False
         # Clear any existing handlers
         self.logger.handlers.clear()
 
@@ -56,9 +84,9 @@ def __init__(
         self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
         # Console handler
-        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler = logging.StreamHandler()
         console_handler.setLevel(level)
-        console_handler.setFormatter(self.formatter)
+        console_handler.setFormatter(QEffFormatter())
         self.logger.addHandler(console_handler)
 
         # File handler (if log_file is provided)
@@ -100,7 +128,7 @@ def log_rank_zero(self, message: str, level: int = logging.INFO) -> None:
             message: Message to log
             level: Logging level
         """
-        if get_local_rank() == 0:
+        if is_global_rank_zero():
             self.logger.log(level, message)
 
     def log_exception(self, message: str, exception: Exception, raise_exception: bool = True) -> None:
@@ -130,6 +158,7 @@ def prepare_for_logs(self, output_dir: Optional[str] = None, log_level: str = "I
         # Convert string log level to logging constant
         level = getattr(logging, log_level.upper(), logging.INFO)
         self.logger.setLevel(level)
+        self.logger.propagate = False
 
         # Update existing handlers' levels
         for handler in self.logger.handlers:
diff --git a/QEfficient/finetune/experimental/core/model.py b/QEfficient/finetune/experimental/core/model.py
index f9a4d2fab1..0f087e6653 100644
--- a/QEfficient/finetune/experimental/core/model.py
+++ b/QEfficient/finetune/experimental/core/model.py
@@ -18,7 +18,6 @@
 from QEfficient.finetune.experimental.core.utils.dataset_utils import insert_pad_token
 
 logger = Logger(__name__)
-logger.logger.propagate = False
 
 
 class BaseModel(nn.Module, ABC):
diff --git a/QEfficient/finetune/experimental/core/utils/dist_utils.py b/QEfficient/finetune/experimental/core/utils/dist_utils.py
index aed88862d8..069d91445a 100644
--- a/QEfficient/finetune/experimental/core/utils/dist_utils.py
+++ b/QEfficient/finetune/experimental/core/utils/dist_utils.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+import os
 
 import torch.distributed as dist
 
@@ -37,3 +38,19 @@ def get_world_size() -> int:
 def is_main_process() -> bool:
     """Check if the current process is the main process (rank 0)."""
     return get_rank() == 0
+
+
+def get_global_rank() -> int:
+    """Return global rank if available (torchrun/deepspeed), else fall back to local rank."""
+    r = os.environ.get("RANK")
+    if r is not None:
+        try:
+            return int(r)
+        except ValueError:
+            return 0
+    # Fallback to local rank
+    return int(get_local_rank())
+
+
+def is_global_rank_zero() -> bool:
+    return get_global_rank() == 0
diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py
index 0af0c8b512..3c0506881a 100644
--- a/QEfficient/finetune/experimental/tests/test_logger.py
+++ b/QEfficient/finetune/experimental/tests/test_logger.py
@@ -48,6 +48,7 @@ def test_init_with_file(self, tmp_path):
     def test_log_levels(self, caplog):
         """Test all log levels work correctly"""
         logger = Logger("level_test_logger", level=logging.DEBUG)
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.DEBUG):
             logger.debug("Debug message")
@@ -63,22 +64,24 @@ def test_log_levels(self, caplog):
         assert "Error message" in caplog.text
         assert "Critical message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_positive_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_positive_case(self, mock_get_global_rank, caplog):
         """Test rank zero logging functionality"""
-        mock_get_local_rank.return_value = 0
+        mock_get_global_rank.return_value = True
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Rank zero message")
 
         assert "Rank zero message" in caplog.text
 
-    @patch("QEfficient.finetune.experimental.core.logger.get_local_rank")
-    def test_log_rank_zero_negative_case(self, mock_get_local_rank, caplog):
+    @patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero")
+    def test_log_rank_zero_negative_case(self, mock_get_global_rank, caplog):
         """Test to verify that only rank‑zero messages are logged"""
-        mock_get_local_rank.return_value = 1
+        mock_get_global_rank.return_value = False
         logger = Logger("rank_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.INFO):
             logger.log_rank_zero("Should not appear")
@@ -88,6 +91,7 @@ def test_log_exception_raise(self, caplog):
         """Test exception logging with raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with pytest.raises(ValueError), caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=True)
@@ -99,6 +103,7 @@ def test_log_exception_no_raise(self, caplog):
         """Test exception logging without raising"""
         logger = Logger("exception_test_logger")
+        logger.logger.propagate = True
 
         with caplog.at_level(logging.ERROR):
             logger.log_exception("Custom error", ValueError("Test exception"), raise_exception=False)
@@ -188,6 +193,7 @@ def test_complete_workflow(self, tmp_path, caplog):
         # Setup
         log_file = tmp_path / "workflow.log"
         logger = Logger("workflow_test", str(log_file), logging.DEBUG)
+        logger.logger.propagate = True
 
         # Test all methods
         logger.debug("Debug test")
@@ -203,8 +209,8 @@ def test_complete_workflow(self, tmp_path, caplog):
             logger.log_exception("Caught exception", e, raise_exception=False)
 
         # Test rank zero logging
-        with patch("QEfficient.finetune.experimental.core.logger.get_local_rank") as mock_rank:
-            mock_rank.return_value = 0
+        with patch("QEfficient.finetune.experimental.core.logger.is_global_rank_zero") as mock_rank:
+            mock_rank.return_value = True
             logger.log_rank_zero("Rank zero test")
 
         # Verify all messages were logged
diff --git a/QEfficient/utils/device_utils.py b/QEfficient/utils/device_utils.py
index 15bcfa2983..149b12a8a0 100644
--- a/QEfficient/utils/device_utils.py
+++ b/QEfficient/utils/device_utils.py
@@ -42,8 +42,6 @@ def is_nsp_free():
             # Check if NSP free is eqaul to total nsp
             if nsp_free != nsp_total:
                 raise RuntimeError(f"QAIC device {qid_idx} does not have {nsp_total} NSP free")
-            else:
-                logger.info(f"QAIC device {qid_idx} has {nsp_free} NSP free")
         else:
             logger.warning("Failed to parse NSP free information from qaic-util output")
 
diff --git a/docs/source/hf_finetune.md b/docs/source/hf_finetune.md
index f966dff58a..62c43597dc 100644
--- a/docs/source/hf_finetune.md
+++ b/docs/source/hf_finetune.md
@@ -50,31 +50,51 @@
 export QAIC_DEVICE_LOG_LEVEL=0  # Device-level logs
 export QAIC_DEBUG=1             # Show CPU fallback ops, etc.
 
 # Set temp directory
-export TMPDIR = $HOME/tmp
+export TMPDIR=$HOME/tmp
 ```
 
 ### Step-by-Step Guide to run a fine-tuning job
 
+### For QAIC Training
 For Docker-based environments, use the provided `torch-qaic-env` environment.
 
 ```bash
-source /opt/torch-qaic-env/bin/activate
+python -m venv finetune_env
+source finetune_env/bin/activate
 git clone https://github.com/quic/efficient-transformers.git
-git checkout ft_experimental
 cd efficient-transformers
+git checkout ft_experimental
 pip install -e .
 pip install --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://devpi.qualcomm.com/qcom/dev/+simple --trusted-host devpi.qualcomm.com "torch==2.9.1+cpu" "torchvision==0.24.1+cpu" "torchaudio==2.9.1+cpu"
 pip install trl==0.22.0
-git clone https://github.com/quic-swatia/transformers.git
+cd .. && git clone https://github.com/quic-swatia/transformers.git
 cd transformers
 git checkout version-4.55.0 && pip install -e .
-cd .. && QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+cd .. && cd efficient-transformers
+QAIC_VISIBLE_DEVICES=0 python QEfficient/cloud/finetune_experimental.py QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
 ```
 
 > **Note**
 > If you’re using the `torch-qaic-env` Docker environment, `torch_qaic` and `accelerate` may already be installed.
 
+### For CUDA Training
+
+```bash
+python -m venv finetune_env
+source finetune_env/bin/activate
+git clone https://github.com/quic/efficient-transformers.git
+cd efficient-transformers
+git checkout ft_experimental
+pip install -e .
+pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
+pip install trl==0.22.0
+cd .. && git clone https://github.com/quic-swatia/transformers.git
+cd transformers
+git checkout version-4.55.0 && pip install -e .
+cd .. && cd efficient-transformers
+CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 -m QEfficient.cloud.finetune_experimental --device cuda --num_epochs 1 --model_name meta-llama/Llama-3.2-3B --dataset_name yahma/alpaca-cleaned --train_batch_size 1 --gradient_accumulation_steps 768 --prompt_func QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt --completion_template {output}
+```
 ***
 
 ## Finetuning

From b29c9eba17d6bec56629e3d65bc725d96f260e68 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 06:05:37 +0000
Subject: [PATCH 2/4] updated test_logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py             | 3 ++-
 QEfficient/finetune/experimental/tests/test_logger.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index de45786d39..7f6a2b6f7f 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,7 +265,8 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
+            if (num_samples*split_ratio)/len(train_dataset) <=0.05:
+                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)
diff --git a/QEfficient/finetune/experimental/tests/test_logger.py b/QEfficient/finetune/experimental/tests/test_logger.py
index 3c0506881a..d976dc5c0a 100644
--- a/QEfficient/finetune/experimental/tests/test_logger.py
+++ b/QEfficient/finetune/experimental/tests/test_logger.py
@@ -173,7 +173,7 @@ def test_get_logger_with_file(self, tmp_path):
         # Check that we have 2 handlers (console + file)
         assert len(logger.logger.handlers) == 2  # Console + file
-        assert isinstance(logger.logger.handlers[1], logging.FileHandler)
+        assert any(isinstance(h, logging.FileHandler) for h in logger.logger.handlers)
 
         # Check file exists
         assert log_file.exists()
 
From 346bd74e4293b3ba9fbe82790f5d8f831a2a4722 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Fri, 13 Mar 2026 06:07:54 +0000
Subject: [PATCH 3/4] updated test_logger.py

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 7f6a2b6f7f..08ea8f5e5b 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -265,8 +265,8 @@ def _create_trainer(
         if num_samples > 0:
             # Truncating datasets to a smaller number of samples.
             # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
-            if (num_samples*split_ratio)/len(train_dataset) <=0.05:
-                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
+            if (num_samples * split_ratio) / len(train_dataset) <= 0.05:
+                logger.log_rank_zero("Using fewer samples may impact finetuning quality.", logging.WARNING)
             subset_train_indices = list(range(0, int(num_samples * split_ratio)))
             subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
             eval_dataset = eval_dataset.select(subset_eval_indices)

From 2500754f110342009476bddc8b0e7f53bd094a0c Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:11:44 +0000
Subject: [PATCH 4/4] Removing tensorboard from callback

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/configs/sft_ddp_config.yaml  | 1 -
 .../experimental/configs/sft_single_device_alpaca_config.yaml | 1 -
 .../experimental/configs/sft_single_device_gsm8k_config.yaml  | 1 -
 3 files changed, 3 deletions(-)

diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
index 242a81ef89..f7a0f6b1a9 100644
--- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -53,4 +53,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
index 6dcd25ced4..dfc5bd09c3 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
@@ -46,4 +46,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
index cd295e06f8..f8627f6dad 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
@@ -47,4 +47,3 @@ callbacks:
   early_stopping:
     early_stopping_patience: 3  # Number of epochs to wait before stopping training
     early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
-  tensorboard: