From a4ee682776a3c778ace8669be1cc39efaf1d0a95 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:02:26 +0000
Subject: [PATCH 1/6] Adding style remix dataset config

Signed-off-by: Tanisha Chawada
---
 ...t_single_device_custom_dataset_config.yaml | 48 +++++++++++++++++++
 .../experimental/core/config_manager.py       |  6 ++-
 .../finetune/experimental/core/dataset.py     |  8 ++++
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml

diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
new file mode 100644
index 000000000..8efc196dd
--- /dev/null
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -0,0 +1,48 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# Model configuration
+model:
+  model_type: "hf" # Hugging Face model
+  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
+  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
+  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0
+    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
+    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
+    peft_type: "LORA" # Options: LORA, IA3, etc.
+
+# Dataset configuration
+dataset:
+  dataset_type: "sft_dataset"
+  dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub
+  prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
+  completion_template: "{generation}" # Model will be trained on this part.
+  dataset_disc_style: "sarcasm_more" # Style of dataset to use
+
+# Training configuration
+training:
+  type: "sft"
+  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
+  per_device_train_batch_size: 1 # Batch size per device during training
+  num_train_epochs: 1
+  torch_compile: False # Whether to use torch.compile
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"
+  lr: 2e-4
+
+scheduler:
+  scheduler_name: "cosine"
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3 # Number of epochs to wait before stopping training
+    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index a3e0a3cd2..a2317ca91 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -144,9 +144,13 @@ class DatasetConfig:
         metadata={"help": "Function for formatting output completions (e.g., '{output}')."},
     )
     collate_fn: str = field(
-        default="dynamic_padding",
+        default=None,
         metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
     )
+    dataset_disc_style: str = field(
+        default=None,
+        metadata={"help": "DiSC style category used to filter the dataset (e.g., 'sarcasm_more')."},
+    )
     group_by_length: bool = field(
         default=True,
         metadata={"help": "Whether to group samples by length to minimize padding."},
diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 22594cb81..9954ef5ae 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -96,6 +96,7 @@ def __init__(
         self.completion_func_path = kwargs.get("completion_func", None)
         self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True)
         self.config_name = kwargs.get("config_name", None)
+        self.dataset_disc_style = kwargs.get("dataset_disc_style", None)
 
         if self.json_file_path not in (None, ""):
             if not os.path.isfile(self.json_file_path):
@@ -149,6 +150,13 @@ def _initialize_dataset(self):
                 load_split = "train"
             # FIXME: Add streaming support for larger datasets.
             self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
+            if self.dataset_disc_style:
+                available_styles = set(self.dataset["category"])
+                if self.dataset_disc_style not in available_styles:
+                    raise RuntimeError(
+                        f"For the DiSC dataset, the provided dataset_disc_style '{self.dataset_disc_style}' is not supported. Available styles: {sorted(available_styles)}."
+                    )
+                self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style)
 
         if len(available_splits) == 1:
             self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
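
A minimal, standalone sketch (not part of the patch) of what the new `dataset_disc_style` option does, assuming the Hugging Face `datasets` package and the `original`, `generation`, and `category` columns that the config and the filter above imply for `hallisky/DiSC`:

```python
# Illustrative only: mirrors the style filtering added in this patch.
from datasets import load_dataset

dataset_disc_style = "sarcasm_more"

dataset = load_dataset("hallisky/DiSC", split="train")
available_styles = set(dataset["category"])
if dataset_disc_style not in available_styles:
    raise RuntimeError(f"Style '{dataset_disc_style}' not in {sorted(available_styles)}.")
# Keep only the rows belonging to the requested style category.
dataset = dataset.filter(lambda example: example["category"] == dataset_disc_style)

# Render one training sample with the templates from the config above.
sample = dataset[0]
prompt = "### Original:{original} \n ### Rewrite:\n".format(original=sample["original"])
completion = "{generation}".format(generation=sample["generation"])
print(prompt + completion)
```
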
From 6f66c42cfd9696ba162e552f03a1e7c80ec6ef02 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:34:23 +0000
Subject: [PATCH 2/6] Added documentation for style-remix

Signed-off-by: Tanisha Chawada
---
 docs/source/config.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/config.md b/docs/source/config.md
index 7b5be6d0c..170c2fc42 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name.
 * **train_batch_size**: `default = 1` → Per-device batch size during training.
 * **eval_batch_size**: `default = 1` → Per-device batch size during evaluation.
 * **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch).
+* **dataset_disc_style**: `default = None` → DiSC style category to keep when filtering the dataset during preprocessing (e.g., "sarcasm_more"); when None, no filtering is applied and samples from all styles are kept.
+
 * **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching.
 * **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field).
 * **num_workers**: `default = 4` → Number of subprocesses to use for data loading.
@@ -159,6 +161,17 @@ dataset:
   completion_template: "{target}"
 ```
 
+#### **5. Style-Remix (hallisky/DiSC)**
+
+```yaml
+dataset:
+  dataset_type: "sft_dataset"
+  dataset_name: "hallisky/DiSC"
+  prompt_template: "### Original:{original} \n ### Rewrite:\n"
+  completion_template: "{generation}"
+  dataset_disc_style: "sarcasm_more"
+
+```
 ***
 ## 3. Training Configuration
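
The `collate_fn` bullet above describes dynamic padding. A sketch of what such a collator looks like, assuming token-ID features and a placeholder pad id (this is not the repo's implementation):

```python
# Illustrative dynamic-padding collator: pads each batch to its own longest
# sequence instead of a global maximum length. PAD_ID is an assumption; in
# practice use the tokenizer's pad_token_id.
import torch

PAD_ID = 0

def dynamic_padding_collate(batch):
    # batch: list of dicts, each with a variable-length "input_ids" list
    max_len = max(len(example["input_ids"]) for example in batch)
    input_ids, attention_mask = [], []
    for example in batch:
        ids = example["input_ids"]
        padding = [PAD_ID] * (max_len - len(ids))
        input_ids.append(ids + padding)
        attention_mask.append([1] * len(ids) + [0] * len(padding))
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
    }
```
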
From ed42e52d8333eca1c4b6c9f802bb6dbe948f69ca Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 23 Mar 2026 06:11:46 +0000
Subject: [PATCH 3/6] Removing grammar dataset from config.md

Signed-off-by: Tanisha Chawada
---
 .../sft_single_device_custom_dataset_config.yaml |  1 +
 docs/source/config.md                            | 15 +--------------
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
index 8efc196dd..630790661 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+# Dataset: Style-Remix (hallisky/DiSC)
 # Model configuration
 model:
   model_type: "hf" # Hugging Face model
diff --git a/docs/source/config.md b/docs/source/config.md
index 170c2fc42..702fe90ca 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -146,22 +146,9 @@ dataset:
   completion_template: "{answer}"
 ```
-
 ***
-#### **4. grammar (grammar_dataset)**
-
-```yaml
-dataset:
-  tokenizer_name: "meta-llama/Llama-3.2-1B"
-  dataset_type: "sft_dataset"
-  dataset_name: "grammar"
-  train_split: "train"
-  split_ratio: 0.8
-  prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n"
-  completion_template: "{target}"
-```
-#### **5. Style-Remix (hallisky/DiSC)**
+#### **4. Style-Remix (hallisky/DiSC)**
 
 ```yaml
 dataset:
   dataset_type: "sft_dataset"
   dataset_name: "hallisky/DiSC"
   prompt_template: "### Original:{original} \n ### Rewrite:\n"
   completion_template: "{generation}"
   dataset_disc_style: "sarcasm_more"
 
 ```
 ***

From 644c18fbeff0cf2ee151084536ebba287863c0ea Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 23 Mar 2026 09:06:36 +0000
Subject: [PATCH 4/6] Corrected prompt_func

Signed-off-by: Tanisha Chawada
---
 docs/source/config.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/config.md b/docs/source/config.md
index 702fe90ca..88f36baf3 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -90,7 +90,7 @@ dataset:
   train_split: "train"
   test_split: "test"
   max_seq_length: 512
-  prompt_func: "preprocess/alpaca_func:create_alpaca_prompt"
+  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"
   completion_template: "{output}"
 ```
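
The corrected `prompt_func` value is an importable "module:function" path. A hypothetical helper showing how such a spec can be resolved at runtime (the repo's actual loader may differ):

```python
# Resolves "package.module:function" specs such as
# "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt".
import importlib

def resolve_func(spec: str):
    module_path, _, func_name = spec.partition(":")
    module = importlib.import_module(module_path)
    return getattr(module, func_name)

# e.g. prompt_func = resolve_func(dataset_config["prompt_func"])
```
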
From 65f937a2eb49302275a1778b4779bf0cf9cd899f Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Tue, 24 Mar 2026 06:03:36 +0000
Subject: [PATCH 5/6] Adding seed to dataset

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/core/dataset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 9954ef5ae..e607ef2b9 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -89,6 +89,7 @@ def __init__(
         **kwargs,
     ):
         self.split_ratio = split_ratio
+        self.seed = seed
         self.json_file_path = kwargs.get("json_file_path", None)
         self.prompt_template = kwargs.get("prompt_template", None)
         self.completion_template = kwargs.get("completion_template", None)
@@ -128,6 +129,7 @@ def _initialize_dataset(self):
             # Load dataset from JSON file
             validate_json_structure(self.json_file_path)
             self.dataset = load_dataset("json", data_files=self.json_file_path, split="train")
+            self.dataset = self.dataset.shuffle(seed=self.seed)
             # Apply train/test split if needed
             if self.split in ["train", "test"]:
                 self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
@@ -150,6 +152,7 @@ def _initialize_dataset(self):
                 load_split = "train"
             # FIXME: Add streaming support for larger datasets.
             self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
+            self.dataset = self.dataset.shuffle(seed=self.seed)
             if self.dataset_disc_style:
                 available_styles = set(self.dataset["category"])
                 if self.dataset_disc_style not in available_styles:
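
Not part of the patch, but a quick illustration of why the shuffle above takes an explicit seed: with the same seed, `datasets.Dataset.shuffle` produces the same order on every run, so train/test membership stays reproducible.

```python
from datasets import Dataset

data = Dataset.from_dict({"x": list(range(10))})

run_a = data.shuffle(seed=42)["x"]
run_b = data.shuffle(seed=42)["x"]
assert run_a == run_b  # identical order for an identical seed
```
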
From 45b4ec3b12bef857a010503d37a6de15aa89e736 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Wed, 25 Mar 2026 08:36:44 +0000
Subject: [PATCH 6/6] Added seed for dataset

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py              |  2 +-
 .../finetune/experimental/configs/sft_ddp_config.yaml  |  1 +
 .../configs/sft_single_device_alpaca_config.yaml       |  1 +
 .../sft_single_device_custom_dataset_config.yaml       |  1 +
 .../configs/sft_single_device_gsm8k_config.yaml        |  1 +
 .../finetune/experimental/core/config_manager.py       |  1 +
 .../finetune/experimental/tests/test_dataset.py        | 11 +++++++----
 .../finetune/experimental/tests/test_finetune.py       |  1 +
 8 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 08ea8f5e5..43fcde5f8 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]:
         dataset_name = dataset_config.get("dataset_name")
         train_split = dataset_config.get("train_split", "train")
         test_split = dataset_config.get("test_split", "test")
-        seed = self.config.training["seed"]
+        seed = dataset_config.get("data_seed", 42)
 
         # Create a copy of dataset_config excluding keys that are passed explicitly
         # to avoid duplicate keyword arguments when unpacking
diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
index f7a0f6b1a..a426dd614 100644
--- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -25,6 +25,7 @@ dataset:
   prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
   completion_template: "{answer}" # Model will be trained on this part.
   config_name: "main" # Config name for the dataset
+  data_seed: 42 # Random seed for dataset shuffling
 
 
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
index dfc5bd09c..2bdf800bc 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
@@ -24,6 +24,7 @@ dataset:
   dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
   prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields
   completion_template: "{output}" # Model will be trained on this part.
+  data_seed: 42 # Random seed for dataset shuffling
 
 
 # Training configuration
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
index 630790661..fbdcc88d6 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -26,6 +26,7 @@ dataset:
   prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
   completion_template: "{generation}" # Model will be trained on this part.
   dataset_disc_style: "sarcasm_more" # Style of dataset to use
+  data_seed: 42 # Random seed for dataset shuffling
 
 # Training configuration
 training:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
index f8627f6da..9391fb0bd 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
@@ -25,6 +25,7 @@ dataset:
   prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
   completion_template: "{answer}" # Model will be trained on this part.
   config_name: "main" # Config name for the dataset
+  data_seed: 42 # Random seed for dataset shuffling
 
 
 # Training configuration
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index a2317ca91..10b61c795 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -188,6 +188,7 @@ class DatasetConfig:
         metadata={"help": "Name of the hf configuration file."},
     )
     json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."})
+    data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."})
 
 
 @dataclass
diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py
index d6dc5729c..81d37db90 100644
--- a/QEfficient/finetune/experimental/tests/test_dataset.py
+++ b/QEfficient/finetune/experimental/tests/test_dataset.py
@@ -44,7 +44,9 @@ def setUp(self):
             {"question": "What is AI?", "answer": "Artificial Intelligence"},
             {"question": "What is ML?", "answer": "Machine Learning"},
             {"question": "What is DL?", "answer": "Deep Learning"},
+            {"question": "What is LLM?", "answer": "Large Language Model"},
             {"question": "What is NLP?", "answer": "Natural Language Processing"},
+            {"question": "What is VLM?", "answer": "Vision Language Model"},
             {"question": "", "answer": "Empty question"},  # Empty question
             {"question": "Valid question", "answer": ""},  # Empty answer
             {"question": None, "answer": "None question"},  # None question
@@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder,
         def create_mock_dataset():
             mock_dataset = MagicMock()
             mock_dataset.column_names = ["text", "label"]
+            mock_dataset.shuffle.return_value = mock_dataset
             mock_dataset.num_rows = 3
 
             # Mock __getitem__ to return processed samples
@@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self):
         )
 
         # When filtering is disabled and split="train" is used, it still applies train/test split
-        # So we get ~80% of 8 samples = ~6 samples
+        # So we get ~80% of 10 samples = ~8 samples
         self.assertGreater(len(dataset), 0)
         self.assertLessEqual(len(dataset), 8)
@@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self):
             seed=SEED,
         )
 
-        # After filtering, we have 4 valid samples
-        # With split ratio, train should have ~3 samples, test should have ~1 sample
+        # After filtering, we have 6 valid samples
+        # With split ratio, train should have ~4 samples, test should have ~2 samples
         self.assertGreater(len(train_dataset), 0)
         self.assertGreater(len(test_dataset), 0)
         # Total should equal the filtered dataset size
-        self.assertEqual(len(train_dataset) + len(test_dataset), 4)
+        self.assertEqual(len(train_dataset) + len(test_dataset), 6)
 
     def test_sft_dataset_with_custom_prompt_function(self):
         """Test loading with custom prompt function."""
diff --git a/QEfficient/finetune/experimental/tests/test_finetune.py b/QEfficient/finetune/experimental/tests/test_finetune.py
index 8e3ead3e9..9eb857be7 100644
--- a/QEfficient/finetune/experimental/tests/test_finetune.py
+++ b/QEfficient/finetune/experimental/tests/test_finetune.py
@@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned(
             "dataset_name": "test_dataset",
             "train_split": train_split,
             "test_split": test_split,
+            "data_seed": 42,
         }
 
         train_ds = MagicMock(name="train_ds")
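
A simplified, standalone sketch of the `data_seed` plumbing this final patch introduces: the seed is now read from the dataset section of the config, falling back to 42, instead of reusing the training seed. Assumes PyYAML is available for parsing.

```python
import yaml

CONFIG = """
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "hallisky/DiSC"
  data_seed: 42
"""

dataset_config = yaml.safe_load(CONFIG)["dataset"]
seed = dataset_config.get("data_seed", 42)  # mirrors _create_datasets() above
print(seed)  # -> 42
```
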