From a4ee682776a3c778ace8669be1cc39efaf1d0a95 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:02:26 +0000
Subject: [PATCH 1/6] Adding style remix dataset config

Signed-off-by: Tanisha Chawada
---
 ...t_single_device_custom_dataset_config.yaml | 48 +++++++++++++++++++
 .../experimental/core/config_manager.py       |  6 ++-
 .../finetune/experimental/core/dataset.py     |  8 ++++
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml

diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
new file mode 100644
index 000000000..8efc196dd
--- /dev/null
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -0,0 +1,48 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# Model configuration
+model:
+  model_type: "hf" # Hugging Face model
+  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
+  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
+  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
+  peft_config:
+    lora_r: 8
+    lora_alpha: 16
+    lora_dropout: 0
+    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
+    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
+    peft_type: "LORA" # Options: LORA, IA3, etc.
+
+# Dataset configuration
+dataset:
+  dataset_type: "sft_dataset"
+  dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub
+  prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
+  completion_template: "{generation}" # Model will be trained on this part.
+  dataset_disc_style: "sarcasm_more" # Style of dataset to use
+
+# Training configuration
+training:
+  type: "sft"
+  gradient_accumulation_steps: 1 # Number of steps to accumulate gradients
+  per_device_train_batch_size: 1 # Batch size per device during training
+  num_train_epochs: 1
+  torch_compile: False # Whether to use torch.compile
+
+# Optimizer configuration
+optimizers:
+  optimizer_name: "adamw"
+  lr: 2e-4
+
+scheduler:
+  scheduler_name: "cosine"
+
+callbacks:
+  early_stopping:
+    early_stopping_patience: 3 # Number of epochs to wait before stopping training
+    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index a3e0a3cd2..a2317ca91 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -144,9 +144,13 @@ class DatasetConfig:
         metadata={"help": "Function for formatting output completions (e.g., '{output}')."},
     )
     collate_fn: str = field(
-        default="dynamic_padding",
+        default=None,
         metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."},
     )
+    dataset_disc_style: str = field(
+        default=None,
+        metadata={"help": "DiSC style category used to filter the dataset (e.g., 'sarcasm_more')."},
+    )
     group_by_length: bool = field(
         default=True,
         metadata={"help": "Whether to group samples by length to minimize padding."},
diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 22594cb81..9954ef5ae 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -96,6 +96,7 @@ def __init__(
         self.completion_func_path = kwargs.get("completion_func", None)
         self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True)
         self.config_name = kwargs.get("config_name", None)
+        self.dataset_disc_style = kwargs.get("dataset_disc_style", None)
 
         if self.json_file_path not in (None, ""):
             if not os.path.isfile(self.json_file_path):
@@ -149,6 +150,13 @@ def _initialize_dataset(self):
                 load_split = "train"
             # FIXME: Add streaming support for larger datasets.
             self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
+            if self.dataset_disc_style:
+                available_styles = set(self.dataset["category"])
+                if self.dataset_disc_style not in available_styles:
+                    raise RuntimeError(
+                        f"For the DiSC dataset, the provided dataset_disc_style '{self.dataset_disc_style}' is not supported. Available styles: {sorted(available_styles)}."
+                    )
+                self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style)
 
         if len(available_splits) == 1:
             self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
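
A minimal, standalone sketch (not part of the patch) of what the new `dataset_disc_style` option does, assuming the Hugging Face `datasets` package and the `original`, `generation`, and `category` columns that the config and the filter above imply for `hallisky/DiSC`:

```python
# Illustrative only: mirrors the style filtering added in this patch.
from datasets import load_dataset

dataset_disc_style = "sarcasm_more"

dataset = load_dataset("hallisky/DiSC", split="train")
available_styles = set(dataset["category"])
if dataset_disc_style not in available_styles:
    raise RuntimeError(f"Style '{dataset_disc_style}' not in {sorted(available_styles)}.")
# Keep only the rows belonging to the requested style category.
dataset = dataset.filter(lambda example: example["category"] == dataset_disc_style)

# Render one training sample with the templates from the config above.
sample = dataset[0]
prompt = "### Original:{original} \n ### Rewrite:\n".format(original=sample["original"])
completion = "{generation}".format(generation=sample["generation"])
print(prompt + completion)
```
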
From 6f66c42cfd9696ba162e552f03a1e7c80ec6ef02 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 16 Mar 2026 11:34:23 +0000
Subject: [PATCH 2/6] Added documentation for style-remix

Signed-off-by: Tanisha Chawada
---
 docs/source/config.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/config.md b/docs/source/config.md
index 7b5be6d0c..170c2fc42 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name.
 * **train_batch_size**: `default = 1` → Per-device batch size during training.
 * **eval_batch_size**: `default = 1` → Per-device batch size during evaluation.
 * **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch).
+* **dataset_disc_style**: `default = None` → DiSC style category to keep when filtering the dataset during preprocessing (e.g., "sarcasm_more"); when None, no filtering is applied and samples from all styles are kept.
+
 * **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching.
 * **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field).
 * **num_workers**: `default = 4` → Number of subprocesses to use for data loading.
@@ -159,6 +161,17 @@ dataset:
   completion_template: "{target}"
 ```
 
+#### **5. Style-Remix (hallisky/DiSC)**
+
+```yaml
+dataset:
+  dataset_type: "sft_dataset"
+  dataset_name: "hallisky/DiSC"
+  prompt_template: "### Original:{original} \n ### Rewrite:\n"
+  completion_template: "{generation}"
+  dataset_disc_style: "sarcasm_more"
+
+```
 ***
 ## 3. Training Configuration
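
The `collate_fn` bullet above describes dynamic padding. A sketch of what such a collator looks like, assuming token-ID features and a placeholder pad id (this is not the repo's implementation):

```python
# Illustrative dynamic-padding collator: pads each batch to its own longest
# sequence instead of a global maximum length. PAD_ID is an assumption; in
# practice use the tokenizer's pad_token_id.
import torch

PAD_ID = 0

def dynamic_padding_collate(batch):
    # batch: list of dicts, each with a variable-length "input_ids" list
    max_len = max(len(example["input_ids"]) for example in batch)
    input_ids, attention_mask = [], []
    for example in batch:
        ids = example["input_ids"]
        padding = [PAD_ID] * (max_len - len(ids))
        input_ids.append(ids + padding)
        attention_mask.append([1] * len(ids) + [0] * len(padding))
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
    }
```
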
From ed42e52d8333eca1c4b6c9f802bb6dbe948f69ca Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 23 Mar 2026 06:11:46 +0000
Subject: [PATCH 3/6] Removing grammar dataset from config.md

Signed-off-by: Tanisha Chawada
---
 .../sft_single_device_custom_dataset_config.yaml |  1 +
 docs/source/config.md                            | 15 +--------------
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
index 8efc196dd..630790661 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+# Dataset: Style-Remix (hallisky/DiSC)
 # Model configuration
 model:
   model_type: "hf" # Hugging Face model
diff --git a/docs/source/config.md b/docs/source/config.md
index 170c2fc42..702fe90ca 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -146,22 +146,9 @@ dataset:
   completion_template: "{answer}"
 ```
-
 ***
-#### **4. grammar (grammar_dataset)**
-
-```yaml
-dataset:
-  tokenizer_name: "meta-llama/Llama-3.2-1B"
-  dataset_type: "sft_dataset"
-  dataset_name: "grammar"
-  train_split: "train"
-  split_ratio: 0.8
-  prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n"
-  completion_template: "{target}"
-```
-#### **5. Style-Remix (hallisky/DiSC)**
+#### **4. Style-Remix (hallisky/DiSC)**
 
 ```yaml
 dataset:
   dataset_type: "sft_dataset"
   dataset_name: "hallisky/DiSC"
   prompt_template: "### Original:{original} \n ### Rewrite:\n"
   completion_template: "{generation}"
   dataset_disc_style: "sarcasm_more"
 
 ```
 ***

From 644c18fbeff0cf2ee151084536ebba287863c0ea Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Mon, 23 Mar 2026 09:06:36 +0000
Subject: [PATCH 4/6] Corrected prompt_func

Signed-off-by: Tanisha Chawada
---
 docs/source/config.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/config.md b/docs/source/config.md
index 702fe90ca..88f36baf3 100644
--- a/docs/source/config.md
+++ b/docs/source/config.md
@@ -90,7 +90,7 @@ dataset:
   train_split: "train"
   test_split: "test"
   max_seq_length: 512
-  prompt_func: "preprocess/alpaca_func:create_alpaca_prompt"
+  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"
   completion_template: "{output}"
 ```
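
The corrected `prompt_func` value is an importable "module:function" path. A hypothetical helper showing how such a spec can be resolved at runtime (the repo's actual loader may differ):

```python
# Resolves "package.module:function" specs such as
# "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt".
import importlib

def resolve_func(spec: str):
    module_path, _, func_name = spec.partition(":")
    module = importlib.import_module(module_path)
    return getattr(module, func_name)

# e.g. prompt_func = resolve_func(dataset_config["prompt_func"])
```
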
From 65f937a2eb49302275a1778b4779bf0cf9cd899f Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Tue, 24 Mar 2026 06:03:36 +0000
Subject: [PATCH 5/6] Adding seed to dataset

Signed-off-by: Tanisha Chawada
---
 QEfficient/finetune/experimental/core/dataset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py
index 9954ef5ae..e607ef2b9 100644
--- a/QEfficient/finetune/experimental/core/dataset.py
+++ b/QEfficient/finetune/experimental/core/dataset.py
@@ -89,6 +89,7 @@ def __init__(
         **kwargs,
     ):
         self.split_ratio = split_ratio
+        self.seed = seed
         self.json_file_path = kwargs.get("json_file_path", None)
         self.prompt_template = kwargs.get("prompt_template", None)
         self.completion_template = kwargs.get("completion_template", None)
@@ -128,6 +129,7 @@ def _initialize_dataset(self):
             # Load dataset from JSON file
             validate_json_structure(self.json_file_path)
             self.dataset = load_dataset("json", data_files=self.json_file_path, split="train")
+            self.dataset = self.dataset.shuffle(seed=self.seed)
             # Apply train/test split if needed
             if self.split in ["train", "test"]:
                 self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed)
@@ -150,6 +152,7 @@ def _initialize_dataset(self):
                 load_split = "train"
             # FIXME: Add streaming support for larger datasets.
             self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs)
+            self.dataset = self.dataset.shuffle(seed=self.seed)
             if self.dataset_disc_style:
                 available_styles = set(self.dataset["category"])
                 if self.dataset_disc_style not in available_styles:
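
Not part of the patch, but a quick illustration of why the shuffle above takes an explicit seed: with the same seed, `datasets.Dataset.shuffle` produces the same order on every run, so train/test membership stays reproducible.

```python
from datasets import Dataset

data = Dataset.from_dict({"x": list(range(10))})

run_a = data.shuffle(seed=42)["x"]
run_b = data.shuffle(seed=42)["x"]
assert run_a == run_b  # identical order for an identical seed
```
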
From 45b4ec3b12bef857a010503d37a6de15aa89e736 Mon Sep 17 00:00:00 2001
From: Tanisha Chawada
Date: Wed, 25 Mar 2026 08:36:44 +0000
Subject: [PATCH 6/6] Added seed for dataset

Signed-off-by: Tanisha Chawada
---
 QEfficient/cloud/finetune_experimental.py              |  2 +-
 .../finetune/experimental/configs/sft_ddp_config.yaml  |  1 +
 .../configs/sft_single_device_alpaca_config.yaml       |  1 +
 .../sft_single_device_custom_dataset_config.yaml       |  1 +
 .../configs/sft_single_device_gsm8k_config.yaml        |  1 +
 .../finetune/experimental/core/config_manager.py       |  1 +
 .../finetune/experimental/tests/test_dataset.py        | 11 +++++++----
 .../finetune/experimental/tests/test_finetune.py       |  1 +
 8 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py
index 08ea8f5e5..43fcde5f8 100644
--- a/QEfficient/cloud/finetune_experimental.py
+++ b/QEfficient/cloud/finetune_experimental.py
@@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]:
         dataset_name = dataset_config.get("dataset_name")
         train_split = dataset_config.get("train_split", "train")
         test_split = dataset_config.get("test_split", "test")
-        seed = self.config.training["seed"]
+        seed = dataset_config.get("data_seed", 42)
 
         # Create a copy of dataset_config excluding keys that are passed explicitly
         # to avoid duplicate keyword arguments when unpacking
diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
index f7a0f6b1a..a426dd614 100644
--- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -25,6 +25,7 @@ dataset:
   prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
   completion_template: "{answer}" # Model will be trained on this part.
   config_name: "main" # Config name for the dataset
+  data_seed: 42 # Random seed for dataset shuffling
 
 
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
index dfc5bd09c..2bdf800bc 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml
@@ -24,6 +24,7 @@ dataset:
   dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
   prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields
   completion_template: "{output}" # Model will be trained on this part.
+  data_seed: 42 # Random seed for dataset shuffling
 
 
 # Training configuration
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
index 630790661..fbdcc88d6 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml
@@ -26,6 +26,7 @@ dataset:
   prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields
   completion_template: "{generation}" # Model will be trained on this part.
   dataset_disc_style: "sarcasm_more" # Style of dataset to use
+  data_seed: 42 # Random seed for dataset shuffling
 
 # Training configuration
 training:
diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
index f8627f6da..9391fb0bd 100644
--- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
+++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml
@@ -25,6 +25,7 @@ dataset:
   prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields
   completion_template: "{answer}" # Model will be trained on this part.
   config_name: "main" # Config name for the dataset
+  data_seed: 42 # Random seed for dataset shuffling
 
 
 # Training configuration
diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py
index a2317ca91..10b61c795 100644
--- a/QEfficient/finetune/experimental/core/config_manager.py
+++ b/QEfficient/finetune/experimental/core/config_manager.py
@@ -188,6 +188,7 @@ class DatasetConfig:
         metadata={"help": "Name of the hf configuration file."},
     )
     json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."})
+    data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."})
 
 
 @dataclass
diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py
index d6dc5729c..81d37db90 100644
--- a/QEfficient/finetune/experimental/tests/test_dataset.py
+++ b/QEfficient/finetune/experimental/tests/test_dataset.py
@@ -44,7 +44,9 @@ def setUp(self):
             {"question": "What is AI?", "answer": "Artificial Intelligence"},
             {"question": "What is ML?", "answer": "Machine Learning"},
             {"question": "What is DL?", "answer": "Deep Learning"},
+            {"question": "What is LLM?", "answer": "Large Language Model"},
             {"question": "What is NLP?", "answer": "Natural Language Processing"},
+            {"question": "What is VLM?", "answer": "Vision Language Model"},
             {"question": "", "answer": "Empty question"},  # Empty question
             {"question": "Valid question", "answer": ""},  # Empty answer
             {"question": None, "answer": "None question"},  # None question
@@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder,
         def create_mock_dataset():
             mock_dataset = MagicMock()
             mock_dataset.column_names = ["text", "label"]
+            mock_dataset.shuffle.return_value = mock_dataset
             mock_dataset.num_rows = 3
 
             # Mock __getitem__ to return processed samples
@@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self):
         )
 
         # When filtering is disabled and split="train" is used, it still applies train/test split
-        # So we get ~80% of 8 samples = ~6 samples
+        # So we get ~80% of 10 samples = ~8 samples
         self.assertGreater(len(dataset), 0)
         self.assertLessEqual(len(dataset), 8)
@@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self):
             seed=SEED,
         )
 
-        # After filtering, we have 4 valid samples
-        # With split ratio, train should have ~3 samples, test should have ~1 sample
+        # After filtering, we have 6 valid samples
+        # With split ratio, train should have ~4 samples, test should have ~2 samples
         self.assertGreater(len(train_dataset), 0)
         self.assertGreater(len(test_dataset), 0)
         # Total should equal the filtered dataset size
-        self.assertEqual(len(train_dataset) + len(test_dataset), 4)
+        self.assertEqual(len(train_dataset) + len(test_dataset), 6)
 
     def test_sft_dataset_with_custom_prompt_function(self):
         """Test loading with custom prompt function."""
diff --git a/QEfficient/finetune/experimental/tests/test_finetune.py b/QEfficient/finetune/experimental/tests/test_finetune.py
index 8e3ead3e9..9eb857be7 100644
--- a/QEfficient/finetune/experimental/tests/test_finetune.py
+++ b/QEfficient/finetune/experimental/tests/test_finetune.py
@@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned(
             "dataset_name": "test_dataset",
             "train_split": train_split,
             "test_split": test_split,
+            "data_seed": 42,
         }
 
         train_ds = MagicMock(name="train_ds")
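
A simplified, standalone sketch of the `data_seed` plumbing this final patch introduces: the seed is now read from the dataset section of the config, falling back to 42, instead of reusing the training seed. Assumes PyYAML is available for parsing.

```python
import yaml

CONFIG = """
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "hallisky/DiSC"
  data_seed: 42
"""

dataset_config = yaml.safe_load(CONFIG)["dataset"]
seed = dataset_config.get("data_seed", 42)  # mirrors _create_datasets() above
print(seed)  # -> 42
```
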