diff --git a/QEfficient/cloud/finetune_experimental.py b/QEfficient/cloud/finetune_experimental.py index 08ea8f5e5..43fcde5f8 100644 --- a/QEfficient/cloud/finetune_experimental.py +++ b/QEfficient/cloud/finetune_experimental.py @@ -115,7 +115,7 @@ def _create_datasets(self) -> Tuple[Any, Any]: dataset_name = dataset_config.get("dataset_name") train_split = dataset_config.get("train_split", "train") test_split = dataset_config.get("test_split", "test") - seed = self.config.training["seed"] + seed = dataset_config.get("data_seed", 42) # Create a copy of dataset_config excluding keys that are passed explicitly # to avoid duplicate keyword arguments when unpacking diff --git a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml index f7a0f6b1a..a426dd614 100644 --- a/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_ddp_config.yaml @@ -25,6 +25,7 @@ dataset: prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields completion_template: "{answer}" # Model will be trained on this part. 
config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml index dfc5bd09c..2bdf800bc 100644 --- a/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_single_device_alpaca_config.yaml @@ -24,6 +24,7 @@ dataset: dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # function to create prompt from dataset fields completion_template: "{output}" # Model will be trained on this part. + data_seed: 42 # Random seed for dataset shuffling # Training configuration diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml new file mode 100644 index 000000000..fbdcc88d6 --- /dev/null +++ b/QEfficient/finetune/experimental/configs/sft_single_device_custom_dataset_config.yaml @@ -0,0 +1,50 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +# Dataset: Style-Remix (hallisky/DiSC) +# Model configuration +model: + model_type: "hf" # Hugging Face model + auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with + model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name + use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning) + peft_config: + lora_r: 8 + lora_alpha: 16 + lora_dropout: 0 + target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA + task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc. + peft_type: "LORA" # Options: LORA, IA3, etc. + +# Dataset configuration +dataset: + dataset_type: "sft_dataset" + dataset_name: "hallisky/DiSC" # Dataset name from Hugging Face Hub + prompt_template: "### Original:{original} \n ### Rewrite:\n" # Template to create prompt from dataset fields + completion_template: "{generation}" # Model will be trained on this part. 
+ dataset_disc_style: "sarcasm_more" # Style of dataset to use + data_seed: 42 # Random seed for dataset shuffling + +# Training configuration +training: + type: "sft" + gradient_accumulation_steps: 1 # Number of steps to accumulate gradients + per_device_train_batch_size: 1 # Batch size per device during training + num_train_epochs: 1 + torch_compile: False # Whether to use torch.compile + +# Optimizer configuration +optimizers: + optimizer_name: "adamw" + lr: 2e-4 + +scheduler: + scheduler_name: "cosine" + +callbacks: + early_stopping: + early_stopping_patience: 3 # Number of epochs to wait before stopping training + early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement diff --git a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml index f8627f6da..9391fb0bd 100644 --- a/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml +++ b/QEfficient/finetune/experimental/configs/sft_single_device_gsm8k_config.yaml @@ -25,6 +25,7 @@ dataset: prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n" # Template to create prompt from dataset fields completion_template: "{answer}" # Model will be trained on this part. 
config_name: "main" # Config name for the dataset + data_seed: 42 # Random seed for dataset shuffling # Training configuration diff --git a/QEfficient/finetune/experimental/core/config_manager.py b/QEfficient/finetune/experimental/core/config_manager.py index a3e0a3cd2..10b61c795 100644 --- a/QEfficient/finetune/experimental/core/config_manager.py +++ b/QEfficient/finetune/experimental/core/config_manager.py @@ -144,9 +144,13 @@ class DatasetConfig: metadata={"help": "Function for formatting output completions (e.g., '{output}')."}, ) collate_fn: str = field( - default="dynamic_padding", + default=None, metadata={"help": "The collation function to use (e.g., 'dynamic_padding')."}, ) + dataset_disc_style: str = field( + default=None, + metadata={"help": "Style of dataset"}, + ) group_by_length: bool = field( default=True, metadata={"help": "Whether to group samples by length to minimize padding."}, @@ -184,6 +188,7 @@ class DatasetConfig: metadata={"help": "Name of the hf configuration file."}, ) json_file_path: str = field(default=None, metadata={"help": "Path to a JSON file containing data."}) + data_seed: int = field(default=42, metadata={"help": "Seed for data shuffling and sampling."}) @dataclass diff --git a/QEfficient/finetune/experimental/core/dataset.py b/QEfficient/finetune/experimental/core/dataset.py index 22594cb81..e607ef2b9 100644 --- a/QEfficient/finetune/experimental/core/dataset.py +++ b/QEfficient/finetune/experimental/core/dataset.py @@ -89,6 +89,7 @@ def __init__( **kwargs, ): self.split_ratio = split_ratio + self.seed = seed self.json_file_path = kwargs.get("json_file_path", None) self.prompt_template = kwargs.get("prompt_template", None) self.completion_template = kwargs.get("completion_template", None) @@ -96,6 +97,7 @@ def __init__( self.completion_func_path = kwargs.get("completion_func", None) self.remove_samples_with_empty_columns = kwargs.get("remove_samples_with_empty_columns", True) self.config_name = kwargs.get("config_name", None) + 
self.dataset_disc_style = kwargs.get("dataset_disc_style", None) if self.json_file_path not in (None, ""): if not os.path.isfile(self.json_file_path): @@ -127,6 +129,7 @@ def _initialize_dataset(self): # Load dataset from JSON file validate_json_structure(self.json_file_path) self.dataset = load_dataset("json", data_files=self.json_file_path, split="train") + self.dataset = self.dataset.shuffle(seed=self.seed) # Apply train/test split if needed if self.split in ["train", "test"]: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) @@ -149,6 +152,14 @@ def _initialize_dataset(self): load_split = "train" # FIXME: Add streaming support for larger datasets. self.dataset = load_dataset(self.dataset_name, split=load_split, **load_kwargs) + self.dataset = self.dataset.shuffle(seed=self.seed) + if self.dataset_disc_style: + available_styles = set(self.dataset["category"]) + if self.dataset_disc_style not in available_styles: + raise RuntimeError( + f"For DiSC dataset the provided disc_style '{self.dataset_disc_style}' is not supported." 
+ ) + self.dataset = self.dataset.filter(lambda example: example["category"] == self.dataset_disc_style) if len(available_splits) == 1: self.dataset = apply_train_test_split(self.dataset, self.split_ratio, self.split, self.seed) diff --git a/QEfficient/finetune/experimental/tests/test_dataset.py b/QEfficient/finetune/experimental/tests/test_dataset.py index d6dc5729c..81d37db90 100644 --- a/QEfficient/finetune/experimental/tests/test_dataset.py +++ b/QEfficient/finetune/experimental/tests/test_dataset.py @@ -44,7 +44,9 @@ def setUp(self): {"question": "What is AI?", "answer": "Artificial Intelligence"}, {"question": "What is ML?", "answer": "Machine Learning"}, {"question": "What is DL?", "answer": "Deep Learning"}, + {"question": "What is LLM?", "answer": "Large Language Model"}, {"question": "What is NLP?", "answer": "Natural Language Processing"}, + {"question": "What is VLM?", "answer": "Vision Language Model"}, {"question": "", "answer": "Empty question"}, # Empty question {"question": "Valid question", "answer": ""}, # Empty answer {"question": None, "answer": "None question"}, # None question @@ -78,6 +80,7 @@ def test_sft_dataset_with_huggingface_dataset_and_templates(self, mock_builder, def create_mock_dataset(): mock_dataset = MagicMock() mock_dataset.column_names = ["text", "label"] + mock_dataset.shuffle.return_value = mock_dataset mock_dataset.num_rows = 3 # Mock __getitem__ to return processed samples @@ -177,7 +180,7 @@ def test_sft_dataset_json_file_without_filtering(self): ) # When filtering is disabled and split="train" is used, it still applies train/test split - # So we get ~80% of 8 samples = ~6 samples + # So we get ~80% of 10 samples = ~8 samples self.assertGreater(len(dataset), 0) self.assertLessEqual(len(dataset), 8) @@ -203,12 +206,12 @@ def test_sft_dataset_train_test_split_from_json(self): seed=SEED, ) - # After filtering, we have 4 valid samples - # With split ratio, train should have ~3 samples, test should have ~1 sample + # After 
+ filtering, we have 6 valid samples + # With split ratio, train should have ~4 samples, test should have ~2 samples self.assertGreater(len(train_dataset), 0) self.assertGreater(len(test_dataset), 0) # Total should equal the filtered dataset size - self.assertEqual(len(train_dataset) + len(test_dataset), 4) + self.assertEqual(len(train_dataset) + len(test_dataset), 6) def test_sft_dataset_with_custom_prompt_function(self): """Test loading with custom prompt function.""" diff --git a/QEfficient/finetune/experimental/tests/test_finetune.py b/QEfficient/finetune/experimental/tests/test_finetune.py index 8e3ead3e9..9eb857be7 100644 --- a/QEfficient/finetune/experimental/tests/test_finetune.py +++ b/QEfficient/finetune/experimental/tests/test_finetune.py @@ -226,6 +226,7 @@ def test_create_datasets_called_and_assigned( "dataset_name": "test_dataset", "train_split": train_split, "test_split": test_split, + "data_seed": 42, } train_ds = MagicMock(name="train_ds") diff --git a/docs/source/config.md b/docs/source/config.md index 7b5be6d0c..88f36baf3 100644 --- a/docs/source/config.md +++ b/docs/source/config.md @@ -63,6 +63,8 @@ If provided, this takes precedence over dataset_name. * **train_batch_size**: `default = 1` → Per-device batch size during training. * **eval_batch_size**: `default = 1` → Per-device batch size during evaluation. * **collate_fn**: `default = "dynamic_padding"` → Collation function used to build batches (e.g., dynamic padding to match the longest sequence in the batch). +* **dataset_disc_style**: `default = None` → Selects the style remix category to apply to the dataset during preprocessing; when None, no style remixing is applied and the original dataset style is preserved. + * **group_by_length**: `default = true` → Whether to group samples of similar lengths together for efficient batching. * **length_column_name**: `default = "input_ids"` → Column name used to determine sequence length for grouping (commonly the token IDs field). 
* **num_workers**: `default = 4` → Number of subprocesses to use for data loading. @@ -88,7 +90,7 @@ dataset: train_split: "train" test_split: "test" max_seq_length: 512 - prompt_func: "preprocess/alpaca_func:create_alpaca_prompt" + prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" completion_template: "{output}" ``` @@ -144,21 +146,19 @@ dataset: completion_template: "{answer}" ``` - *** -#### **4. grammar (grammar_dataset)** + +#### **4. Style-Remix (hallisky/DiSC)** ```yaml dataset: - tokenizer_name: "meta-llama/Llama-3.2-1B" dataset_type: "sft_dataset" - dataset_name: "grammar" - train_split: "train" - split_ratio: 0.8 - prompt_template: f"Correct the grammar in the following sentence:\n\n{'input'}\n\nCorrected:\n" - completion_template: "{target}" -``` + dataset_name: "hallisky/DiSC" + prompt_template: "### Original:{original} \n ### Rewrite:\n" + completion_template: "{generation}" + dataset_disc_style: "sarcasm_more" +``` *** ## 3. Training Configuration