Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f44e5d0
[QEff. Finetune]: Added logger and its test cases. (#644) (#868)
smedhe Mar 18, 2026
afdab67
[QEff. Finetune_experimental] cherrypicking pr (#870)
smedhe Mar 18, 2026
888fbb1
[QEff. finetune_experimnetal] cherry picking PR-787, 791,813,795 (#872)
smedhe Mar 25, 2026
14260f9
Formatted docs
Mar 25, 2026
3b6558f
[QEff.finetune] Adding style remix dataset config (#858)
tchawada Mar 26, 2026
2d6d60b
Commented unit test from cloud module
Mar 26, 2026
6887919
Added exception handling for dataset loading
Mar 27, 2026
fbd1f64
Format
Mar 27, 2026
c03e455
[QEff. Finetuning]: Tests for Pipeline Parallelism and updated docume…
quic-swatia Mar 26, 2026
5d5f0ce
Fixed repolinter error
Mar 27, 2026
53d6855
Corrected file paths
Mar 30, 2026
fb3fb86
Updates
Mar 30, 2026
6241bdd
Added Trainer arguments reference
Mar 30, 2026
d782717
Addressed some Qgenie reviews and fixed code
Mar 30, 2026
a0f0e80
Added security checks for import_func in dataset.py
Mar 30, 2026
a3fefcf
Adding reference data test for finetune (#897)
tchawada Mar 31, 2026
e9e7a7f
[QEff. Finetuning]: Updating PP documentation (#899)
quic-swatia Mar 31, 2026
ee92e08
Initial commit: Adding TP+DDP support in hf trainer stack
smedhe Mar 31, 2026
7c9965d
adding back the AOT stack imports
smedhe Apr 6, 2026
96b71de
adding readme and config files for tp+ddp
smedhe Apr 6, 2026
0e02eea
adding test cases for tp and ddp, along with other fixes
smedhe Apr 16, 2026
6a73c0f
Adding local world size checks
smedhe Apr 16, 2026
79541aa
adding tp+ddp related config changes
smedhe Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
401 changes: 401 additions & 0 deletions QEfficient/cloud/finetune_experimental.py

Large diffs are not rendered by default.

Empty file.
59 changes: 59 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling, for deterministic shuffling and reproducibility
  dataset_num_samples: 1000

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  torch_compile: false  # Whether to use torch.compile
  ddp_degree: 4
  ddp_config:  # DDP configuration
    ddp_backend: "qccl"
    ddp_find_unused_parameters: false
    ddp_bucket_cap_mb: 25
    ddp_broadcast_buffers: true
    ddp_timeout: 1800

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 16  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj", "o_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "yahma/alpaca-cleaned"  # Dataset name from Hugging Face Hub
  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"  # Function used to create the prompt from dataset fields
  completion_template: "{output}"  # Model will be trained on this part.
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 2  # Number of steps to accumulate gradients
  per_device_train_batch_size: 2  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 2.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "2e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Dataset: Style-Remix (hallisky/DiSC)

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj", "o_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "hallisky/DiSC"  # Dataset name from Hugging Face Hub
  prompt_template: "### Original:{original} \n ### Rewrite:\n"  # Template to create prompt from dataset fields
  completion_template: "{generation}"  # Model will be trained on this part.
  dataset_disc_style: "sarcasm_more"  # Style of dataset to use
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 2.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "2e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling
  dataset_num_samples: 1000

# Training configuration
training:
  type: "sft"
  output_dir: "./training_result_single_device"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "meta-llama/Llama-3.2-1b"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["q_proj", "v_proj", "k_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling
  dataset_num_samples: 100

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile
  tp_degree: 2  # Tensor-parallel degree
  ddp_degree: 2  # Data-parallel (DDP) degree
  pp_degree: 1  # Pipeline-parallel degree
  device: "qaic"
  ddp_config:  # DDP configuration
    ddp_backend: "qccl"
    ddp_find_unused_parameters: false
    ddp_bucket_cap_mb: 25
    ddp_broadcast_buffers: true
    ddp_timeout: 1800

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
52 changes: 52 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile
  tp_degree: 2  # Tensor-parallel degree
  ddp_degree: 1  # Data-parallel (DDP) degree

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Loading