Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f44e5d0
[QEff. Finetune]: Added logger and its test cases. (#644) (#868)
smedhe Mar 18, 2026
afdab67
[QEff. Finetune_experimental] cherrypicking pr (#870)
smedhe Mar 18, 2026
888fbb1
[QEff. finetune_experimnetal] cherry picking PR-787, 791,813,795 (#872)
smedhe Mar 25, 2026
14260f9
Formatted docs
Mar 25, 2026
3b6558f
[QEff.finetune] Adding style remix dataset config (#858)
tchawada Mar 26, 2026
2d6d60b
Commented unit test from cloud module
Mar 26, 2026
6887919
Added exception handling for dataset loading
Mar 27, 2026
fbd1f64
Format
Mar 27, 2026
c03e455
[QEff. Finetuning]: Tests for Pipeline Parallelism and updated docume…
quic-swatia Mar 26, 2026
5d5f0ce
Fixed repolinter error
Mar 27, 2026
53d6855
Corrected file paths
Mar 30, 2026
fb3fb86
Updates
Mar 30, 2026
6241bdd
Added Trainer arguments reference
Mar 30, 2026
d782717
Addressed some Qgenie reviews and fixed code
Mar 30, 2026
a0f0e80
Added security checks for import_func in dataset.py
Mar 30, 2026
a3fefcf
Adding reference data test for finetune (#897)
tchawada Mar 31, 2026
e9e7a7f
[QEff. Finetuning]: Updating PP documentation (#899)
quic-swatia Mar 31, 2026
ee92e08
Initial commit: Adding TP+DDP support in hf trainer stack
smedhe Mar 31, 2026
7c9965d
adding back the AOT stack imports
smedhe Apr 6, 2026
96b71de
adding readme and config files for tp+ddp
smedhe Apr 6, 2026
0e02eea
adding test cases for tp and ddp, along with other fixes
smedhe Apr 16, 2026
6a73c0f
Adding local world size checks
smedhe Apr 16, 2026
79541aa
adding tp+ddp related config changes
smedhe Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
401 changes: 401 additions & 0 deletions QEfficient/cloud/finetune_experimental.py

Large diffs are not rendered by default.

Empty file.
59 changes: 59 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling, for deterministic shuffling and reproducibility
  dataset_num_samples: 1000

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  torch_compile: false  # Whether to use torch.compile
  ddp_degree: 4
  ddp_config:  # DDP configuration
    ddp_backend: "qccl"
    ddp_find_unused_parameters: false
    ddp_bucket_cap_mb: 25
    ddp_broadcast_buffers: true
    ddp_timeout: 1800

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 16  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj", "o_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "yahma/alpaca-cleaned"  # Dataset name from Hugging Face Hub
  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt"  # Function used to create the prompt from dataset fields
  completion_template: "{output}"  # Model will be trained on this part.
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 2  # Number of steps to accumulate gradients
  per_device_train_batch_size: 2  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 2.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "2e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Dataset: Style-Remix (hallisky/DiSC)

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj", "o_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "hallisky/DiSC"  # Dataset name from Hugging Face Hub
  prompt_template: "### Original:{original} \n ### Rewrite:\n"  # Template to create prompt from dataset fields
  completion_template: "{generation}"  # Model will be trained on this part.
  dataset_disc_style: "sarcasm_more"  # Style of dataset to use
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 2.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "2e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling
  dataset_num_samples: 1000

# Training configuration
training:
  type: "sft"
  output_dir: "./training_result_single_device"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "meta-llama/Llama-3.2-1b"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["q_proj", "v_proj", "k_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling
  dataset_num_samples: 100

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile
  tp_degree: 2  # Tensor-parallel degree
  ddp_degree: 2  # Data-parallel (DDP) degree
  pp_degree: 1  # Pipeline-parallel degree
  device: "qaic"
  ddp_config:  # DDP configuration
    ddp_backend: "qccl"
    ddp_find_unused_parameters: false
    ddp_bucket_cap_mb: 25
    ddp_broadcast_buffers: true
    ddp_timeout: 1800

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
52 changes: 52 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_tp_gsm8k_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

# Model configuration
model:
  model_type: "hf"  # Hugging Face model
  auto_class_name: "AutoModelForCausalLM"  # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M"  # Pretrained model name
  use_peft: true  # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:  # assumes task_type/peft_type belong inside peft_config (LoraConfig fields) — confirm against the config loader
    lora_r: 8  # LoRA rank
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj", "gate_proj", "q_proj", "up_proj", "v_proj", "down_proj"]  # Target modules for LoRA
    task_type: "CAUSAL_LM"  # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA"  # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "openai/gsm8k"  # Dataset name from Hugging Face Hub
  prompt_template: "Solve the following math problem step by step.\n\n### Question:\n{question}\n\n### Answer:\n"  # Template to create prompt from dataset fields
  completion_template: "{answer}"  # Model will be trained on this part.
  config_name: "main"  # Config name for the dataset
  data_seed: 42  # Random seed for dataset shuffling

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 1  # Number of steps to accumulate gradients
  per_device_train_batch_size: 1  # Batch size per device during training
  num_train_epochs: 1
  torch_compile: false  # Whether to use torch.compile
  tp_degree: 2  # Tensor-parallel degree
  ddp_degree: 1  # Data-parallel (DDP) degree

# Optimizer configuration
optimizers:
  optimizer_name: "adamw"
  lr: 1.0e-4  # Learning rate; decimal point required so YAML 1.1 parsers (PyYAML) resolve a float, not the string "1e-4"

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3  # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001  # Minimum change in metric to qualify as improvement
Loading