Merged
52 commits
f81ef6e  General disagg fix for prefill-only model (#698) (ochougul, Jan 6, 2026)
c57392d  Adding Vae Decoder in Wan (#688) (mohiso22, Jan 9, 2026)
75367b1  Evaluating the values of CCL lists for different scenarios (#710) (vjanfaza, Jan 9, 2026)
1e63710  Updating 2-layer instruction for Wan (#715) (tv-karthikeya, Jan 12, 2026)
1ef9935  Updated finetune docs for MULTI NODE Training (#717) (quic-akuruvil, Jan 13, 2026)
c76d5ea  Adding support for multi-node DDP training (#708) (smedhe, Jan 13, 2026)
7a39933  Updating MDP partition config: prioritizing dump over load (#720) (asmigosw, Jan 13, 2026)
08bce2c  Updated docs (#722) (quic-akuruvil, Jan 13, 2026)
8b00c1b  HOTFIX: changes in alpaca and grammar dataset utils (#724) (smedhe, Jan 13, 2026)
b074af0  Fixing the default value of CCL in infer.py (#725) (vjanfaza, Jan 15, 2026)
5fdde19  Adding support for multi-node PP+DDP (#726) (smedhe, Jan 16, 2026)
1f2ac51  Added default NPI file (#657) (quic-akuruvil, Jan 19, 2026)
dcbb7be  Release 1.21 docs (#718) (tv-karthikeya, Jan 19, 2026)
1ec3975  HOTFIX : Added support for repeat kv heads aligned Bias scaling for A… (quic-dhirajku, Jan 20, 2026)
e61a1a3  Removed OpenGVLab/InternVL2_5-1B and OpenGVLab/InternVL3_5-1B (#736) (quic-rishinr, Jan 20, 2026)
47a0fec  Qeff versioning (#741) (quic-rishinr, Jan 20, 2026)
3a8e5e9  Revert "Qeff versioning" (#746) (quic-rishinr, Jan 21, 2026)
0ffa4ea  Fix for Qwen 2.5 VL with subfunction (#733) (abhishek-singh591, Jan 21, 2026)
32f30c0  Fixed torch patch for subfunction with VLMs (#750) (abhishek-singh591, Jan 22, 2026)
eb74758  Added support of subfunction for VLMs (#699) (abhishek-singh591, Jan 23, 2026)
742b7bd  Updated reduce sum calculation to use einsum for gpt_oss (#754) (asmigosw, Jan 27, 2026)
5a129c7  Updating pytest config for InternVL (#758) (tv-karthikeya, Jan 28, 2026)
b777e8b  Wan support to skip compilation (#734) (tv-karthikeya, Jan 28, 2026)
75bf976  Fixing SW issue in Gemma3 (#740) (qcdipankar, Jan 28, 2026)
3751f7e  Fix documentation of Multinode FT (#764) (quic-akuruvil, Jan 29, 2026)
27ebe8e  Adding support for gemma3 in continous batching script for CI (#763) (qcdipankar, Jan 30, 2026)
536e3fc  Subfunction Fix (#766) (abhishek-singh591, Feb 1, 2026)
f64f703  Mainline version update (#752) (quic-rishinr, Feb 2, 2026)
1a3e09c  Updated compile from qaic-exec to qaic-compile (#703) (asmigosw, Feb 3, 2026)
e8e5c43  Fix for Diffusers subfunction (#759) (tv-karthikeya, Feb 9, 2026)
fc42332  Added One hot fix for MOE model with subfunction (#777) (abhishek-singh591, Feb 12, 2026)
544327a  Adding support of QEFFAutoModelForSequenceClassification (#729) (quic-amitraj, Feb 13, 2026)
facae5f  CI test optimization (#751) (quic-rishinr, Feb 13, 2026)
cd25784  Merge remote-tracking branch 'upstream/ft_experimental' into final_hf (tchawada, Feb 17, 2026)
3f6315c  Adding qaic validation in config manager, default value to prompt_func (tchawada, Feb 17, 2026)
9015bf6  Adding qaic validation in config manager, default value to prompt_func (tchawada, Feb 17, 2026)
fb28705  Adding a function to check whether NSP for given QAIC is free or not (tchawada, Feb 18, 2026)
5f1470e  Moved is_nsp_free func to device_utils.py (tchawada, Feb 19, 2026)
674b2f5  Adding num_samples in config (tchawada, Feb 20, 2026)
b47839f  Adding num_samples in config (tchawada, Feb 20, 2026)
4e390e4  Adding integrated_test (tchawada, Feb 20, 2026)
9f88237  updating is_nsp_free() function (tchawada, Feb 23, 2026)
4b53a95  Adding more unit tests in test_config_manager.py (tchawada, Feb 25, 2026)
dbf2182  fixing lint error (tchawada, Feb 26, 2026)
f2d0cb4  Tested test_integrated.py for DDP (tchawada, Feb 26, 2026)
d254a29  Updated finetune_experimental.py (tchawada, Mar 3, 2026)
cf80242  Updated finetune_experimental.py (tchawada, Mar 3, 2026)
6762d1a  Merge branch 'ft_experimental' into final_hf (tchawada, Mar 3, 2026)
07b5e54  Updated hf_finetune.md and config.md (tchawada, Mar 4, 2026)
f77df70  Updating hf_finetune.md (tchawada, Mar 4, 2026)
dce11f8  Added output dir structure in config.md (tchawada, Mar 4, 2026)
65b1693  Added output dir structure in config.md (tchawada, Mar 4, 2026)
101 changes: 57 additions & 44 deletions QEfficient/cloud/finetune_experimental.py
@@ -56,6 +56,45 @@ def __init__(self, config_manager: ConfigManager):
        self.output_dir = Path(self.config.training["output_dir"])
        self._setup_environment()

        # Prepare training configuration
        self.training_config = prepare_training_config(config_manager=self.config_manager)

        # Create datasets
        logger.log_rank_zero("Creating datasets...")
        self.train_dataset, self.eval_dataset = self._create_datasets()

        # Create model and tokenizer
        logger.log_rank_zero("Loading model and tokenizer...")
        model_instance = self._create_model()
        self.model = model_instance.model
        self.tokenizer = model_instance.tokenizer

        # Create optimizer
        logger.log_rank_zero("Preparing optimizer...")
        self.optimizer_cls_and_kwargs = self._create_optimizer()

        # Create callbacks
        logger.log_rank_zero("Creating callbacks...")
        self.callbacks = self._create_callbacks()

        # Create trainer
        logger.log_rank_zero("Initializing trainer...")
        self.trainer = self._create_trainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            optimizer_cls_and_kwargs=self.optimizer_cls_and_kwargs,
            callbacks=self.callbacks,
            training_config=self.training_config,
        )

    def get_model_and_tokenizer(self):
        return self.model, self.tokenizer

    def get_trainer(self):
        return self.trainer

    def _setup_environment(self) -> None:
        """Set up environment variables for output directories."""
        os.environ["OUTPUT_DIR"] = str(self.output_dir)
@@ -95,7 +134,6 @@ def create_dataset_for_split(split_name: str) -> Any:
        # Create training and evaluation datasets using config values
        train_dataset = create_dataset_for_split(train_split)
        eval_dataset = create_dataset_for_split(test_split)

        return train_dataset, eval_dataset

    def _create_model(self) -> Any:
@@ -157,6 +195,8 @@ def _create_callbacks(self) -> List[Any]:

        # callback_config.callbacks is a dictionary of callback configurations
        for callback_name, callback_kwargs in callback_config["callbacks"].items():
            if callback_kwargs is None:
                callback_kwargs = {}
            try:
                callback_instance = ComponentFactory.create_callback(callback_name, **callback_kwargs)
                callbacks.append(callback_instance)
@@ -216,14 +256,26 @@ def _create_trainer(

        # Create trainer arguments instance
        args = args_cls(**training_config)
        # Initialize trainer
        dataset_config_dict = self.config_manager.get_dataset_config()
        split_ratio = dataset_config_dict.get("split_ratio", 0.8)
        num_samples = dataset_config_dict.get("dataset_num_samples", -1)
        train_dataset = train_dataset.dataset
        eval_dataset = eval_dataset.dataset
        if num_samples > 0:
            # Truncating datasets to a smaller number of samples.
            # If you want to use all data, set dataset_num_samples to -1 or remove it from config.
            logger.warning("Using fewer samples may impact finetuning quality.")
            subset_train_indices = list(range(0, int(num_samples * split_ratio)))
            subset_eval_indices = list(range(0, int(num_samples - num_samples * split_ratio)))
            eval_dataset = eval_dataset.select(subset_eval_indices)
            train_dataset = train_dataset.select(subset_train_indices)
        trainer = trainer_cls(
            model=model,
            processing_class=tokenizer,
            args=args,
            compute_loss_func=None,
            train_dataset=train_dataset.dataset,
            eval_dataset=eval_dataset.dataset,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
            callbacks=callbacks,
            **additional_kwargs,
@@ -234,48 +286,9 @@ def _create_trainer(
        return trainer

    def run(self) -> None:
        """
        Execute the complete fine-tuning pipeline.
        """
        # Validate configuration
        self.config_manager.validate_config()

        # Prepare training configuration
        training_config = prepare_training_config(config_manager=self.config_manager)

        # Create datasets
        logger.log_rank_zero("Creating datasets...")
        train_dataset, eval_dataset = self._create_datasets()

        # Create model and tokenizer
        logger.log_rank_zero("Loading model and tokenizer...")
        model_instance = self._create_model()
        model = model_instance.model
        tokenizer = model_instance.tokenizer

        # Create optimizer
        logger.log_rank_zero("Preparing optimizer...")
        optimizer_cls_and_kwargs = self._create_optimizer()

        # Create callbacks
        logger.log_rank_zero("Creating callbacks...")
        callbacks = self._create_callbacks()

        # Create trainer
        logger.log_rank_zero("Initializing trainer...")
        trainer = self._create_trainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
            callbacks=callbacks,
            training_config=training_config,
        )

        # Start training
        logger.log_rank_zero("Starting training...")
        trainer.train()
        self.trainer.train()


def main():
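With this change the pipeline is fully assembled at construction time: datasets, model, tokenizer, optimizer, callbacks, and trainer are all built in __init__, and run() reduces to a single self.trainer.train() call. A minimal sketch of the resulting call pattern follows; the class name FinetunePipeline is hypothetical, since the page never shows the class declaration:

# Sketch only: FinetunePipeline and the ConfigManager construction are assumed, not shown on this page.
config_manager = ConfigManager("configs/sft_ddp_config.yaml")  # hypothetical entry point

pipeline = FinetunePipeline(config_manager)  # __init__ builds datasets, model, optimizer, trainer

# Components are now inspectable before training starts.
model, tokenizer = pipeline.get_model_and_tokenizer()
trainer = pipeline.get_trainer()

pipeline.run()  # now essentially self.trainer.train()

The new truncation path in _create_trainer is also worth a worked example: with dataset_num_samples: 1000 and the default split_ratio of 0.8, the trainer keeps the first int(1000 * 0.8) = 800 training samples and the first int(1000 - 800) = 200 evaluation samples, logging a warning that fewer samples may impact finetuning quality.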
47 changes: 0 additions & 47 deletions QEfficient/finetune/experimental/configs/sample_config.yaml

This file was deleted.

54 changes: 54 additions & 0 deletions QEfficient/finetune/experimental/configs/sft_ddp_config.yaml
@@ -0,0 +1,54 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 16
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # Function to create prompt from dataset fields
  completion_template: "{output}" # Template for completion field in dataset

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 2 # Number of steps to accumulate gradients
  per_device_train_batch_size: 2 # Batch size per device during training
  torch_compile: False # Whether to use torch.compile
  ddp_config: # DDP configuration
    ddp_backend: "qccl"
    ddp_find_unused_parameters: False
    ddp_bucket_cap_mb: 25
    ddp_broadcast_buffers: True
    ddp_timeout: 1800

# Optimizer configuration
optimizers:
  optimizer_name: "AdamW"
  lr: 2e-4

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
  tensorboard:
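The trailing tensorboard: entry carries no options. PyYAML loads a mapping key with an empty value as None rather than as an empty dict, which is exactly the case the new callback_kwargs is None guard in _create_callbacks covers. A minimal sketch of that behavior, assuming a standard yaml.safe_load (the actual config loader is not shown on this page):

import yaml

snippet = """
callbacks:
  early_stopping:
    early_stopping_patience: 3
  tensorboard:
"""

config = yaml.safe_load(snippet)
print(config["callbacks"]["tensorboard"])  # None, not {}

for name, kwargs in config["callbacks"].items():
    if kwargs is None:  # mirrors the guard added in _create_callbacks
        kwargs = {}
    # ComponentFactory.create_callback(name, **kwargs) now receives an empty
    # dict for tensorboard instead of failing on **None.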
@@ -0,0 +1,49 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
# Model configuration
model:
  model_type: "hf" # Hugging Face model
  auto_class_name: "AutoModelForCausalLM" # Auto class to load the model with
  model_name: "HuggingFaceTB/SmolLM-135M" # Pretrained model name
  use_peft: true # Enable PEFT (Parameter Efficient Fine-Tuning)
  peft_config:
    lora_r: 16
    lora_alpha: 16
    lora_dropout: 0
    target_modules: ["k_proj","gate_proj","q_proj","up_proj","v_proj","down_proj","o_proj"] # Target modules for LoRA
    task_type: "CAUSAL_LM" # Options: CAUSAL_LM, SEQ_2_SEQ_LM, etc.
    peft_type: "LORA" # Options: LORA, IA3, etc.

# Dataset configuration
dataset:
  dataset_type: "sft_dataset"
  dataset_name: "yahma/alpaca-cleaned" # Dataset name from Hugging Face Hub
  prompt_func: "QEfficient.finetune.experimental.preprocessing.alpaca_func:create_alpaca_prompt" # Function to create prompt from dataset fields
  completion_template: "{output}" # Template for completion field in dataset

# Training configuration
training:
  type: "sft"
  gradient_accumulation_steps: 2 # Number of steps to accumulate gradients
  per_device_train_batch_size: 2 # Batch size per device during training
  num_train_epochs: 1
  torch_compile: False # Whether to use torch.compile

# Optimizer configuration
optimizers:
  optimizer_name: "AdamW"
  lr: 2e-4

scheduler:
  scheduler_name: "cosine"

callbacks:
  early_stopping:
    early_stopping_patience: 3 # Number of epochs to wait before stopping training
    early_stopping_threshold: 0.001 # Minimum change in metric to qualify as improvement
  tensorboard:
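Both config files point prompt_func at create_alpaca_prompt and pair it with the {output} completion template, so each alpaca-cleaned record becomes a rendered prompt plus its output field as the completion. The actual implementation lives in QEfficient.finetune.experimental.preprocessing.alpaca_func and is not shown on this page; the following is only a plausible sketch of its shape, using the standard Alpaca template and the instruction/input/output fields of yahma/alpaca-cleaned:

def create_alpaca_prompt(example: dict) -> str:
    # Hypothetical sketch; the repo's actual create_alpaca_prompt may differ.
    if example.get("input"):
        return (
            "Below is an instruction that describes a task, paired with an input "
            "that provides further context. Write a response that appropriately "
            "completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n"
            f"### Input:\n{example['input']}\n\n"
            "### Response:\n"
        )
    return (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n\n"
        f"### Instruction:\n{example['instruction']}\n\n"
        "### Response:\n"
    )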