Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
e861279
[plugin] support all-gather overlap for hybrid parallel (#5919)
ver217 Jul 18, 2024
09d5ffc
add kto
YeAnbang Jul 18, 2024
845ea72
Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into kto
YeAnbang Jul 18, 2024
544b7a3
fix style, add kto data sample
YeAnbang Jul 18, 2024
8cc8f64
[Examples] Add lazy init to OPT and GPT examples (#5924)
Edenzzzz Jul 19, 2024
f585d4e
[ColossalChat] Hotfix for ColossalChat (#5910)
TongLi3701 Jul 19, 2024
d08c99b
Merge branch 'main' into kto
TongLi3701 Jul 19, 2024
d49550f
refactor tokenization
YeAnbang Jul 19, 2024
150505c
Merge branch 'kto' of https://github.com/hpcaitech/ColossalAI into kto
YeAnbang Jul 19, 2024
4ec17a7
[FIX BUG] UnboundLocalError: cannot access local variable 'default_co…
zhurunhua Jul 21, 2024
c5f582f
fix test data
YeAnbang Jul 22, 2024
12fe8b5
refactor evaluation
YeAnbang Jul 22, 2024
b0e15d5
remove real data path
YeAnbang Jul 22, 2024
9688e19
remove real data path
YeAnbang Jul 22, 2024
a521ffc
Add n_fused as an input from native_module (#5894)
insujang Jul 23, 2024
5fb958c
[FIX BUG] convert env param to int in (#5934)
flymin Jul 24, 2024
2069472
[Hotfix] Fix ZeRO typo #5936
Edenzzzz Jul 25, 2024
ad35a98
[Feature] Add a switch to control whether the model checkpoint needs …
zhurunhua Jul 26, 2024
8a3ff4f
fix style
YeAnbang Jul 26, 2024
de1bf08
fix style
YeAnbang Jul 26, 2024
6fd9e86
fix style
YeAnbang Jul 29, 2024
c8332b9
Merge pull request #5922 from hpcaitech/kto
YeAnbang Jul 29, 2024
9664b1b
[shardformer] hotfix attn mask (#5945)
ver217 Jul 29, 2024
7b38964
[shardformer] hotfix attn mask (#5947)
ver217 Jul 29, 2024
bcf0181
[Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
LRY89757 Jul 30, 2024
0608921
[zero] hotfix update master params (#5951)
ver217 Jul 30, 2024
09c5f72
[release] update version (#5952)
ver217 Jul 31, 2024
30f4e31
[Chat] Fix lora (#5946)
YeAnbang Jul 31, 2024
66fbf2e
Update README.md (#5958)
YeAnbang Jul 31, 2024
1aeb5e8
[hotfix] Remove unused plan section (#5957)
TongLi3701 Jul 31, 2024
f9b6fcf
[test] add mixtral for sequence classification
botbw Jul 2, 2024
0b76b57
[test] add mixtral transformer test
botbw Jul 2, 2024
8ae8525
[moe] fix plugin
botbw Jul 2, 2024
a249e71
[test] mixtra pp shard test
botbw Jul 4, 2024
0fad23c
[chore] handle non member group
botbw Jul 5, 2024
46c069b
[zero] solve hang
botbw Jul 5, 2024
37443cc
[test] pass mixtral shardformer test
botbw Jul 8, 2024
b5bfeb2
[moe] implement transit between non moe tp and ep
botbw Jul 8, 2024
13b48ac
[zero] solve hang
botbw Jul 9, 2024
fe24789
[misc] solve booster hang by rename the variable
Hz188 Jul 9, 2024
5ed5e8c
solve hang when parallel mode = pp + dp
Hz188 Jul 11, 2024
e28e053
[moe] implement submesh initialization
botbw Jul 11, 2024
9b9b76b
[moe] add mixtral dp grad scaling when not all experts are activated
botbw Jul 12, 2024
014faf6
[chore] manually revert unintended commit
botbw Jul 12, 2024
8dbb868
[chore] trivial fix
botbw Jul 12, 2024
102b784
[chore] arg pass & remove drop token
botbw Jul 12, 2024
0b5bbe9
[test] add mixtral modelling test
botbw Jul 15, 2024
dc583aa
[moe] implement tp
botbw Jul 16, 2024
74eccac
[moe] test deepseek
botbw Jul 16, 2024
3e2b613
[moe] clean legacy code
botbw Jul 16, 2024
404b16f
[Feature] MoE Ulysses Support (#5918)
Hz188 Jul 18, 2024
09d6280
[chore] minor fix
botbw Jul 18, 2024
877d94b
[moe] init moe plugin comm setting with sp
botbw Jul 18, 2024
2cddeac
moe sp + ep bug fix
Hz188 Jul 18, 2024
7077d38
[moe] finalize test (no pp)
botbw Jul 18, 2024
803878b
[moe] full test for deepseek and mixtral (pp + sp to fix)
botbw Jul 19, 2024
46037c2
[chore] minor fix after rebase
botbw Jul 19, 2024
52d346f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 19, 2024
70c9924
[chore] solve moe ckpt test failure and some other arg pass failure
botbw Jul 22, 2024
74b03de
[moe] remove ops
botbw Jul 22, 2024
067e18f
[test] fix test: test_zero1_2
botbw Jul 22, 2024
96d0fbc
[bug] fix: somehow logger hangs the program
botbw Jul 23, 2024
b2952a5
[moe] deepseek moe sp support
Hz188 Jul 23, 2024
6c39f0b
[test] add check
botbw Jul 23, 2024
c3dc9b4
[deepseek] replace attn (a workaround for bug in transformers)
botbw Jul 23, 2024
59bcf56
[misc] skip redunant test
Hz188 Jul 24, 2024
034020b
[misc] remove debug/print code
Hz188 Jul 24, 2024
cb01c0d
[moe] refactor mesh assignment
botbw Jul 25, 2024
5b4c123
Revert "[moe] implement submesh initialization"
botbw Jul 25, 2024
606b089
[chore] change moe_pg_mesh to private
botbw Jul 25, 2024
12d043c
[misc] remove incompatible test config
Hz188 Jul 25, 2024
70793ce
[misc] fix ci failure: change default value to false in moe plugin
Hz188 Jul 25, 2024
7e737df
[misc] remove useless condition
Hz188 Jul 25, 2024
f7c5485
[chore] docstring
botbw Jul 25, 2024
7bedd03
[moe] remove force_overlap_comm flag and add warning instead
botbw Jul 25, 2024
65daa87
[doc] add MoeHybridParallelPlugin docstring
botbw Jul 26, 2024
d1d1ab8
[moe] solve dp axis issue
botbw Jul 26, 2024
62cdac6
[chore] remove redundant test case, print string & reduce test tokens
botbw Jul 30, 2024
19d1510
[feat] Dist Loader for Eval (#5950)
TongLi3701 Aug 2, 2024
75c9636
[lora] lora support hybrid parallel plugin (#5956)
wangbluo Aug 2, 2024
0b2d55c
Support overall loss, update KTO logging
YeAnbang Aug 2, 2024
fe71917
Merge pull request #5962 from hpcaitech/colossalchat
YeAnbang Aug 2, 2024
9179d40
[Docs] clarify launch port
Edenzzzz Aug 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/run_chatgpt_examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
mkdir sft_data
mkdir prompt_data
mkdir preference_data
mkdir kto_data
./tests/test_data_preparation.sh
./tests/test_train.sh
env:
Expand All @@ -61,3 +62,4 @@ jobs:
SFT_DATASET: ./sft_data
PROMPT_DATASET: ./prompt_data
PREFERENCE_DATASET: ./preference_data
KTO_DATASET: ./kto_data
4 changes: 3 additions & 1 deletion applications/Colossal-LLaMA/prepare_sft_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import os
from multiprocessing import cpu_count

from colossal_llama.dataset.conversation import LLaMA2_Conv
from colossal_llama.dataset.conversation import LLaMA2_Conv, LLaMA3_Conv
from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
from datasets import dataset_dict, load_dataset
from transformers import AddedToken, AutoTokenizer
Expand Down Expand Up @@ -75,6 +75,8 @@ def main():
# Prepare to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

default_conversation = LLaMA3_Conv

# Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
if args.llama_version == 2:
tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
Expand Down
18 changes: 15 additions & 3 deletions applications/Colossal-LLaMA/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ def main() -> None:
parser.add_argument("--zero", type=int, default=1)
parser.add_argument("--pad_token", choices=["eos", "unk"], default="eos")
parser.add_argument("--padding_mode", choices=["max_length", "longest"], default="max_length")
parser.add_argument(
"--skip_save_each_epoch",
action="store_true",
default=False,
help="skip saving the model checkpoint after each epoch is completed.",
)
args = parser.parse_args()

with open(args.config_file, "w") as f:
Expand Down Expand Up @@ -370,11 +376,17 @@ def main() -> None:
)
total_loss.fill_(0.0)
pbar.update()

# Save modeling.

if (args.save_interval > 0 and (step + 1) % (args.save_interval * args.accumulation_steps) == 0) or (
step + 1
) == len(dataloader):
save_model_condition = (
args.save_interval > 0 and (step + 1) % (args.save_interval * args.accumulation_steps) == 0
)

if not args.skip_save_each_epoch:
save_model_condition = save_model_condition or (step + 1) == len(dataloader)

if save_model_condition:
coordinator.print_on_master("\nStart saving model checkpoint with running states")

if args.use_neft:
Expand Down
3 changes: 3 additions & 0 deletions applications/ColossalChat/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ docs/.build
examples/wandb/
examples/logs/
examples/output/
examples/training_scripts/logs
examples/training_scripts/wandb
examples/training_scripts/output

examples/awesome-chatgpt-prompts/
temp/
Expand Down
31 changes: 8 additions & 23 deletions applications/ColossalChat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
- [Limitation for LLaMA-finetuned models](#limitation)
- [Limitation of dataset](#limitation)
- [Alternative Option For RLHF: DPO](#alternative-option-for-rlhf-direct-preference-optimization)
- [Alternative Option For RLHF: SimPO](#alternative-option-for-rlhf-simple-preference-optimization)
- [Alternative Option For RLHF: SimPO](#alternative-option-for-rlhf-simple-preference-optimization-simpo)
- [Alternative Option For RLHF: ORPO](#alternative-option-for-rlhf-odds-ratio-preference-optimization-orpo)
- [Alternative Option For RLHF: KTO](#alternative-option-for-rlhf-kahneman-tversky-optimization-kto)
- [FAQ](#faq)
- [How to save/load checkpoint](#faq)
- [How to train with limited resources](#faq)
Expand Down Expand Up @@ -137,17 +139,15 @@ The first step in Stage 1 is to collect a dataset of human demonstrations of the
{"messages":
[
{
"from": "human",
"from": "user",
"content": "what are some pranks with a pen i can do?"
},
{
"from": "assistant",
"content": "Are you looking for practical joke ideas?"
},
...
]
},
...
]
```

Expand All @@ -173,23 +173,20 @@ Below shows the preference dataset format used in training the reward model.
"from": "human",
"content": "Introduce butterflies species in Oregon."
}
]
],
"chosen": [
{
"from": "assistant",
"content": "About 150 species of butterflies live in Oregon, with about 100 species are moths..."
},
...
],
"rejected": [
{
"from": "assistant",
"content": "Are you interested in just the common butterflies? There are a few common ones which will be easy to find..."
},
...
]
},
...
]
```

Expand Down Expand Up @@ -218,7 +215,6 @@ PPO uses two kind of training data--- the prompt data and the sft data (optional
"from": "human",
"content": "what are some pranks with a pen i can do?"
}
...
]
},
]
Expand Down Expand Up @@ -284,6 +280,9 @@ Simple Preference Optimization (SimPO) from this [paper](https://arxiv.org/pdf/2
## Alternative Option For RLHF: Odds Ratio Preference Optimization (ORPO)
Odds Ratio Preference Optimization (ORPO) from this [paper](https://arxiv.org/pdf/2403.07691) is a reference-model-free alignment method that uses a mixture of the SFT loss and a reinforcement learning loss calculated from an odds-ratio-based implicit reward, making training more efficient and stable. Read this [README](./examples/README.md) for more information.

## Alternative Option For RLHF: Kahneman-Tversky Optimization (KTO)
We support the method introduced in the paper [KTO: Model Alignment as Prospect Theoretic Optimization](https://arxiv.org/pdf/2402.01306) (KTO), which is an alignment method that directly maximizes the "human utility" of generation results. Read this [README](./examples/README.md) for more information.

### Inference Quantization and Serving - After Training

We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models.
Expand Down Expand Up @@ -448,20 +447,6 @@ If you only have a single 24G GPU. Generally, using lora and "zero2-cpu" will be
If you have multiple GPUs, each with very limited VRAM (say 8GB), you can try the `3d` plugin option, which supports tensor parallelism; set `--tp` to the number of GPUs that you have.
</details>

## The Plan

- [x] implement PPO fine-tuning
- [x] implement training reward model
- [x] support LoRA
- [x] support inference
- [x] support llama from [facebook](https://github.com/facebookresearch/llama)
- [x] implement PPO-ptx fine-tuning
- [x] support flash-attention
- [x] implement DPO fine-tuning
- [ ] integrate with Ray
- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL),
- [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain)

### Real-time progress

You will find our progress in github [project broad](https://github.com/orgs/hpcaitech/projects/17/views/1).
Expand Down
21 changes: 12 additions & 9 deletions applications/ColossalChat/benchmarks/benchmark_dpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,33 @@ PROJECT_NAME="dpo"
PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
PRETRAINED_MODEL_PATH="" # huggingface or local model path
PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
BENCHMARK_DATA_DIR="./temp/dpo" # Path to benchmark data
DATASET_SIZE=320

TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
declare -a dataset=(
$BENCHMARK_DATA_DIR/arrow/part-0
)

colossalai run --nproc_per_node 4 --master_port 31313 benchmark_dpo.py \
# Generate dummy test data
python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference


colossalai run --nproc_per_node 4 --master_port 31313 ../examples/training_scripts/train_dpo.py \
--pretrain $PRETRAINED_MODEL_PATH \
--tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
--config_file $CONFIG_FILE \
--dataset ${dataset[@]} \
--plugin "zero2_cpu" \
--max_epochs 1 \
--accumulation_steps 1 \
--batch_size 8 \
--batch_size 4 \
--lr 1e-6 \
--beta 0.1 \
--gamma 0.6 \
--mixed_precision "bf16" \
--grad_clip 1.0 \
--max_length 2048 \
--dataset_size 640 \
--weight_decay 0.01 \
--warmup_steps 60 \
--disable_reference_model \
--length_normalization \
--grad_checkpoint \
--use_flash_attn
51 changes: 51 additions & 0 deletions applications/ColossalChat/benchmarks/benchmark_kto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Benchmark KTO (Kahneman-Tversky Optimization) training.
# Generates a dummy preference dataset of DATASET_SIZE samples, then launches
# distributed training of train_kto.py on 2 GPUs via the colossalai launcher.
#
# Required setup: PRETRAINED_MODEL_PATH / PRETRAINED_TOKENIZER_PATH must be
# filled in (HuggingFace id or local path) before running.

#######################################
# Select the n GPUs with the least memory currently in use and export them
# through CUDA_VISIBLE_DEVICES.
# Arguments: $1 - number of GPUs to select (default 9999, i.e. all GPUs)
# Outputs:   prints per-GPU memory usage and the resulting device list
#######################################
set_n_least_used_CUDA_VISIBLE_DEVICES() {
    local n=${1:-"9999"}
    local gpu_ids
    echo "GPU Memory Usage:"
    # nl numbers each GPU from 0; sort by used memory (column 2) ascending,
    # then keep the indices of the n least-loaded devices.
    gpu_ids=$(nvidia-smi --query-gpu=memory.used --format=csv |
        tail -n +2 |
        nl -v 0 |
        tee /dev/tty |
        sort -g -k 2 |
        awk '{print $1}' |
        head -n "$n")
    # Join the newline-separated indices with commas (no reliance on
    # unquoted word-splitting).
    CUDA_VISIBLE_DEVICES=$(printf '%s\n' "$gpu_ids" | paste -sd ',' -)
    export CUDA_VISIBLE_DEVICES
    echo "Now CUDA_VISIBLE_DEVICES is set to:"
    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 4

PROJECT_NAME="kto"
PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
PRETRAINED_MODEL_PATH="" # huggingface or local model path
PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
BENCHMARK_DATA_DIR="./temp/kto" # Path to benchmark data
DATASET_SIZE=80

TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
# NOTE(review): FULL_PROJECT_NAME/PARENT_CONFIG_FILE are currently unused —
# unlike benchmark_dpo.sh, no --config_file is passed below; confirm whether
# train_kto.py should receive one.
FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
declare -a dataset=(
    "$BENCHMARK_DATA_DIR"/arrow/part-0
)

# Generate dummy test data
python prepare_dummy_test_dataset.py --data_dir "$BENCHMARK_DATA_DIR" --dataset_size "$DATASET_SIZE" --max_length 2048 --data_type kto


colossalai run --nproc_per_node 2 --master_port 31313 ../examples/training_scripts/train_kto.py \
    --pretrain "$PRETRAINED_MODEL_PATH" \
    --tokenizer_dir "$PRETRAINED_TOKENIZER_PATH" \
    --dataset "${dataset[@]}" \
    --plugin "zero2_cpu" \
    --max_epochs 1 \
    --accumulation_steps 1 \
    --batch_size 2 \
    --lr 1e-5 \
    --beta 0.1 \
    --mixed_precision "bf16" \
    --grad_clip 1.0 \
    --max_length 2048 \
    --weight_decay 0.01 \
    --warmup_steps 60 \
    --grad_checkpoint \
    --use_flash_attn
Loading