29 changes: 21 additions & 8 deletions .github/workflows/build_on_pr.yml
@@ -40,8 +40,8 @@ jobs:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ] && cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ] && [ ! -z "$(ls -A /github/home/testmon_cache/${MAIN_BRANCH})" ]; then
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
@@ -60,12 +60,15 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ]; then
[ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ] && mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
if [ -d "/github/home/testmon_cache/${BASE}" ] and [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull && cp -p -r "/github/home/testmon_cache/${BASE}" /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
@@ -83,6 +86,9 @@ jobs:
changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }}
anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }}
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
@@ -140,6 +146,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout TensorNVMe
uses: actions/checkout@v2
@@ -150,7 +159,9 @@

- name: Restore TensorNVMe Cache
run: |
[ ! -z "$(ls -A /github/home/tensornvme_cache/)" ] && cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
if [ -d /github/home/tensornvme_cache ] && [ ! -z "$(ls -A /github/home/tensornvme_cache/)" ]; then
cp -p -r /github/home/tensornvme_cache/* /__w/ColossalAI/ColossalAI/TensorNVMe
fi

- name: Install TensorNVMe
run: |
@@ -173,7 +184,9 @@
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi

- name: Install Colossal-AI
run: |
@@ -264,8 +277,8 @@ jobs:
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ]; then
[ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ] && cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
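Note on the pattern above: each of these steps previously ended with a bare `[ test ] && command` one-liner. When the test fails, that compound command exits non-zero, and since it is the last command in the step, the whole step fails even though there was simply nothing to copy. Folding both checks into an `if` keeps the step's exit status at zero. A minimal sketch of the pattern, with hypothetical cache paths:

```bash
#!/usr/bin/env bash
# Guarded-copy sketch; SRC and DST stand in for the cache paths above.
SRC=/github/home/testmon_cache/main
DST=/github/home/testmon_cache/feature

# Old style: if SRC is missing or empty, the test exits non-zero and,
# as the step's last command, fails the job:
#   [ ! -z "$(ls -A "$SRC")" ] && cp -p -r "$SRC" "$DST"

# New style: the if-statement itself succeeds either way, so the step
# exits 0 when there is nothing to copy.
if [ -d "$SRC" ] && [ ! -z "$(ls -A "$SRC")" ]; then
    cp -p -r "$SRC" "$DST"   # -p preserves timestamps (avoids ninja rebuilds)
fi
```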
6 changes: 6 additions & 0 deletions .github/workflows/compatiblity_test_on_pr.yml
@@ -12,6 +12,9 @@ jobs:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
- id: set-matrix
@@ -40,6 +43,9 @@ jobs:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
timeout-minutes: 120
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Install dependencies
run: |
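The only change in this workflow (and in the doc and example workflows below) is the job-level `concurrency` block. In isolation it looks like the sketch below; `github.head_ref` is only populated for `pull_request`-triggered runs, and `cancel-in-progress: false` queues runs from the same branch instead of cancelling the older one.

```yaml
# Sketch of the concurrency block this PR adds to each job.
# Workflow and job names are placeholders.
name: example-workflow
on: pull_request

jobs:
  build:
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.head_ref }}   # one queue per source branch
      cancel-in-progress: false       # queue, don't cancel
    steps:
      - uses: actions/checkout@v3
```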
6 changes: 6 additions & 0 deletions .github/workflows/doc_check_on_pr.yml
@@ -16,6 +16,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2

@@ -31,6 +34,9 @@ jobs:
github.event.pull_request.draft == false &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: ubuntu-latest
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v2
with:
6 changes: 6 additions & 0 deletions .github/workflows/doc_test_on_pr.yml
@@ -19,6 +19,9 @@ jobs:
outputs:
any_changed: ${{ steps.changed-files.outputs.any_changed }}
changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
name: Detect changed example files
steps:
- uses: actions/checkout@v3
@@ -59,6 +62,9 @@ jobs:
defaults:
run:
shell: bash
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- name: Checkout ColossalAI-Documentation
uses: actions/checkout@v2
6 changes: 6 additions & 0 deletions .github/workflows/example_check_on_pr.yml
@@ -20,6 +20,9 @@ jobs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
name: Detect changed example files
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3
with:
@@ -77,6 +80,9 @@ jobs:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
concurrency:
group: ${{ github.head_ref }}
cancel-in-progress: false
steps:
- uses: actions/checkout@v3

8 changes: 7 additions & 1 deletion CONTRIBUTING.md
@@ -30,6 +30,12 @@ pip install <options> -e .
### Unit Tests
We use [PyTest](https://docs.pytest.org/en/latest/) to execute tests. You can install pytest with `pip install pytest`. As some of the tests require initialization of the distributed backend, GPUs are needed to execute these tests.

To set up the environment for unit testing, first change your current directory to the root directory of your local ColossalAI repository, then run
```bash
pip install -r requirements/requirements-test.txt
```
If you encounter an error saying "Could not find a version that satisfies the requirement fbgemm-gpu==0.2.0", downgrade your Python version to 3.8 or 3.9 and try again.

If you only want to run CPU tests, you can run

```bash
@@ -138,4 +144,4 @@ You can now create a pull request on the GitHub webpage of your repository. The

Do write clearly the description of your pull request and [link the pull request to your target issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). This will automatically close the issue when the pull request is approved.

In case of code conflict, you should rebase your branch and resolve the conflicts manually.
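As a side note on the fbgemm-gpu remark added above: a possible workaround sketch, assuming conda is available (the environment name is arbitrary):

```bash
# Hypothetical: create a Python 3.9 environment, then install the
# test requirements inside it.
conda create -n colossalai-test python=3.9 -y
conda activate colossalai-test
pip install -r requirements/requirements-test.txt
```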
16 changes: 8 additions & 8 deletions applications/Chat/coati/dataset/prompt_dataset.py
@@ -35,14 +35,14 @@ def __init__(self,
logger.info(f"Limiting dataset to {max_datasets_size} examples.")
list_data_dict = list_data_dict[:max_datasets_size]

for data_dict in list_data_dict:
token = tokenizer(data_dict["instruction"],
return_tensors='pt',
max_length=max_length,
padding='max_length',
truncation=True)
for k, tensor in token.items():
self.keyed_prompt[k].extend(tensor.to(torch.cuda.current_device()).unbind())
instructions = [data_dict["instruction"] for data_dict in list_data_dict]
tokens = tokenizer(instructions,
return_tensors='pt',
max_length=max_length,
padding='max_length',
truncation=True)
for k, tensor in tokens.items():
self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()

def __len__(self):
return len(self.keyed_prompt["input_ids"])
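The change above replaces a per-example tokenizer loop with a single batched call: Hugging Face tokenizers accept a list of strings directly, and `Tensor.unbind()` recovers the per-prompt rows afterwards. A self-contained sketch (the model name and prompts are illustrative, not from the PR):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token   # gpt2 has no pad token by default

instructions = ["Write a haiku about CI.", "Summarize the PR."]

# One batched call pads/truncates everything in a single pass and returns
# 2-D tensors of shape (num_prompts, max_length).
tokens = tokenizer(instructions,
                   return_tensors='pt',
                   max_length=96,
                   padding='max_length',
                   truncation=True)

# unbind() splits each 2-D tensor back into per-prompt 1-D tensors,
# matching what the removed per-example loop accumulated row by row.
input_ids_per_prompt = tokens["input_ids"].unbind()
assert len(input_ids_per_prompt) == len(instructions)
```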
30 changes: 15 additions & 15 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -74,21 +74,18 @@ def __getitem__(self, idx):
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])


def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, max_length: int) -> Dict:
def _tokenize_fn(strings: Sequence[str],
tokenizer: transformers.PreTrainedTokenizer,
max_length: int
) -> Dict[str, torch.Tensor]:
"""Tokenize a list of strings."""
tokenized_list = [
tokenizer(
text,
return_tensors="pt",
padding="longest",
max_length=max_length,
truncation=True,
) for text in strings
]
input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
input_ids_lens = labels_lens = [
tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
]
tokenized_list = tokenizer(
strings, return_tensors="pt", padding="longest",
max_length=max_length, truncation=True
)
input_ids = labels = tokenized_list["input_ids"]
input_ids_lens = labels_lens = \
tokenized_list["input_ids"].ne(tokenizer.pad_token_id).sum(dim=-1)
return dict(
input_ids=input_ids,
labels=labels,
@@ -105,7 +102,10 @@ def preprocess(
) -> Dict:
"""Preprocess the data by tokenizing."""
examples = [s + t for s, t in zip(sources, targets)]
examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)]
examples_tokenized, sources_tokenized = [
_tokenize_fn(strings, tokenizer, max_length)
for strings in (examples, sources)
]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
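Same batching idea as in `prompt_dataset.py`, with one extra wrinkle: the per-sequence token counts that used to come from a Python list comprehension are now a single vectorized reduction over the padded batch. A sketch (model and strings are illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(["short prompt", "a noticeably longer prompt goes here"],
                  return_tensors="pt",
                  padding="longest",
                  max_length=64,
                  truncation=True)

# Count non-pad tokens per row; this replaces the old per-string
# `tokenized.input_ids.ne(pad_token_id).sum().item()` loop.
lens = batch["input_ids"].ne(tokenizer.pad_token_id).sum(dim=-1)
print(lens)   # a 1-D tensor of lengths, one entry per input string
```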
12 changes: 8 additions & 4 deletions applications/Chat/coati/experience_maker/naive.py
@@ -1,5 +1,6 @@
import torch
from coati.models.utils import compute_reward, normalize
from coati.models.generation import generate_with_actor
from coati.models.utils import calc_action_log_probs, compute_reward, normalize

from .base import Experience, ExperienceMaker

@@ -16,13 +17,16 @@ def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
self.initial_model.eval()
self.reward_model.eval()

sequences, attention_mask, action_mask = self.actor.generate(input_ids,
sequences, attention_mask, action_mask = generate_with_actor(self.actor,
input_ids,
return_action_mask=True,
**generate_kwargs)
num_actions = action_mask.size(1)

action_log_probs = self.actor(sequences, num_actions, attention_mask)
base_action_log_probs = self.initial_model(sequences, num_actions, attention_mask)
actor_output = self.actor(sequences, attention_mask)
action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
base_model_output = self.initial_model(sequences, attention_mask)
base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
value = self.critic(sequences, action_mask, attention_mask)
r = self.reward_model(sequences, attention_mask)
reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
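`calc_action_log_probs` is imported from `coati.models.utils`, and its body is not part of this diff; however, the `Actor.forward` code removed in `actor.py` below computed the same quantity, so the shape of the helper can be inferred. A sketch under that assumption (the name is suffixed to mark it as a reconstruction):

```python
import torch
import torch.nn.functional as F


def calc_action_log_probs_sketch(output: dict,
                                 sequences: torch.LongTensor,
                                 num_actions: int) -> torch.Tensor:
    """Log-probs of the last `num_actions` (generated) tokens.

    logits[:, t] predicts sequences[:, t + 1], so the logits are shifted
    against the sequence before gathering.
    """
    logits = output['logits']                         # (B, T, vocab)
    log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
    per_token = log_probs.gather(-1, sequences[:, 1:].unsqueeze(-1)).squeeze(-1)
    return per_token[:, -num_actions:]                # (B, num_actions)
```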
14 changes: 8 additions & 6 deletions applications/Chat/coati/models/base/__init__.py
@@ -1,24 +1,26 @@
from typing import Union

import torch.nn as nn

from .actor import Actor
from .critic import Critic
from .reward_model import RewardModel


def get_base_model(model: nn.Module) -> nn.Module:
def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
"""Get the base model of our wrapper classes.
For Actor, it's base model is ``actor.model`` and it's usually a ``transformers.PreTrainedModel``.
For Critic and RewardModel, it's base model is itself.
For Actor, Critic and RewardModel, return ``model.model``,
which is usually a ``transformers.PreTrainedModel``.

Args:
model (nn.Module): model to get base model from

Returns:
nn.Module: the base model
"""
if isinstance(model, Actor):
return model.get_base_model()
return model
assert isinstance(model, (Actor, Critic, RewardModel)), \
f'Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first.'
return model.model


__all__ = ['Actor', 'Critic', 'RewardModel', 'get_base_model']
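A usage sketch of the tightened contract: `get_base_model` now unwraps all three wrapper types by returning `model.model` and asserts on anything else, where the old version silently passed non-Actor inputs through. The construction below is hypothetical:

```python
from transformers import AutoModelForCausalLM
from coati.models.base import Actor, get_base_model

actor = Actor(AutoModelForCausalLM.from_pretrained("gpt2"))
base = get_base_model(actor)   # the underlying transformers.PreTrainedModel

# A plain nn.Module now fails fast instead of being returned unchanged:
# get_base_model(torch.nn.Linear(2, 2))  # AssertionError
```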
53 changes: 12 additions & 41 deletions applications/Chat/coati/models/base/actor.py
@@ -1,12 +1,9 @@
from typing import Optional, Tuple, Union
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..generation import generate
from ..lora import LoRAModule
from ..utils import log_probs_from_logits


class Actor(LoRAModule):
@@ -24,42 +21,16 @@ def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str =
self.model = model
self.convert_to_lora()

@torch.no_grad()
def generate(
self,
input_ids: torch.Tensor,
return_action_mask: bool = True,
**kwargs
) -> Union[Tuple[torch.LongTensor, torch.LongTensor], Tuple[torch.LongTensor, torch.LongTensor, torch.BoolTensor]]:
sequences = generate(self.model, input_ids, **kwargs)
attention_mask = None
pad_token_id = kwargs.get('pad_token_id', None)
if pad_token_id is not None:
attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
if not return_action_mask:
return sequences, attention_mask, None
input_len = input_ids.size(1)
eos_token_id = kwargs.get('eos_token_id', None)
if eos_token_id is None:
action_mask = torch.ones_like(sequences, dtype=torch.bool)
else:
# left padding may be applied, only mask action
action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
action_mask[:, :input_len] = False
action_mask = action_mask[:, 1:]
return sequences, attention_mask, action_mask[:, -(sequences.size(1) - input_len):]

def forward(self,
sequences: torch.LongTensor,
num_actions: int,
attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
"""Returns action log probs
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
**model_kwargs, # HACK: `generate` method may pass more kwargs
) -> torch.Tensor:
"""Returns model output.
"""
output = self.model(sequences, attention_mask=attention_mask)
logits = output['logits']
log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
return log_probs[:, -num_actions:]

def get_base_model(self):
return self.model
output = self.model(
input_ids,
attention_mask=attention_mask,
**model_kwargs
)
return output