.github/workflows/example_check_on_dispatch.yml (1 addition & 1 deletion)

```diff
@@ -47,7 +47,7 @@ jobs:
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - name: 📚 Checkout
         uses: actions/checkout@v3
```
.github/workflows/example_check_on_pr.yml (1 addition & 1 deletion)

```diff
@@ -79,7 +79,7 @@ jobs:
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
-    timeout-minutes: 10
+    timeout-minutes: 15
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
       cancel-in-progress: true
```
.github/workflows/example_check_on_schedule.yml (1 addition & 1 deletion)

```diff
@@ -35,7 +35,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
-    timeout-minutes: 10
+    timeout-minutes: 15
     steps:
       - name: 📚 Checkout
         uses: actions/checkout@v3
```
applications/Chat/coati/trainer/ppo.py (2 additions & 2 deletions)

```diff
@@ -10,7 +10,7 @@
 from tqdm import tqdm
 from transformers import PreTrainedTokenizerBase
 
-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator
 
 from .base import OnPolicyTrainer
 from .callbacks import Callback
@@ -105,7 +105,7 @@ def __init__(
         self.critic_optim = critic_optim
 
         self.offload_inference_models = offload_inference_models
-        self.device = get_current_device()
+        self.device = get_accelerator().get_current_device()
 
     def _before_fit(
         self,
```
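The call-site change is mechanical, but worth spelling out once. A minimal sketch of the new device lookup, assuming colossalai >= 0.3.4 (the helper function below is our own illustration, not part of the patch):

```python
import torch

from colossalai.accelerator import get_accelerator


def move_batch_to_current_device(batch: dict) -> dict:
    # get_accelerator() resolves the active backend (CUDA, NPU, or CPU);
    # its get_current_device() replaces the old colossalai.utils.get_current_device().
    device = get_accelerator().get_current_device()
    return {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
```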
applications/Chat/coati/trainer/strategies/colossalai.py (11 additions & 2 deletions)

```diff
@@ -6,7 +6,6 @@
 import colossalai
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.utils import get_current_device
 from colossalai.zero.gemini.gemini_ddp import GeminiDDP
 
 from .ddp import DDPStrategy
@@ -158,9 +157,19 @@ def __init__(
 
         warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
 
+        # colossalai changed the API for get_current_device in version 0.3.4 and newer
+        try:
+            from colossalai.accelerator import get_accelerator
+
+            chunk_init_device = get_accelerator().get_current_device()
+        except:
+            from colossalai.utils import get_current_device
+
+            chunk_init_device = get_current_device()
+
         # NOTE: dist should be initialized before calling get_current_device()
         plugin_initializer = lambda: GeminiPlugin(
-            chunk_init_device=get_current_device(),
+            chunk_init_device=chunk_init_device,
             placement_policy=placement_policy,
             shard_param_frac=shard_param_frac,
             offload_optim_frac=offload_optim_frac,
```
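The same try/except shim reappears in train.py below. If it picks up more call sites, it could be hoisted into a shared helper; a sketch under the same assumption (the name get_current_device_compat is hypothetical, and it catches ImportError explicitly rather than using a bare except):

```python
def get_current_device_compat():
    """Return the current device across old and new colossalai APIs."""
    try:
        # colossalai >= 0.3.4 exposes the helper through the accelerator API.
        from colossalai.accelerator import get_accelerator

        return get_accelerator().get_current_device()
    except ImportError:
        # Older releases keep it in colossalai.utils.
        from colossalai.utils import get_current_device

        return get_current_device()
```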
applications/Colossal-LLaMA-2/train.py (28 additions & 28 deletions)

```diff
@@ -1,44 +1,37 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Continual Pre-training of LLaMA-2 developed by Colossal-AI Team 
+Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
 """
 
-import json
 import argparse
+import json
 import os
 import resource
 from contextlib import nullcontext
-from tqdm import tqdm
 
 import torch
 import torch.distributed as dist
+from colossal_llama2.dataset.loader import (
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama2.utils.froze import freeze_non_embeds_parameters
 from torch.utils.tensorboard import SummaryWriter
-from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
+from tqdm import tqdm
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
 
 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import (
-    GeminiPlugin,
-    LowLevelZeroPlugin,
-    HybridParallelPlugin,
-)
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-from colossal_llama2.dataset.loader import (
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-    DataCollatorForSupervisedDataset,
-    StatefulDistributedSampler,
-)
-
-from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
-from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama2.utils.froze import freeze_non_embeds_parameters
 
 
 def get_model_numel(model: torch.nn.Module) -> int:
```
```diff
@@ -215,9 +208,18 @@ def main() -> None:
     # ======================================================
     # Initialize Model, Objective, Optimizer and LR Scheduler
     # ======================================================
-    init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    )
+
+    # colossalai changed the API for get_current_device in version 0.3.4 and newer
+    try:
+        from colossalai.accelerator import get_accelerator
+
+        current_device = get_accelerator().get_current_device()
+    except:
+        from colossalai.utils import get_current_device
+
+        current_device = get_current_device()
+
+    init_ctx = LazyInitContext(default_device=current_device) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
     with init_ctx:
         model = LlamaForCausalLM(LlamaConfig.from_pretrained(args.pretrained))
         # Freeze part of parameters.
```
```diff
@@ -320,7 +322,7 @@ def main() -> None:
             initial=start_step,
         ) as pbar:
             for step, batch in pbar:
-                batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
+                batch = {k: v.to(current_device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
 
                 batch_output = model(**batch)
 
@@ -372,9 +374,7 @@ def main() -> None:
     # Final save.
     coordinator.print_on_master("Start saving final model checkpoint")
     booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(
-        f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}"
-    )
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
```
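One detail above is worth isolating: in this script, lazy initialization is used only with GeminiPlugin, which materializes and shards parameters itself; every other plugin constructs the model eagerly. A condensed sketch of that decision, assuming colossalai >= 0.3.4 (the function name make_init_ctx is our own):

```python
from contextlib import nullcontext

from colossalai.accelerator import get_accelerator
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext


def make_init_ctx(plugin):
    # GeminiPlugin can defer parameter allocation to the current device;
    # for any other plugin, fall back to ordinary eager construction.
    if isinstance(plugin, GeminiPlugin):
        return LazyInitContext(default_device=get_accelerator().get_current_device())
    return nullcontext()
```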
colossalai/accelerator/__init__.py (2 additions & 0 deletions)

```diff
@@ -1,5 +1,6 @@
 from .api import auto_set_accelerator, get_accelerator, set_accelerator
 from .base_accelerator import BaseAccelerator
+from .cpu_accelerator import CpuAccelerator
 from .cuda_accelerator import CudaAccelerator
 from .npu_accelerator import NpuAccelerator
 
@@ -10,4 +11,5 @@
     "BaseAccelerator",
     "CudaAccelerator",
     "NpuAccelerator",
+    "CpuAccelerator",
 ]
```
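With CpuAccelerator exported from the package root, a CPU-only environment can opt in explicitly. A hypothetical usage snippet (set_accelerator accepts either an instance or a name string, per the Union[str, BaseAccelerator] signature in api.py below):

```python
from colossalai.accelerator import CpuAccelerator, get_accelerator, set_accelerator

set_accelerator(CpuAccelerator())  # or, equivalently, set_accelerator("cpu")
device = get_accelerator().get_current_device()  # expected to resolve to a CPU device
```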
colossalai/accelerator/api.py (6 additions & 7 deletions)

```diff
@@ -3,6 +3,7 @@
 from typing import Union
 
 from .base_accelerator import BaseAccelerator
+from .cpu_accelerator import CpuAccelerator
 from .cuda_accelerator import CudaAccelerator
 from .npu_accelerator import NpuAccelerator
 
@@ -15,7 +16,7 @@
 # we use ordered dictionary here to associate the
 # order with device check priority
 # i.e. auto_set_accelerator will check cuda first
-_ACCELERATOR_MAPPING = OrderedDict(cuda=CudaAccelerator, npu=NpuAccelerator)
+_ACCELERATOR_MAPPING = OrderedDict(cuda=CudaAccelerator, npu=NpuAccelerator, cpu=CpuAccelerator)
 
 
 def set_accelerator(accelerator: Union[str, BaseAccelerator]) -> None:
@@ -43,19 +44,17 @@ def auto_set_accelerator() -> None:
     """
     global _ACCELERATOR
 
-    for _, accelerator_cls in _ACCELERATOR_MAPPING.items():
+    for accelerator_name, accelerator_cls in _ACCELERATOR_MAPPING.items():
         try:
             accelerator = accelerator_cls()
-            if accelerator.is_available():
+            if accelerator_name == "cpu" or accelerator.is_available():
                 _ACCELERATOR = accelerator
-                break
+            break
         except:
             pass
 
     if _ACCELERATOR is None:
-        raise RuntimeError(
-            f"No accelerator is available. Please check your environment. The list of accelerators we support is {list(_ACCELERATOR_MAPPING.keys())}"
-        )
+        raise RuntimeError("No accelerator is available.")
 
 
 def get_accelerator() -> BaseAccelerator:
```
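The net effect of the detection change: the mapping is walked in priority order (cuda, npu, cpu), and the cpu entry is accepted without an is_available() check, so it acts as the last-resort default. A small sketch of the resulting behavior:

```python
from colossalai.accelerator import auto_set_accelerator, get_accelerator

auto_set_accelerator()
# e.g. CudaAccelerator on a CUDA machine, CpuAccelerator where no GPU backend is usable
print(type(get_accelerator()).__name__)
```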