Merged
2 changes: 1 addition & 1 deletion colossalai/booster/booster.py
@@ -139,7 +139,7 @@ def backward(self, loss: torch.Tensor, optimizer: Optimizer) -> None:
loss (torch.Tensor): The loss to be backpropagated.
optimizer (Optimizer): The optimizer to be updated.
"""
-# TODO: implement this method with plugin
+# TODO(frank lee): implement this method with plugin
optimizer.backward(loss)

def execute_pipeline(self,
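For context, `Booster.backward` currently delegates straight to `optimizer.backward(loss)`; the renamed TODO tracks routing it through the plugin instead. A minimal usage sketch of the surrounding API, assuming a distributed launch (e.g. via torchrun); the plugin choice and toy model are illustrative:

    import torch
    import colossalai
    from colossalai.booster import Booster
    from colossalai.booster.plugin import TorchDDPPlugin

    colossalai.launch_from_torch(config={})    # assumes torchrun set up the dist env
    model = torch.nn.Linear(8, 2)              # toy model for illustration
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    booster = Booster(plugin=TorchDDPPlugin())
    model, optimizer, *_ = booster.boost(model, optimizer)

    loss = model(torch.randn(4, 8)).sum()
    booster.backward(loss, optimizer)          # today this calls optimizer.backward(loss)
    optimizer.step()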
2 changes: 0 additions & 2 deletions colossalai/shardformer/layer/utils.py
@@ -29,8 +29,6 @@ class Randomizer:
_INDEX = 0

def __init__(self, seed: int):
-# TODO: remove colossalai.context.random
-
self.seed = seed

# Handle CUDA rng state
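The `Randomizer` above owns a seed and the associated CUDA RNG state. A standalone sketch of the save/seed/restore pattern it is built around (the helper name is hypothetical, not the class's actual API):

    import torch

    def run_with_cuda_seed(seed: int, fn):
        # Save the current CUDA RNG state, run fn under a fixed seed,
        # then restore the state so surrounding code is unaffected.
        saved = torch.cuda.get_rng_state()
        torch.cuda.manual_seed(seed)
        try:
            return fn()
        finally:
            torch.cuda.set_rng_state(saved)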
8 changes: 4 additions & 4 deletions colossalai/shardformer/modeling/bert.py
@@ -57,7 +57,7 @@ def bert_model_forward(
hidden_states: Optional[torch.FloatTensor] = None, # this is from the previous stage
stage_index: Optional[List[int]] = None,
):
-# TODO: add explaination of the output here.
+# TODO(jianghai): add explaination of the output here.
r"""
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
@@ -113,7 +113,7 @@ def bert_model_forward(
batch_size, seq_length = input_shape
device = hidden_states.device

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -272,7 +272,7 @@ def bert_for_pretraining_forward(
logger = logging.get_logger(__name__)

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai) left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -534,7 +534,7 @@ def bert_for_next_sentence_prediction_forward(
stage_index: Optional[List[int]] = None,
**kwargs,
):
-#-> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
+# -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
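The renamed TODOs in this file (and in the model files below) all sit next to the same guard: pipeline-stage forwards cannot yet return attentions or cached kv tensors, so those kwargs are switched off with a one-time warning. A condensed sketch of that recurring pattern (the helper name is hypothetical):

    from transformers.utils import logging

    logger = logging.get_logger(__name__)

    def disable_unsupported_pipeline_kwargs(output_attentions, past_key_values):
        # kv-value tensors are left as () or None for now; see the TODOs above.
        if output_attentions:
            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
            output_attentions = False
        if past_key_values:
            logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
            past_key_values = None
        return output_attentions, past_key_values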
12 changes: 6 additions & 6 deletions colossalai/shardformer/modeling/bloom.py
@@ -252,7 +252,7 @@ def custom_forward(*inputs):
# Add last hidden state
hidden_states = self.ln_f(hidden_states)

-# TODO: deal with all_hidden_states, all_self_attentions, presents
+# TODO(jianghai): deal with all_hidden_states, all_self_attentions, presents
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

@@ -307,7 +307,7 @@ def bloom_for_causal_lm_forward(self: BloomForCausalLM,
raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -402,7 +402,7 @@ def bloom_for_sequence_classification_forward(

return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -431,7 +431,7 @@ def bloom_for_sequence_classification_forward(
all_cross_attentions = None
if stage_manager.is_last_stage():
batch_size = hidden_states.shape[0]
-#update batch size
+# update batch size
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)

@@ -525,7 +525,7 @@ def bloom_for_token_classification_forward(

return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -611,7 +611,7 @@ def bloom_for_question_answering_forward(
logger = logging.get_logger(__name__)

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
2 changes: 1 addition & 1 deletion colossalai/shardformer/modeling/chatglm.py
@@ -152,7 +152,7 @@ def chatglm_model_forward(
if output_hidden_states is not None else self.config.output_hidden_states)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
2 changes: 1 addition & 1 deletion colossalai/shardformer/modeling/gpt2.py
@@ -57,7 +57,7 @@ def gpt2_model_forward(
logger = logging.get_logger(__name__)

# Preprocess passed in arguments
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
6 changes: 3 additions & 3 deletions colossalai/shardformer/modeling/llama.py
@@ -65,7 +65,7 @@ def llama_model_forward(
seq_length_with_past = seq_length
past_key_values_length = 0

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -216,7 +216,7 @@ def llama_for_causal_lm_forward(
if output_hidden_states is not None else self.config.output_hidden_states)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
@@ -301,7 +301,7 @@ def llama_for_sequence_classification_forward(
logger = logging.get_logger(__name__)

return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
2 changes: 1 addition & 1 deletion colossalai/shardformer/modeling/opt.py
@@ -148,7 +148,7 @@ def opt_model_forward(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
use_cache = False

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
6 changes: 3 additions & 3 deletions colossalai/shardformer/modeling/t5.py
@@ -50,7 +50,7 @@ def t5_stack_forward(

logger = logging.get_logger(__name__)

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
@@ -285,7 +285,7 @@ def t5_model_forward(

logger = logging.get_logger(__name__)

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
@@ -422,7 +422,7 @@ def t5_for_conditional_generation_forward(

logger = logging.get_logger(__name__)

-# TODO: left the recording kv-value tensors as () or None type, this feature may be added in the future.
+# TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
if past_key_values:
logger.warning_once('Non-empty past_key_values is not supported for pipeline models at the moment.')
past_key_values = None
2 changes: 1 addition & 1 deletion colossalai/shardformer/modeling/vit.py
@@ -96,7 +96,7 @@ def pp_forward(
if pixel_values is None:
raise ValueError("You have to specify pixel_values")

-# TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+# TODO(FoolPlayer): maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
if pixel_values.dtype != expected_dtype:
pixel_values = pixel_values.to(expected_dtype)
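The TODO asks whether the cast could move to the `ImageProcessor` side; today the forward casts `pixel_values` to the patch-embedding weight dtype, e.g. when the model runs in fp16. The same check as a standalone sketch (function name is hypothetical):

    import torch

    def cast_pixel_values(pixel_values: torch.Tensor, expected_dtype: torch.dtype) -> torch.Tensor:
        # Mirror of the in-forward cast: only convert when dtypes differ.
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)
        return pixel_values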
1 change: 0 additions & 1 deletion colossalai/shardformer/shard/shard_config.py
@@ -29,7 +29,6 @@ class ShardConfig:
enable_flash_attention: bool = False
enable_jit_fused: bool = False

-# TODO: add support for tensor parallel
# pipeline_parallel_size: int
# data_parallel_size: int
# tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d']
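The deleted TODO covered the still-commented parallelism fields. If they were ever enabled, the dataclass might look like the sketch below; the field names and the `Literal` modes come from the commented lines above, while the class name and defaults are assumptions:

    from dataclasses import dataclass
    from typing import Literal, Optional

    @dataclass
    class ShardConfigSketch:            # hypothetical, not the real ShardConfig
        enable_flash_attention: bool = False
        enable_jit_fused: bool = False
        pipeline_parallel_size: int = 1  # assumed default
        data_parallel_size: int = 1      # assumed default
        tensor_parallel_mode: Optional[Literal['1d', '2d', '2.5d', '3d']] = None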
2 changes: 1 addition & 1 deletion tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
@@ -15,7 +15,7 @@ def test_gpt():
for name, (model_fn, data_gen_fn, _, _, _) in sub_registry.items():
model = model_fn()

-# TODO: support the following models
+# TODO(ver217): support the following models
# 1. GPT2DoubleHeadsModel
# as they are not supported, let's skip them
if model.__class__.__name__ in ['GPT2DoubleHeadsModel', 'GPT2ForQuestionAnswering']:
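The test walks the model-zoo sub-registry and skips classes the tracer cannot handle yet. The iteration-and-skip pattern, factored into a hypothetical helper:

    UNSUPPORTED = ('GPT2DoubleHeadsModel', 'GPT2ForQuestionAnswering')

    def iter_supported_models(sub_registry):
        # Instantiate each registered factory, skipping unsupported classes.
        for name, (model_fn, data_gen_fn, *_rest) in sub_registry.items():
            model = model_fn()
            if model.__class__.__name__ in UNSUPPORTED:
                continue
            yield name, model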
@@ -27,7 +27,7 @@ def rearrange(tensor: torch.Tensor, dim: int):
return rearanged_tensor


-# TODO: solve lazy_init True is not working
+# TODO(FoolPlayer): solve lazy_init True is not working
@parameterize('lazy_init', [False])
def check_linear_conv_1d_col(lazy_init: bool):
ctx = LazyInitContext() if lazy_init else nullcontext()
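Because `lazy_init=True` is still broken (the renamed TODO), the parametrization pins it to `False`. A sketch of the toggle pattern the test uses, with the body reduced to a stub (function name is hypothetical):

    from contextlib import nullcontext

    from colossalai.lazy import LazyInitContext
    from colossalai.testing import parameterize

    @parameterize('lazy_init', [False])  # True excluded until the TODO is resolved
    def check_with_optional_lazy_init(lazy_init: bool):
        ctx = LazyInitContext() if lazy_init else nullcontext()
        with ctx:
            pass  # build the modules under test here; LazyInitContext defers allocation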
171 changes: 0 additions & 171 deletions tests/test_shardformer/test_model/test_pure_pipeline.py

This file was deleted.
