🐛 Describe the bug
|
def preprocess( |
|
sources: Sequence[str], |
|
targets: Sequence[str], |
|
tokenizer: transformers.PreTrainedTokenizer, |
|
max_length: int, |
|
) -> Dict: |
|
"""Preprocess the data by tokenizing.""" |
|
examples = [s + t for s, t in zip(sources, targets)] |
|
examples_tokenized, sources_tokenized = [ |
|
_tokenize_fn(strings, tokenizer, max_length) |
|
for strings in (examples, sources) |
|
] |
|
input_ids = examples_tokenized["input_ids"] |
|
labels = copy.deepcopy(input_ids) |
|
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]): |
|
label[:source_len] = IGNORE_INDEX |
|
return dict(input_ids=input_ids, labels=labels) |
The line `label[:source_len] = IGNORE_INDEX` in the `preprocess()` function may raise an error (or mask the wrong tokens) when left padding is used.

Environment
No response
🐛 Describe the bug
ColossalAI/applications/Chat/coati/dataset/sft_dataset.py
Lines 97 to 113 in 09fe9dc
`label[:source_len] = IGNORE_INDEX` in the `preprocess()` function may raise an error when left padding is used.

Environment
No response