From 2265ea821632b47292a3196461ee46d2bb68b659 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 30 Jul 2024 14:14:32 +0200 Subject: [PATCH 01/14] commit --- src/transformers/trainer.py | 103 ++++++++++++++++++++++++++++-- src/transformers/training_args.py | 21 ++++++ 2 files changed, 119 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 25b7b6993092..de265e0d45c8 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -61,7 +61,12 @@ from .debug_utils import DebugOption, DebugUnderflowOverflow from .feature_extraction_sequence_utils import SequenceFeatureExtractor from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend -from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available +from .integrations.deepspeed import ( + deepspeed_init, + deepspeed_load_checkpoint, + is_deepspeed_available, + is_deepspeed_zero3_enabled, +) from .integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary from .modeling_utils import PreTrainedModel, load_sharded_checkpoint @@ -305,9 +310,12 @@ class Trainer: The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. data_collator (`DataCollator`, *optional*): - The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will + The function to use to form a batch from a list of elements of `train_dataset`. Will default to [`default_data_collator`] if no `tokenizer` is provided, an instance of [`DataCollatorWithPadding`] otherwise. + eval_data_collator (`typing.Union[DataCollator, NoneType]`, *optional*): + The function to use to form a batch from a list of elements of `eval_dataset` and `train_dataset`. Will + default to `data_collator` if no `eval_data_collator` is provided. train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. 
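The `eval_data_collator` introduced above is what makes generation-based evaluation practical for decoder-only models: the training collator can pad on the right for the language-modeling loss, while the evaluation collator additionally emits a left-padded prompt for `generate()`. Below is a minimal sketch of the intended wiring, adapted from the `run.py` script that appears later in this series; the `train_ds`/`eval_ds` dataset objects (items with a `"text"` field in the guanaco "### Assistant" format) are assumed, and the hyperparameters are illustrative placeholders, not part of this patch.

    from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")


    class GenerationCollator:
        """Right-pads full sequences for the loss; in eval mode also emits a left-padded prompt."""

        def __init__(self, tokenizer, eval_mode=False):
            self.tokenizer = tokenizer
            self.eval_mode = eval_mode

        def __call__(self, examples):
            texts = [ex["text"] for ex in examples]
            self.tokenizer.padding_side = "right"
            batch = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
            labels = batch["input_ids"].clone()
            labels[labels == self.tokenizer.pad_token_id] = -100  # ignore padding in the CE loss
            batch["labels"] = labels
            if self.eval_mode:
                # Left padding keeps the prompt flush against the newly generated tokens
                self.tokenizer.padding_side = "left"
                prompts = [t.split("### Assistant")[0] + "### Assistant:" for t in texts]
                prompt_batch = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
                batch["generation_input_ids"] = prompt_batch["input_ids"]
                batch["generation_attention_mask"] = prompt_batch["attention_mask"]
            return batch


    args = TrainingArguments(output_dir="out", remove_unused_columns=False, predict_with_generate=True)
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=GenerationCollator(tokenizer),
        eval_data_collator=GenerationCollator(tokenizer, eval_mode=True),
        train_dataset=train_ds,
        eval_dataset=eval_ds,
    )

During evaluation, `prediction_step` then routes the `generation_*` tensors to `generate()` and the remaining tensors to `forward()`, so a single eval batch yields both a loss and decodable predictions.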
@@ -379,6 +387,7 @@ def __init__( model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, + eval_data_collator: Optional[DataCollator] = None, train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, @@ -523,6 +532,7 @@ def __init__( else default_data_collator ) self.data_collator = data_collator if data_collator is not None else default_collator + self.eval_data_collator = eval_data_collator if eval_data_collator is not None else data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer @@ -961,7 +971,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None if eval_dataset is not None else self.eval_dataset ) - data_collator = self.data_collator + data_collator = self.eval_data_collator if self.eval_data_collator is not None else self.data_collator if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") @@ -1003,7 +1013,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. It must implement `__len__`. """ - data_collator = self.data_collator + data_collator = self.eval_data_collator if self.eval_data_collator is not None else self.data_collator if is_datasets_available() and isinstance(test_dataset, datasets.Dataset): test_dataset = self._remove_unused_columns(test_dataset, description="test") @@ -3600,6 +3610,7 @@ def evaluate( eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", + **gen_kwargs, ) -> Dict[str, float]: """ Run evaluation and returns metrics. @@ -3634,6 +3645,8 @@ def evaluate( metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named "eval_bleu" if the prefix is "eval" (default) + gen_kwargs: + Additional `generate` specific kwargs. Returns: A dictionary containing the evaluation loss and the potential metrics computed from the predictions. 
The @@ -3649,10 +3662,22 @@ def evaluate( eval_dataset=_eval_dataset if override else eval_dataset_name, ignore_keys=ignore_keys, metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", + **gen_kwargs, ) metrics.update(dataset_metrics) return metrics + # Set generation-related kwargs + if self.args.generation_config is not None: + gen_config = self.args.generation_config + self.gen_config = copy.deepcopy(gen_config) # copy so we don't modify args.gen_config in-place + unused_kwargs = self.gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) + # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -3700,7 +3725,11 @@ def evaluate( return output.metrics def predict( - self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test" + self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs, ) -> PredictionOutput: """ Run prediction and returns predictions and potential metrics. @@ -3718,6 +3747,8 @@ def predict( metric_key_prefix (`str`, *optional*, defaults to `"test"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named "test_bleu" if the prefix is "test" (default) + gen_kwargs: + Additional `generate` specific kwargs. @@ -3734,6 +3765,17 @@ def predict( - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained labels). """ + # Set generation-related kwargs + if self.args.generation_config is not None: + gen_config = self.args.generation_config + self.gen_config = copy.deepcopy(gen_config) # copy so we don't modify args.gen_config in-place + unused_kwargs = self.gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) + # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -4001,6 +4043,7 @@ def prediction_step( inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, + **gen_kwargs, ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. @@ -4020,12 +4063,27 @@ def prediction_step( ignore_keys (`List[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. + gen_kwargs: + Additional `generate` specific kwargs. Return: Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and labels (each being optional). 
""" has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) + + # Prioroty: gen_kwargs > args.gen_config > model.generation_config > default GenerationConfig() + gen_config = self.gen_config + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + synced_gpus = gen_kwargs.get("synced_gpus", default_synced_gpus) + if len(gen_kwargs) > 0: + unused_kwargs = gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + f"Following generation related kwargs were passed to `prediction_step` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) + # For CLIP-like models capable of returning loss values. # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` # is `True` in `model.forward`. @@ -4049,6 +4107,38 @@ def prediction_step( else: labels = None + # If the `generation_input_ids` was passed in inputs, the model can generate and we need to modify + # input keys. Otherwise, we don't know the `prompt` to generate from + if self.args.predict_with_generate and not prediction_loss_only: + generation_inputs = inputs.copy() + if "generation_input_ids" in generation_inputs: + # get inputs that are related to text and contain only generation prompt + generation_only_inputs = { + k.replace("generation_", ""): v for k, v in generation_inputs.items() if "generation_" in k + } + + # get common inputs that are not related to text, e.g. pixel-values + gen_keys = generation_only_inputs.keys() + generation_inputs_common = { + k: v + for k, v in generation_inputs.items() + if k.replace("generation_", "") not in gen_keys and "generation" not in k + } + generated_tokens = self.model.generate( + **generation_inputs_common, + **generation_only_inputs, + generation_config=gen_config, + synced_gpus=synced_gpus, + ) + else: + logger.warning_once( + "`predict_with_generate` is set to `True` but no inputs are passed for generation. ", + "Make sure you have `generation_input_ids` and `generation_attention_mask`.", + ) + generated_tokens = None + + # clean up inputs for loss from generation related input tensors if there are any before doing `forward` + inputs = {k: v for k, v in inputs.items() if "generation_" not in k} with torch.no_grad(): if is_sagemaker_mp_enabled(): raw_outputs = smp_forward_only(model, inputs) @@ -4094,6 +4184,9 @@ def prediction_step( if prediction_loss_only: return (loss, None, None) + if self.args.predict_with_generate and not prediction_loss_only: + return (loss, generated_tokens, labels) + logits = nested_detach(logits) if len(logits) == 1: logits = logits[0] diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b87a3d9d0554..b956bd57e655 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -28,6 +28,7 @@ from packaging import version from .debug_utils import DebugOption +from .generation import GenerationConfig from .trainer_utils import ( EvaluationStrategy, FSDPOption, @@ -789,6 +790,12 @@ class TrainingArguments: eval_use_gather_object (`bool`, *optional*, defaults to `False`): Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch. 
+        predict_with_generate (`bool`, *optional*, defaults to `False`):
+            Whether to use `generate()` to calculate generative metrics (ROUGE, BLEU).
+        generation_config ([`~generation.GenerationConfig`], *optional*):
+            The [`~generation.GenerationConfig`] object that will be used during generation if `predict_with_generate` is set to `True`.
+            Arguments passed in GenerationConfig will have higher priority than the model's generation config. Anything not set by this config
+            will fall back to `model.generation_config` by default.
     """
 
     framework = "pt"
@@ -1496,6 +1503,20 @@ class TrainingArguments:
         },
     )
 
+    predict_with_generate: bool = field(
+        default=False, metadata={"help": "Whether to use `generate()` to calculate generative metrics (ROUGE, BLEU)."}
+    )
+    generation_config: Optional[GenerationConfig] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The GenerationConfig that will be used during prediction. Args from this config "
+                "will have higher priority than the model's generation config. Anything not set by this config "
+                "will fall back to `model.generation_config`."
+            )
+        },
+    )
+
     def __post_init__(self):
         # Parse in args that could be `dict` sent in from the CLI as a string
         for field in _VALID_DICT_FIELDS:

From 3c11a1cc1e57ab8976b83927b736c119f45eeaf7 Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 31 Jul 2024 09:02:29 +0200
Subject: [PATCH 02/14] nits & add ignore-keys in VLMs

---
 src/transformers/models/idefics/configuration_idefics.py     | 1 +
 src/transformers/models/idefics2/configuration_idefics2.py   | 1 +
 src/transformers/models/llava/configuration_llava.py         | 1 +
 .../models/llava_next/configuration_llava_next.py            | 1 +
 .../llava_next_video/configuration_llava_next_video.py       | 1 +
 .../models/llava_next_video/diff_llava_next_video.py         | 1 +
 src/transformers/models/paligemma/configuration_paligemma.py | 1 +
 .../models/video_llava/configuration_video_llava.py          | 1 +
 src/transformers/models/vipllava/configuration_vipllava.py   | 1 +
 src/transformers/trainer.py                                  | 5 +++--
 10 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py
index 56b6025a8e89..7e66721189e9 100644
--- a/src/transformers/models/idefics/configuration_idefics.py
+++ b/src/transformers/models/idefics/configuration_idefics.py
@@ -236,6 +236,7 @@ class IdeficsConfig(PretrainedConfig):
 
     model_type = "idefics"
     is_composition = False
+    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py
index 1333895407e6..618cca9a5402 100644
--- a/src/transformers/models/idefics2/configuration_idefics2.py
+++ b/src/transformers/models/idefics2/configuration_idefics2.py
@@ -213,6 +213,7 @@ class Idefics2Config(PretrainedConfig):
 
     model_type = "idefics2"
     is_composition = True
+    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py
index 344b882c8735..2d205ce5ea7c 100644
--- a/src/transformers/models/llava/configuration_llava.py
+++ b/src/transformers/models/llava/configuration_llava.py
@@ -72,6 +72,7 @@ class LlavaConfig(PretrainedConfig):
 
     model_type = "llava"
     is_composition = False
+    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
diff --git a/src/transformers/models/llava_next/configuration_llava_next.py
b/src/transformers/models/llava_next/configuration_llava_next.py index 311139386723..49a35527adaa 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -77,6 +77,7 @@ class LlavaNextConfig(PretrainedConfig): model_type = "llava_next" is_composition = False + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 59bf460e84a6..47f5779c0444 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -84,6 +84,7 @@ class LlavaNextVideoConfig(PretrainedConfig): model_type = "llava_next_video" is_composition = True + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/llava_next_video/diff_llava_next_video.py b/src/transformers/models/llava_next_video/diff_llava_next_video.py index ec41cefed77d..1b904cdcf811 100644 --- a/src/transformers/models/llava_next_video/diff_llava_next_video.py +++ b/src/transformers/models/llava_next_video/diff_llava_next_video.py @@ -99,6 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig): model_type = "llava_next_video" is_composition = True + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 7ba3e008c42c..5c8ee2460f4d 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -74,6 +74,7 @@ class PaliGemmaConfig(PretrainedConfig): model_type = "paligemma" is_composition = False + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 9fd236e595bf..bfccde8d3396 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -75,6 +75,7 @@ class VideoLlavaConfig(PretrainedConfig): model_type = "video_llava" is_composition = False + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index c80487702c65..1d0f9a236da7 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -71,6 +71,7 @@ class VipLlavaConfig(PretrainedConfig): model_type = "vipllava" is_composition = False + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 3c9d944daf88..3c14635fac20 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3677,6 +3677,8 @@ def evaluate( f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", ) + else: + self.gen_config = self.model.generation_config # memory metrics - must set up as early as possible self._memory_tracker.start() @@ 
-4131,11 +4133,10 @@ def prediction_step(
                     synced_gpus=synced_gpus,
                 )
             else:
-                logger.warning_once(
+                raise ValueError(
                     "`predict_with_generate` is set to `True` but no inputs are passed for generation. "
                     "Make sure you have `generation_input_ids` and `generation_attention_mask`."
                 )
-                generated_tokens = None
 
         # Remove any generation-related input tensors from `inputs` before calling `forward` to compute the loss
         inputs = {k: v for k, v in inputs.items() if "generation_" not in k}

From 6832a155b950044128883431a69ee5b0ebb68696 Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 31 Jul 2024 09:33:13 +0200
Subject: [PATCH 03/14] fix tests

---
 src/transformers/trainer.py | 72 +++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 30 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 3c14635fac20..f036b0db6eb9 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -3668,17 +3668,21 @@ def evaluate(
             return metrics
 
         # Set generation-related kwargs
-        if self.args.generation_config is not None:
-            gen_config = self.args.generation_config
-            self.gen_config = copy.deepcopy(gen_config)  # copy so we don't modify args.gen_config in-place
-            unused_kwargs = self.gen_config.update(**gen_kwargs)
-            if unused_kwargs:
-                logger.warning_once(
-                    f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .",
-                    "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.",
-                )
-        else:
-            self.gen_config = self.model.generation_config
+        if self.args.predict_with_generate:
+            if self.args.generation_config is not None:
+                gen_config = self.args.generation_config
+                self.gen_config = copy.deepcopy(gen_config)  # copy so we don't modify args.gen_config in-place
+                unused_kwargs = self.gen_config.update(**gen_kwargs)
+                if unused_kwargs:
+                    logger.warning_once(
+                        f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: "
+                        f"{' '.join(unused_kwargs.keys())} .",
+                        "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.",
+                    )
+            else:
+                # We assume the model can generate if predict-with-generate is True
+                # Therefore, generation_config should be available
+                self.gen_config = self.model.generation_config
 
         # memory metrics - must set up as early as possible
         self._memory_tracker.start()
@@ -3768,15 +3772,21 @@ def predict(
             labels).
""" # Set generation-related kwargs - if self.args.generation_config is not None: - gen_config = self.args.generation_config - self.gen_config = copy.deepcopy(gen_config) # copy so we don't modify args.gen_config in-place - unused_kwargs = self.gen_config.update(**gen_kwargs) - if unused_kwargs: - logger.warning_once( - f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", - "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", - ) + if self.args.predict_with_generate: + if self.args.generation_config is not None: + gen_config = self.args.generation_config + self.gen_config = copy.deepcopy(gen_config) # copy so we don't modify args.gen_config in-place + unused_kwargs = self.gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: " + f"{' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) + else: + # We assume the model can generate if predict-with-generate is True + # Therefore, generation_config should be available + self.gen_config = self.model.generation_config # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -4075,16 +4085,18 @@ def prediction_step( has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) # Prioroty: gen_kwargs > args.gen_config > model.generation_config > default GenerationConfig() - gen_config = self.gen_config - default_synced_gpus = True if is_deepspeed_zero3_enabled() else False - synced_gpus = gen_kwargs.get("synced_gpus", default_synced_gpus) - if len(gen_kwargs) > 0: - unused_kwargs = gen_config.update(**gen_kwargs) - if unused_kwargs: - logger.warning_once( - f"Following generation related kwargs were passed to `prediction_step` but not used by `generate()`: {' '.join(unused_kwargs.keys())} .", - "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", - ) + if self.args.predict_with_generate: + gen_config = self.gen_config + default_synced_gpus = True if is_deepspeed_zero3_enabled() else False + synced_gpus = gen_kwargs.get("synced_gpus", default_synced_gpus) + if len(gen_kwargs) > 0: + unused_kwargs = gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + "Following generation related kwargs were passed to `prediction_step` but not " + f"used by `generate()`: {' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) # For CLIP-like models capable of returning loss values. # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` From 0959b5ae575339e6dab9a70a4226d2200981ee60 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 21 Aug 2024 06:54:27 +0200 Subject: [PATCH 04/14] typo --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f036b0db6eb9..1f51df8d4dfc 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -314,7 +314,7 @@ class Trainer: default to [`default_data_collator`] if no `tokenizer` is provided, an instance of [`DataCollatorWithPadding`] otherwise. 
eval_data_collator (`typing.Union[DataCollator, NoneType]`, *optional*): - The function to use to form a batch from a list of elements of `eval_dataset` and `train_dataset`. Will + The function to use to form a batch from a list of elements of `eval_dataset` and `test_dataset`. Will default to `data_collator` if no `eval_data_collator` is provided. train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the From 0fab8d76e85aba3dfdd2ccf1a1fa9c2459415c71 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 21 Aug 2024 06:55:13 +0200 Subject: [PATCH 05/14] tmp --- run.py | 134 +++++++++++++++++++++++++++++++++++++++++ seq2seq.py | 104 ++++++++++++++++++++++++++++++++ sft.py | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 409 insertions(+) create mode 100644 run.py create mode 100644 seq2seq.py create mode 100644 sft.py diff --git a/run.py b/run.py new file mode 100644 index 000000000000..b5396393919a --- /dev/null +++ b/run.py @@ -0,0 +1,134 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig,TrainingArguments, Trainer +from datasets import load_dataset +from peft import LoraConfig +import torch +import time +from torchmetrics.text import SQuAD +from random import randrange +from transformers.utils import logging +from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM + +# timdettmers/openassistant-guanaco +# Stanford/web_questions +eval_dataset = load_dataset("timdettmers/openassistant-guanaco", split="test") + + +# train_dataset=dataset["train"] +# eval_dataset=dataset["test"] + +# eval_dataset = eval_dataset.select(range(256)) + +quant_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_type=torch.bfloat16 +) + +#model_id = "meta-llama/Llama-2-7b-chat-hf" +model_id="openai-community/gpt2" +model = AutoModelForCausalLM.from_pretrained( + model_id, + #quantization_config=quant_config, + device_map="auto", + torch_dtype=torch.float16, + #attn_implementation="flash_attention_2", +) + +print(f"Param count: {sum(p.numel() for p in model.parameters())}") + +tokenizer = AutoTokenizer.from_pretrained(model_id) + +tokenizer.add_special_tokens({"pad_token":""}) +pad_token_id = tokenizer.pad_token_id +model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) + + +gen_config = model.generation_config +gen_config.max_new_tokens = 200 +gen_config.use_cache = True + + +peft_config = LoraConfig( + r=64, + lora_alpha=16, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", +) + +model.add_adapter(peft_config) +model.enable_adapters() + + +class DataCollatorForGenerationSFT: + def __init__(self, tokenizer, eval_mode=False): + self.tokenizer = tokenizer + self.eval_mode = eval_mode + + def __call__(self, examples): + texts, texts_eval = [], [] + for example in examples: + question = example["text"] + text_eval = question.split("### Assistant")[0] + texts.append(question.strip()) + texts_eval.append(f"{text_eval.strip()}### Assistant:") + + # Make sure we have right padding in train and left padding for eval parts + tokenizer.padding_side = "right" + batch = tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=512) + + if self.eval_mode: + tokenizer.padding_side = "left" + batch_eval = tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=512) + batch['generation_input_ids'] = 
batch_eval['input_ids'] + batch['generation_attention_mask'] = batch_eval['attention_mask'] + labels = batch["input_ids"].clone() + labels[labels == tokenizer.pad_token_id] = -100 # Ignore index for CE-loss + batch["labels"] = labels + return batch + + +def custom_metrics(prediction_dict): + # unmask for correct detokenization, because preds are padded to max length with -100 + preds = prediction_dict.predictions + preds[preds == -100] = pad_token_id + lbls = prediction_dict.label_ids + lbls[lbls == -100] = pad_token_id + + # Decode and do magic for metrics + preds = tokenizer.batch_decode(preds,skip_special_tokens=True) + lbls = tokenizer.batch_decode(lbls,skip_special_tokens=True) + return {"exact_match" : 0, "f1_score": 0} + + +training_args = TrainingArguments( + per_device_train_batch_size=8, + #per_device_eval_batch_size=128, + num_train_epochs=20, + do_train=True, + do_eval=True, + eval_strategy="steps", + eval_steps=500, + save_steps=500000, + bf16=True, + output_dir="test_predict", + overwrite_output_dir=True, + optim="adafactor", + report_to="wandb", + logging_steps=100000, + remove_unused_columns=False, + predict_with_generate=True, + generation_config=gen_config +) + + +trainer = Trainer( + model=model, + args=training_args, + data_collator=DataCollatorForGenerationSFT(tokenizer), + eval_data_collator=DataCollatorForGenerationSFT(tokenizer, eval_mode=True), + train_dataset=eval_dataset, + eval_dataset=eval_dataset, + compute_metrics=custom_metrics, +) + +trainer.evaluate() diff --git a/seq2seq.py b/seq2seq.py new file mode 100644 index 000000000000..2f623498d335 --- /dev/null +++ b/seq2seq.py @@ -0,0 +1,104 @@ +from datasets import load_dataset +import numpy as np +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments + + +raw_datasets = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True) +split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20) + +split_datasets["validation"] = split_datasets.pop("test") +split_datasets["train"][1]["translation"] + +split_datasets["validation"] = split_datasets["validation"].select(range(256)) + + +model_checkpoint = "google-t5/t5-base" # 11135332352 124439808 783150080 222903552 +model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) + +print(f"Param count: {sum(p.numel() for p in model.parameters())}") + + +tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt") + +en_sentence = split_datasets["train"][1]["translation"]["en"] +fr_sentence = split_datasets["train"][1]["translation"]["fr"] + +inputs = tokenizer(en_sentence, text_target=fr_sentence) + + +def preprocess_function(examples): + inputs = [ex["en"] for ex in examples["translation"]] + targets = [ex["fr"] for ex in examples["translation"]] + model_inputs = tokenizer( + inputs, text_target=targets, max_length=50, truncation=True + ) + return model_inputs + +tokenized_datasets = split_datasets.map( + preprocess_function, + batched=True, + remove_columns=split_datasets["train"].column_names, +) + + +from transformers import DataCollatorForSeq2Seq +import evaluate + +data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) +batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)]) +batch.keys() + +metric = evaluate.load("sacrebleu") + + +def compute_metrics(eval_preds): + preds, labels = eval_preds + # In case the model returns more than the prediction logits + if isinstance(preds, tuple): + preds = preds[0] + + decoded_preds = tokenizer.batch_decode(preds, 
skip_special_tokens=True) + + # Replace -100s in the labels as we can't decode them + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds = [pred.strip() for pred in decoded_preds] + decoded_labels = [[label.strip()] for label in decoded_labels] + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": result["score"]} + + +args = Seq2SeqTrainingArguments( + f"marian-finetuned-kde4-en-to-fr", + evaluation_strategy="no", + save_strategy="epoch", + learning_rate=2e-5, + per_device_train_batch_size=32, + per_device_eval_batch_size=64, + weight_decay=0.01, + save_total_limit=3, + num_train_epochs=3, + predict_with_generate=True, + fp16=True, + push_to_hub=True, + generation_max_length=250, +) + +from transformers import Seq2SeqTrainer + +model.generation_config.max_new_tokens=200 + +trainer = Seq2SeqTrainer( + model, + args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, +) + +trainer.evaluate() diff --git a/sft.py b/sft.py new file mode 100644 index 000000000000..6b6f7430ba8d --- /dev/null +++ b/sft.py @@ -0,0 +1,171 @@ +# flake8: noqa +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +# regular: +python sft.py \ + --model_name_or_path="openai-community/gpt2" \ + --dataset_text_field="text" \ + --report_to="wandb" \ + --learning_rate=1.41e-5 \ + --per_device_train_batch_size=64 \ + --gradient_accumulation_steps=16 \ + --output_dir="sft_openassistant-guanaco" \ + --logging_steps=1 \ + --num_train_epochs=3 \ + --max_steps=-1 \ + --push_to_hub \ + --gradient_checkpointing + +# peft: +python sft.py \ + --model_name_or_path="openai-community/gpt2" \ + --dataset_text_field="text" \ + --report_to="wandb" \ + --learning_rate=1.41e-5 \ + --per_device_train_batch_size=64 \ + --gradient_accumulation_steps=16 \ + --output_dir="sft_openassistant-guanaco" \ + --logging_steps=1 \ + --num_train_epochs=3 \ + --max_steps=-1 \ + --push_to_hub \ + --gradient_checkpointing \ + --use_peft \ + --lora_r=64 \ + --lora_alpha=16 +""" + +import logging +import os +from contextlib import nullcontext + +from trl.commands.cli_utils import init_zero_verbose, SFTScriptArguments, TrlParser +from trl.env_utils import strtobool + +TRL_USE_RICH = strtobool(os.getenv("TRL_USE_RICH", "0")) + +if TRL_USE_RICH: + init_zero_verbose() + FORMAT = "%(message)s" + + from rich.console import Console + from rich.logging import RichHandler + +import torch +from datasets import load_dataset + +from tqdm.rich import tqdm +from transformers import AutoTokenizer + +from trl import ( + ModelConfig, + RichProgressCallback, + SFTConfig, + SFTTrainer, + get_peft_config, + get_quantization_config, + get_kbit_device_map, +) + +tqdm.pandas() + +if TRL_USE_RICH: + logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO) + + +if __name__ == "__main__": + parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig)) + args, training_args, model_config = parser.parse_args_and_config() + + # Force use our print callback + if TRL_USE_RICH: + training_args.disable_tqdm = True + console = Console() + + ################ + # Model init kwargs & Tokenizer + ################ + quantization_config = get_quantization_config(model_config) + model_kwargs = dict( + revision=model_config.model_revision, + trust_remote_code=model_config.trust_remote_code, + attn_implementation=model_config.attn_implementation, + torch_dtype=model_config.torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + training_args.model_init_kwargs = model_kwargs + tokenizer = AutoTokenizer.from_pretrained( + model_config.model_name_or_path, trust_remote_code=model_config.trust_remote_code, use_fast=True + ) + tokenizer.pad_token = tokenizer.eos_token + + ################ + # Dataset + ################ + raw_datasets = load_dataset(args.dataset_name) + + train_dataset = raw_datasets[args.dataset_train_split] + eval_dataset = raw_datasets[args.dataset_test_split] + print("args.dataset_test_split", args.dataset_test_split) + + ################ + # Optional rich context managers + ############### + init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the SFTTrainer...") + save_context = ( + nullcontext() + if not TRL_USE_RICH + else console.status(f"[bold green]Training completed! 
Saving the model to {training_args.output_dir}") + ) + + import torch + + class DataCollatorForGenerationSFT: + def __init__(self): + pass + + def __call__(self, examples): + keys = examples[0].keys() + out = {} + for k in keys: + inputs = torch.stack([torch.tensor(ex[k]) for ex in examples]) + out[k] = inputs + + return out + + + ################ + # Training + ################ + training_args.predict_with_generate = True + with init_context: + trainer = SFTTrainer( + model=model_config.model_name_or_path, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + data_collator=DataCollatorForGenerationSFT(), + #eval_packing=True, + peft_config=get_peft_config(model_config), + callbacks=[RichProgressCallback] if TRL_USE_RICH else None, + ) + + trainer.evaluate() + + #with save_context: + # trainer.save_model(training_args.output_dir) From 19638d9f38c23cda78bb67b5774bc81e48a29334 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 30 Aug 2024 12:18:25 +0200 Subject: [PATCH 06/14] use wrapped model --- src/transformers/trainer.py | 2 +- src/transformers/trainer_seq2seq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1f51df8d4dfc..a9557f98f0dd 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -4138,7 +4138,7 @@ def prediction_step( for k, v in generation_inputs.items() if k.replace("generation_", "") not in gen_keys and "generation" not in k } - generated_tokens = self.model.generate( + generated_tokens = model.generate( **generation_inputs_common, **generation_only_inputs, generation_config=gen_config, diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index abc45cffe4ae..081e9e2dc944 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -307,7 +307,7 @@ def prediction_step( generation_inputs = { k: v for k, v in inputs.items() if k not in ("decoder_input_ids", "decoder_attention_mask") } - generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) + generated_tokens = model.generate(**generation_inputs, **gen_kwargs) # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop # TODO: remove this hack when the legacy code that initializes generation_config from a model config is From d66818ad454c1deae51ab4e7c74ff3b609bd9591 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 2 Sep 2024 13:39:24 +0500 Subject: [PATCH 07/14] Update src/transformers/trainer.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 31e958b2e73a..27005a0002f4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -551,7 +551,7 @@ def __init__( else default_data_collator ) self.data_collator = data_collator if data_collator is not None else default_collator - self.eval_data_collator = eval_data_collator if eval_data_collator is not None else data_collator + self.eval_data_collator = eval_data_collator if eval_data_collator is not None else self.data_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer From 2aedb2d8b39ddd8b525a0ed7a9644d17550de848 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 2 Sep 2024 13:39:36 +0500 Subject: [PATCH 08/14] Update 
src/transformers/trainer.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 27005a0002f4..926b3bdebb4b 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -991,7 +991,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None if eval_dataset is not None else self.eval_dataset ) - data_collator = self.eval_data_collator if self.eval_data_collator is not None else self.data_collator + data_collator = self.eval_data_collator if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") From d510046d2bbea6719202bf4d1be365ac72633332 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 2 Sep 2024 13:40:41 +0500 Subject: [PATCH 09/14] Update src/transformers/trainer.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 926b3bdebb4b..a8621643ea61 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -1033,7 +1033,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. It must implement `__len__`. """ - data_collator = self.eval_data_collator if self.eval_data_collator is not None else self.data_collator + data_collator = self.eval_data_collator if is_datasets_available() and isinstance(test_dataset, datasets.Dataset): test_dataset = self._remove_unused_columns(test_dataset, description="test") From 7dc778192c2a15a925e4ab51baea3ede255f32bc Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 2 Sep 2024 10:43:25 +0200 Subject: [PATCH 10/14] remove files --- run.py | 134 ----------------------------------------- seq2seq.py | 104 -------------------------------- sft.py | 171 ----------------------------------------------------- 3 files changed, 409 deletions(-) delete mode 100644 run.py delete mode 100644 seq2seq.py delete mode 100644 sft.py diff --git a/run.py b/run.py deleted file mode 100644 index b5396393919a..000000000000 --- a/run.py +++ /dev/null @@ -1,134 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig,TrainingArguments, Trainer -from datasets import load_dataset -from peft import LoraConfig -import torch -import time -from torchmetrics.text import SQuAD -from random import randrange -from transformers.utils import logging -from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM - -# timdettmers/openassistant-guanaco -# Stanford/web_questions -eval_dataset = load_dataset("timdettmers/openassistant-guanaco", split="test") - - -# train_dataset=dataset["train"] -# eval_dataset=dataset["test"] - -# eval_dataset = eval_dataset.select(range(256)) - -quant_config=BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_type=torch.bfloat16 -) - -#model_id = "meta-llama/Llama-2-7b-chat-hf" -model_id="openai-community/gpt2" -model = AutoModelForCausalLM.from_pretrained( - model_id, - #quantization_config=quant_config, - device_map="auto", - torch_dtype=torch.float16, - #attn_implementation="flash_attention_2", -) - -print(f"Param count: {sum(p.numel() for p in 
model.parameters())}") - -tokenizer = AutoTokenizer.from_pretrained(model_id) - -tokenizer.add_special_tokens({"pad_token":""}) -pad_token_id = tokenizer.pad_token_id -model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) - - -gen_config = model.generation_config -gen_config.max_new_tokens = 200 -gen_config.use_cache = True - - -peft_config = LoraConfig( - r=64, - lora_alpha=16, - lora_dropout=0.05, - bias="none", - task_type="CAUSAL_LM", -) - -model.add_adapter(peft_config) -model.enable_adapters() - - -class DataCollatorForGenerationSFT: - def __init__(self, tokenizer, eval_mode=False): - self.tokenizer = tokenizer - self.eval_mode = eval_mode - - def __call__(self, examples): - texts, texts_eval = [], [] - for example in examples: - question = example["text"] - text_eval = question.split("### Assistant")[0] - texts.append(question.strip()) - texts_eval.append(f"{text_eval.strip()}### Assistant:") - - # Make sure we have right padding in train and left padding for eval parts - tokenizer.padding_side = "right" - batch = tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=512) - - if self.eval_mode: - tokenizer.padding_side = "left" - batch_eval = tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=512) - batch['generation_input_ids'] = batch_eval['input_ids'] - batch['generation_attention_mask'] = batch_eval['attention_mask'] - labels = batch["input_ids"].clone() - labels[labels == tokenizer.pad_token_id] = -100 # Ignore index for CE-loss - batch["labels"] = labels - return batch - - -def custom_metrics(prediction_dict): - # unmask for correct detokenization, because preds are padded to max length with -100 - preds = prediction_dict.predictions - preds[preds == -100] = pad_token_id - lbls = prediction_dict.label_ids - lbls[lbls == -100] = pad_token_id - - # Decode and do magic for metrics - preds = tokenizer.batch_decode(preds,skip_special_tokens=True) - lbls = tokenizer.batch_decode(lbls,skip_special_tokens=True) - return {"exact_match" : 0, "f1_score": 0} - - -training_args = TrainingArguments( - per_device_train_batch_size=8, - #per_device_eval_batch_size=128, - num_train_epochs=20, - do_train=True, - do_eval=True, - eval_strategy="steps", - eval_steps=500, - save_steps=500000, - bf16=True, - output_dir="test_predict", - overwrite_output_dir=True, - optim="adafactor", - report_to="wandb", - logging_steps=100000, - remove_unused_columns=False, - predict_with_generate=True, - generation_config=gen_config -) - - -trainer = Trainer( - model=model, - args=training_args, - data_collator=DataCollatorForGenerationSFT(tokenizer), - eval_data_collator=DataCollatorForGenerationSFT(tokenizer, eval_mode=True), - train_dataset=eval_dataset, - eval_dataset=eval_dataset, - compute_metrics=custom_metrics, -) - -trainer.evaluate() diff --git a/seq2seq.py b/seq2seq.py deleted file mode 100644 index 2f623498d335..000000000000 --- a/seq2seq.py +++ /dev/null @@ -1,104 +0,0 @@ -from datasets import load_dataset -import numpy as np -from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments - - -raw_datasets = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True) -split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20) - -split_datasets["validation"] = split_datasets.pop("test") -split_datasets["train"][1]["translation"] - -split_datasets["validation"] = split_datasets["validation"].select(range(256)) - - -model_checkpoint = "google-t5/t5-base" # 11135332352 
124439808 783150080 222903552 -model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) - -print(f"Param count: {sum(p.numel() for p in model.parameters())}") - - -tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt") - -en_sentence = split_datasets["train"][1]["translation"]["en"] -fr_sentence = split_datasets["train"][1]["translation"]["fr"] - -inputs = tokenizer(en_sentence, text_target=fr_sentence) - - -def preprocess_function(examples): - inputs = [ex["en"] for ex in examples["translation"]] - targets = [ex["fr"] for ex in examples["translation"]] - model_inputs = tokenizer( - inputs, text_target=targets, max_length=50, truncation=True - ) - return model_inputs - -tokenized_datasets = split_datasets.map( - preprocess_function, - batched=True, - remove_columns=split_datasets["train"].column_names, -) - - -from transformers import DataCollatorForSeq2Seq -import evaluate - -data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) -batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)]) -batch.keys() - -metric = evaluate.load("sacrebleu") - - -def compute_metrics(eval_preds): - preds, labels = eval_preds - # In case the model returns more than the prediction logits - if isinstance(preds, tuple): - preds = preds[0] - - decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) - - # Replace -100s in the labels as we can't decode them - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - - # Some simple post-processing - decoded_preds = [pred.strip() for pred in decoded_preds] - decoded_labels = [[label.strip()] for label in decoded_labels] - - result = metric.compute(predictions=decoded_preds, references=decoded_labels) - return {"bleu": result["score"]} - - -args = Seq2SeqTrainingArguments( - f"marian-finetuned-kde4-en-to-fr", - evaluation_strategy="no", - save_strategy="epoch", - learning_rate=2e-5, - per_device_train_batch_size=32, - per_device_eval_batch_size=64, - weight_decay=0.01, - save_total_limit=3, - num_train_epochs=3, - predict_with_generate=True, - fp16=True, - push_to_hub=True, - generation_max_length=250, -) - -from transformers import Seq2SeqTrainer - -model.generation_config.max_new_tokens=200 - -trainer = Seq2SeqTrainer( - model, - args, - train_dataset=tokenized_datasets["train"], - eval_dataset=tokenized_datasets["validation"], - data_collator=data_collator, - tokenizer=tokenizer, - compute_metrics=compute_metrics, -) - -trainer.evaluate() diff --git a/sft.py b/sft.py deleted file mode 100644 index 6b6f7430ba8d..000000000000 --- a/sft.py +++ /dev/null @@ -1,171 +0,0 @@ -# flake8: noqa -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -# regular: -python sft.py \ - --model_name_or_path="openai-community/gpt2" \ - --dataset_text_field="text" \ - --report_to="wandb" \ - --learning_rate=1.41e-5 \ - --per_device_train_batch_size=64 \ - --gradient_accumulation_steps=16 \ - --output_dir="sft_openassistant-guanaco" \ - --logging_steps=1 \ - --num_train_epochs=3 \ - --max_steps=-1 \ - --push_to_hub \ - --gradient_checkpointing - -# peft: -python sft.py \ - --model_name_or_path="openai-community/gpt2" \ - --dataset_text_field="text" \ - --report_to="wandb" \ - --learning_rate=1.41e-5 \ - --per_device_train_batch_size=64 \ - --gradient_accumulation_steps=16 \ - --output_dir="sft_openassistant-guanaco" \ - --logging_steps=1 \ - --num_train_epochs=3 \ - --max_steps=-1 \ - --push_to_hub \ - --gradient_checkpointing \ - --use_peft \ - --lora_r=64 \ - --lora_alpha=16 -""" - -import logging -import os -from contextlib import nullcontext - -from trl.commands.cli_utils import init_zero_verbose, SFTScriptArguments, TrlParser -from trl.env_utils import strtobool - -TRL_USE_RICH = strtobool(os.getenv("TRL_USE_RICH", "0")) - -if TRL_USE_RICH: - init_zero_verbose() - FORMAT = "%(message)s" - - from rich.console import Console - from rich.logging import RichHandler - -import torch -from datasets import load_dataset - -from tqdm.rich import tqdm -from transformers import AutoTokenizer - -from trl import ( - ModelConfig, - RichProgressCallback, - SFTConfig, - SFTTrainer, - get_peft_config, - get_quantization_config, - get_kbit_device_map, -) - -tqdm.pandas() - -if TRL_USE_RICH: - logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO) - - -if __name__ == "__main__": - parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig)) - args, training_args, model_config = parser.parse_args_and_config() - - # Force use our print callback - if TRL_USE_RICH: - training_args.disable_tqdm = True - console = Console() - - ################ - # Model init kwargs & Tokenizer - ################ - quantization_config = get_quantization_config(model_config) - model_kwargs = dict( - revision=model_config.model_revision, - trust_remote_code=model_config.trust_remote_code, - attn_implementation=model_config.attn_implementation, - torch_dtype=model_config.torch_dtype, - use_cache=False if training_args.gradient_checkpointing else True, - device_map=get_kbit_device_map() if quantization_config is not None else None, - quantization_config=quantization_config, - ) - training_args.model_init_kwargs = model_kwargs - tokenizer = AutoTokenizer.from_pretrained( - model_config.model_name_or_path, trust_remote_code=model_config.trust_remote_code, use_fast=True - ) - tokenizer.pad_token = tokenizer.eos_token - - ################ - # Dataset - ################ - raw_datasets = load_dataset(args.dataset_name) - - train_dataset = raw_datasets[args.dataset_train_split] - eval_dataset = raw_datasets[args.dataset_test_split] - print("args.dataset_test_split", args.dataset_test_split) - - ################ - # Optional rich context managers - ############### - init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the SFTTrainer...") - save_context = ( - nullcontext() - if not TRL_USE_RICH - else console.status(f"[bold green]Training completed! 
Saving the model to {training_args.output_dir}") - ) - - import torch - - class DataCollatorForGenerationSFT: - def __init__(self): - pass - - def __call__(self, examples): - keys = examples[0].keys() - out = {} - for k in keys: - inputs = torch.stack([torch.tensor(ex[k]) for ex in examples]) - out[k] = inputs - - return out - - - ################ - # Training - ################ - training_args.predict_with_generate = True - with init_context: - trainer = SFTTrainer( - model=model_config.model_name_or_path, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - data_collator=DataCollatorForGenerationSFT(), - #eval_packing=True, - peft_config=get_peft_config(model_config), - callbacks=[RichProgressCallback] if TRL_USE_RICH else None, - ) - - trainer.evaluate() - - #with save_context: - # trainer.save_model(training_args.output_dir) From e21ba68ec4bc22c5c0a587e8c00851926e702342 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 2 Sep 2024 19:06:30 +0200 Subject: [PATCH 11/14] address comments --- src/transformers/trainer.py | 49 ++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a8621643ea61..d332a75bfe6a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -60,6 +60,7 @@ from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow from .feature_extraction_sequence_utils import SequenceFeatureExtractor +from .generation.configuration_utils import GenerationConfig from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend from .integrations.deepspeed import ( deepspeed_init, @@ -3810,20 +3811,24 @@ def evaluate( # Set generation-related kwargs if self.args.predict_with_generate: - if self.args.generation_config is not None: + if self.args.generation_config is None: + # We assume the model can generate if predict-with-generate is True + # Therefore, generation_config should be available + self.gen_config = self.model.generation_config + elif isinstance(self.args.generation_config, GenerationConfig): gen_config = self.args.generation_config self.gen_config = copy.deepcopy(gen_config) # copy so we don't modify args.gen_config in-place - unused_kwargs = self.gen_config.update(**gen_kwargs) - if unused_kwargs: - logger.warning_once( - f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: " - f"{' '.join(unused_kwargs.keys())} .", - "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", - ) else: - # We assume the model can generate if predict-with-generate is True - # Therefore, generation_config should be available + # That means `args.generation_config` is passed as a Dict self.gen_config = self.model.generation_config + _ = self.gen_config.update(**self.args.generation_config) + unused_kwargs = self.gen_config.update(**gen_kwargs) + if unused_kwargs: + logger.warning_once( + f"Following generation related kwargs were passed to `evaluate` but not used by `generate()`: " + f"{' '.join(unused_kwargs.keys())} .", + "Make sure there are no typos in the passed kwargs or do not pass unused kwargs.", + ) # memory metrics - must set up as early as possible self._memory_tracker.start() @@ -3914,20 +3919,24 @@ def predict( """ # Set generation-related kwargs if self.args.predict_with_generate: - if 
From 3e5411ce61ecf440688ba82cff95c5226c181288 Mon Sep 17 00:00:00 2001
From: raushan
Date: Tue, 3 Sep 2024 11:38:13 +0200
Subject: [PATCH 12/14] add example & docstring nits

---
 .../run_vlm.py                    | 144 ++++++++++++++++++
 src/transformers/training_args.py |   2 +-
 2 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 examples/pytorch/multimodal_language_modeling.py/run_vlm.py

diff --git a/examples/pytorch/multimodal_language_modeling.py/run_vlm.py b/examples/pytorch/multimodal_language_modeling.py/run_vlm.py
new file mode 100644
index 000000000000..04702ae5c9e0
--- /dev/null
+++ b/examples/pytorch/multimodal_language_modeling.py/run_vlm.py
@@ -0,0 +1,144 @@
+import random
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from Levenshtein import distance as levenshtein_distance
+from peft import LoraConfig
+
+from transformers import (
+    AutoProcessor,
+    BitsAndBytesConfig,
+    Idefics2ForConditionalGeneration,
+    Trainer,
+    TrainingArguments,
+)
+
+
+DEVICE = "cuda:0"
+processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", do_image_splitting=False)
+pad_token_id = processor.tokenizer.pad_token_id
+
+lora_config = LoraConfig(
+    r=8,
+    lora_alpha=8,
+    lora_dropout=0.1,
+    target_modules=".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$",
+    use_dora=False,
+    init_lora_weights="gaussian",
+)
+
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
+
+model = Idefics2ForConditionalGeneration.from_pretrained(
+    "HuggingFaceM4/idefics2-8b",
+    torch_dtype=torch.float16,
+    quantization_config=bnb_config,
+)
+
+model.add_adapter(lora_config)
+model.enable_adapters()
+
+train_dataset = load_dataset("nielsr/docvqa_1200_examples", split="train")
+train_dataset = train_dataset.remove_columns(["id", "words", "bounding_boxes", "answer"])
+
+eval_dataset = load_dataset("nielsr/docvqa_1200_examples", split="test")
+eval_dataset = eval_dataset.remove_columns(["id", "words", "bounding_boxes", "answer"])
+
+
+class DataCollatorForGeneration:
+    def __init__(self, processor, eval_mode=False):
+        self.processor = processor
+        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
+            processor.tokenizer.additional_special_tokens.index("<image>")
+        ]
+        self.eval_mode = eval_mode
+
+    def __call__(self, examples):
+        texts, texts_eval = [], []
+        images = []
+        for example in examples:
+            image = example["image"]
+            question = example["query"]["en"]
+            answer = random.choice(example["answers"])
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Answer briefly."},
+                        {"type": "image"},
+                        {"type": "text", "text": question},
+                    ],
+                },
+                {"role": "assistant", "content": [{"type": "text", "text": answer}]},
+            ]
+            text = processor.apply_chat_template(messages, add_generation_prompt=False)
+            text_eval = processor.apply_chat_template([messages[0]], add_generation_prompt=True)
+            texts.append(text.strip())
+            texts_eval.append(text_eval.strip())
+            images.append([image])
+
+        # Right padding for the training inputs; generation inputs below are left-padded instead
+        processor.tokenizer.padding_side = "right"
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+        if self.eval_mode:
+            processor.tokenizer.padding_side = "left"
+            batch_eval = processor(text=texts_eval, images=images, return_tensors="pt", padding=True)
+            batch["generation_input_ids"] = batch_eval["input_ids"]
+            batch["generation_attention_mask"] = batch_eval["attention_mask"]
+
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id  # map pad positions to the image token id
+        batch["labels"] = labels
+
+        return batch
+
+
+def calculate_levenshtein(prediction_dict):
+    # Unmask for correct detokenization, because preds are padded to max length with -100
+    preds = prediction_dict.predictions
+    preds[preds == -100] = pad_token_id
+    lbls = prediction_dict.label_ids
+    lbls[lbls == -100] = pad_token_id
+
+    # Decode and average the Levenshtein distance between predictions and references
+    preds = processor.batch_decode(preds)
+    lbls = processor.batch_decode(lbls)
+    levenshtein_avg = np.mean([levenshtein_distance(pred, lbl) for pred, lbl in zip(preds, lbls)])
+    return {"eval_levenshtein": levenshtein_avg}
+
+
+generation_config = model.generation_config
+generation_config.max_length = 200  # generate no more than 200 tokens in total (image tokens count toward this limit)
+
+training_args = TrainingArguments(
+    max_steps=1000,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=8,
+    gradient_accumulation_steps=2,
+    output_dir="/raid/raushan/idefics-train",
+    eval_strategy="steps",
+    fp16=True,
+    eval_steps=10,
+    save_steps=10,
+    remove_unused_columns=False,
+    report_to="wandb",
+    predict_with_generate=True,  # generate in the eval step so we can compute text-based metrics
+    generation_config=generation_config,
+    metric_for_best_model="levenshtein",  # track the model with the lowest Levenshtein distance
+    greater_is_better=False,
+)
+
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    data_collator=DataCollatorForGeneration(processor),
+    eval_data_collator=DataCollatorForGeneration(processor, eval_mode=True),
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=calculate_levenshtein,
+)
+
+trainer.train()  # will run train and eval on the model
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index bd9520977971..ee91037e55d8 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1520,7 +1520,7 @@ class TrainingArguments:
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
-    generation_config: Optional[GenerationConfig] = field(
+    generation_config: Optional[Union[Dict, GenerationConfig]] = field(
         default=None,
         metadata={
             "help": (
From e342f8b9a1379337c2c5bda3f4748523e7d6fcec Mon Sep 17 00:00:00 2001
From: raushan
Date: Tue, 3 Sep 2024 11:43:36 +0200
Subject: [PATCH 13/14] docs

---
 src/transformers/training_args.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index ee91037e55d8..9479ad8f67b7 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -796,7 +796,7 @@ class TrainingArguments:
             Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch.
         predict_with_generate (`bool`, *optional*, defaults to `False`):
             Whether to use generate to calculate generative metrics (ROUGE, BLEU).
-        generation_config ([`~generation.GenerationConfig`], *optional*):
+        generation_config (Union[`~generation.GenerationConfig`, `Dict`], *optional*):
             The [`~generation.GenerationConfig`] object that will be used during generation if `predict_with_generate` is set to `True`. Arguments passed in GenerationConfig will have higher priority than the model's generation config. Anything not set by this config will fall back to `model.generation_config` by default.
@@ -1520,7 +1520,7 @@ class TrainingArguments:
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
-    generation_config: Optional[Union[Dict, GenerationConfig]] = field(
+    generation_config: Union[Dict, GenerationConfig] = field(
         default=None,
         metadata={
             "help": (

From bddd18b46cb5f60fcad509277e1f211c718e5fd0 Mon Sep 17 00:00:00 2001
From: raushan
Date: Wed, 4 Sep 2024 13:39:56 +0200
Subject: [PATCH 14/14] fix tests

---
 src/transformers/training_args.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 9479ad8f67b7..a8994766af06 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -191,6 +191,7 @@ class OptimizerNames(ExplicitEnum):
     "deepspeed",
     "gradient_checkpointing_kwargs",
     "lr_scheduler_kwargs",
+    "generation_config",
 ]
 
 
@@ -1520,7 +1521,7 @@ class TrainingArguments:
     predict_with_generate: bool = field(
         default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
     )
-    generation_config: Union[Dict, GenerationConfig] = field(
+    generation_config: Optional[Union[dict, str, GenerationConfig]] = field(
        default=None,
        metadata={
            "help": (