diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index bb1344a43dcf..76d58a757c2e 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -22,6 +22,7 @@
 import os
 import sys
 import typing
+from collections import Counter
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated, Any, Literal, TypedDict, TypeVar, Union
@@ -1424,11 +1425,32 @@ def from_pretrained(
         if token is not None:
             kwargs["token"] = token
 
+        prebuilt = cls._pop_prebuilt_subprocessors(kwargs)
+
         # Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
         processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
-        args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
+        args = cls._get_arguments_from_pretrained(
+            pretrained_model_name_or_path, processor_dict, _prebuilt=prebuilt, **kwargs
+        )
         return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs)
 
+    @classmethod
+    def _pop_prebuilt_subprocessors(cls, kwargs: dict) -> dict:
+        """Pop pre-built sub-processors from `kwargs` by exact attribute name, or by modality
+        alias (e.g. `tokenizer=` → `bpe_tokenizer`) when that modality is unambiguous.
+        """
+        sub_processors = cls.get_attributes()
+        modality_counts = Counter(_get_modality_for_attribute(s) for s in sub_processors)
+        prebuilt = {}
+        for sub_processor_type in sub_processors:
+            modality = _get_modality_for_attribute(sub_processor_type)
+            instance = kwargs.pop(sub_processor_type, None)
+            if instance is None and modality != sub_processor_type and modality_counts[modality] == 1:
+                instance = kwargs.pop(modality, None)
+            if instance is not None:
+                prebuilt[sub_processor_type] = instance
+        return prebuilt
+
     @classmethod
     def get_attributes(cls):
         args_in_init = inspect.signature(cls.__init__).parameters.keys()
@@ -1499,7 +1521,9 @@ def _load_tokenizer_from_pretrained(
         return tokenizer
 
     @classmethod
-    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
+    def _get_arguments_from_pretrained(
+        cls, pretrained_model_name_or_path, processor_dict=None, *, _prebuilt=None, **kwargs
+    ):
         """
         Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
         and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
@@ -1517,15 +1541,21 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor
             pretrained_model_name_or_path: Path or model id to load from.
             processor_dict: Optional dict containing processor config (from processor_config.json).
                 Required when loading additional non-tokenizer sub-processors.
+            _prebuilt: Optional `{attribute: instance}` dict of pre-built sub-processors for which loading is skipped.
         """
         args = []
         processor_dict = processor_dict if processor_dict is not None else {}
 
         # Remove subfolder from kwargs to avoid duplicate keyword arguments
        subfolder = kwargs.pop("subfolder", "")
+        prebuilt = _prebuilt or {}
+
         # get args from processor init signature
         sub_processors = cls.get_attributes()
 
         for sub_processor_type in sub_processors:
+            if sub_processor_type in prebuilt:
+                args.append(prebuilt[sub_processor_type])
+                continue
             modality = _get_modality_for_attribute(sub_processor_type)
             is_primary = sub_processor_type == modality
diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py
index c029ae2cf97d..a8185b55597a 100644
--- a/tests/models/auto/test_processor_auto.py
+++ b/tests/models/auto/test_processor_auto.py
@@ -498,6 +498,46 @@ def __init__(self, tokenizer, decoder_tokenizer, image_processor):
         # Verify image processor loaded correctly
         self.assertEqual(loaded_processor.image_processor.size, image_processor.size)
 
+    def test_processor_from_pretrained_with_prebuilt_tokenizer_kwarg(self):
+        class SingleTokenizerProcessor(ProcessorMixin):
+            def __init__(self, bpe_tokenizer):
+                super().__init__(bpe_tokenizer)
+
+        class DualTokenizerProcessor(ProcessorMixin):
+            def __init__(self, bpe_tokenizer, decoder_tokenizer):
+                super().__init__(bpe_tokenizer, decoder_tokenizer)
+
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
+
+        # The `tokenizer` modality alias resolves when the class has a single tokenizer attribute.
+        self.assertEqual(
+            SingleTokenizerProcessor._pop_prebuilt_subprocessors({"tokenizer": tokenizer}),
+            {"bpe_tokenizer": tokenizer},
+        )
+        # With two tokenizer attributes the alias is ambiguous, so it must be left in kwargs.
+        ambiguous_kwargs = {"tokenizer": tokenizer}
+        self.assertEqual(DualTokenizerProcessor._pop_prebuilt_subprocessors(ambiguous_kwargs), {})
+        self.assertIn("tokenizer", ambiguous_kwargs)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            SingleTokenizerProcessor(bpe_tokenizer=tokenizer).save_pretrained(tmp_dir)
+
+            loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, bpe_tokenizer=tokenizer)
+            self.assertIs(loaded.bpe_tokenizer, tokenizer)
+
+            loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, tokenizer=tokenizer)
+            self.assertIs(loaded.bpe_tokenizer, tokenizer)
+
+            loaded, unused = SingleTokenizerProcessor.from_pretrained(
+                tmp_dir, tokenizer=tokenizer, return_unused_kwargs=True
+            )
+            self.assertIs(loaded.bpe_tokenizer, tokenizer)
+            self.assertNotIn("tokenizer", unused)
+
+            loaded, unused = SingleTokenizerProcessor.from_pretrained(
+                tmp_dir, bpe_tokenizer=tokenizer, return_unused_kwargs=True
+            )
+            self.assertIs(loaded.bpe_tokenizer, tokenizer)
+            self.assertNotIn("bpe_tokenizer", unused)
+
     def test_processor_with_multiple_image_processors_save_load(self):
         """Test that processors with multiple image processors save and load correctly."""