34 changes: 32 additions & 2 deletions src/transformers/processing_utils.py
@@ -22,6 +22,7 @@
import os
import sys
import typing
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated, Any, Literal, TypedDict, TypeVar, Union
@@ -1424,11 +1425,32 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

prebuilt = cls._pop_prebuilt_subprocessors(kwargs)

# Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
args = cls._get_arguments_from_pretrained(
pretrained_model_name_or_path, processor_dict, _prebuilt=prebuilt, **kwargs
)
return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs)

@classmethod
def _pop_prebuilt_subprocessors(cls, kwargs: dict) -> dict:
"""Pop pre-built sub-processors from `kwargs` by exact attribute name, or by modality
alias (e.g. `tokenizer=` → `bpe_tokenizer`) when that modality is unambiguous.
"""
sub_processors = cls.get_attributes()
modality_counts = Counter(_get_modality_for_attribute(s) for s in sub_processors)
prebuilt = {}
for sub_processor_type in sub_processors:
modality = _get_modality_for_attribute(sub_processor_type)
instance = kwargs.pop(sub_processor_type, None)
if instance is None and modality != sub_processor_type and modality_counts[modality] == 1:
instance = kwargs.pop(modality, None)
if instance is not None:
prebuilt[sub_processor_type] = instance
return prebuilt
Comment on lines +1438 to +1452
zucchini-nlp (Member) commented on Apr 24, 2026:
Not really sure about this. The error from the GH issue is due to old remote code, and we don't yet support Pi0-FAST natively in transformers. Also cc @yonigozlan, I guess you might have seen a similar issue when refactoring processor loading.

We're planning native support though, and are waiting for the lerobot team to test and convert the configs correctly.

The PR author (Contributor) replied:
Thanks for taking a look, @zucchini-nlp!

Quick scope note: this PR isn't targeting the OP's zero-kwarg traceback (that's the hub layout / old remote code path you mentioned, which I agree is out of scope here and will be obsoleted by native support). It's targeting @ArthurZucker's follow-up comment on the issue:

p = AutoProcessor.from_pretrained("physical-intelligence/fast", tokenizer=tokenizer, trust_remote_code=True, use_fast=False)

"this does not work and it should!"

The underlying behavior is general to ProcessorMixin: when a caller supplies a pre-built sub-processor via kwargs (whether tokenizer= or the exact attribute name like bpe_tokenizer=), the instance is silently dropped and the loader tries to reload from disk anyway. Any processor with a non-primary tokenizer attribute runs into this, so native Pi0-FAST support wouldn't fix it on its own; it would just mean one fewer processor hitting it.
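
For illustration, here's a minimal repro sketch of that pattern. The `CustomProcessor` class and the paths are hypothetical and mirror the new test, not anything shipped in the library:

```python
import tempfile

from transformers import AutoTokenizer
from transformers.processing_utils import ProcessorMixin


# Hypothetical processor whose tokenizer attribute is not the primary
# "tokenizer" name -- the situation described above.
class CustomProcessor(ProcessorMixin):
    def __init__(self, bpe_tokenizer):
        super().__init__(bpe_tokenizer)


tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")

with tempfile.TemporaryDirectory() as tmp_dir:
    CustomProcessor(bpe_tokenizer=tokenizer).save_pretrained(tmp_dir)

    # Without this change, the pre-built instance is silently dropped and the
    # loader tries to reload a tokenizer from disk; with it, the caller's
    # instance is reused -- whether passed by the exact attribute name
    # (bpe_tokenizer=) or by the modality alias (tokenizer=), which is
    # unambiguous here because there is only one tokenizer attribute.
    loaded = CustomProcessor.from_pretrained(tmp_dir, tokenizer=tokenizer)
    assert loaded.bpe_tokenizer is tokenizer
```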

That said, happy to defer fully. If you and @yonigozlan / @ArthurZucker feel this should wait (or be folded into the native support work, or handled differently), I'm glad to close or rescope; just let me know.

A maintainer (Member) replied:
Yeah, totally get it. Personally, I think we can deliberately not support this, since it's remote code and not v5-compatible, unless Arthur/Yoni have a different opinion.


@classmethod
def get_attributes(cls):
args_in_init = inspect.signature(cls.__init__).parameters.keys()
@@ -1499,7 +1521,9 @@ def _load_tokenizer_from_pretrained(
return tokenizer

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
def _get_arguments_from_pretrained(
cls, pretrained_model_name_or_path, processor_dict=None, *, _prebuilt=None, **kwargs
):
"""
Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
@@ -1517,15 +1541,21 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor
pretrained_model_name_or_path: Path or model id to load from.
processor_dict: Optional dict containing processor config (from processor_config.json).
Required when loading additional non-tokenizer sub-processors.
_prebuilt: Optional `{attribute: instance}` dict of pre-built sub-processors that skip loading.
"""
args = []
processor_dict = processor_dict if processor_dict is not None else {}
# Remove subfolder from kwargs to avoid duplicate keyword arguments
subfolder = kwargs.pop("subfolder", "")

prebuilt = _prebuilt or {}

# get args from processor init signature
sub_processors = cls.get_attributes()
for sub_processor_type in sub_processors:
if sub_processor_type in prebuilt:
args.append(prebuilt[sub_processor_type])
continue
modality = _get_modality_for_attribute(sub_processor_type)
is_primary = sub_processor_type == modality

40 changes: 40 additions & 0 deletions tests/models/auto/test_processor_auto.py
@@ -498,6 +498,46 @@ def __init__(self, tokenizer, decoder_tokenizer, image_processor):
# Verify image processor loaded correctly
self.assertEqual(loaded_processor.image_processor.size, image_processor.size)

def test_processor_from_pretrained_with_prebuilt_tokenizer_kwarg(self):
class SingleTokenizerProcessor(ProcessorMixin):
def __init__(self, bpe_tokenizer):
super().__init__(bpe_tokenizer)

class DualTokenizerProcessor(ProcessorMixin):
def __init__(self, bpe_tokenizer, decoder_tokenizer):
super().__init__(bpe_tokenizer, decoder_tokenizer)

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")

self.assertEqual(
SingleTokenizerProcessor._pop_prebuilt_subprocessors({"tokenizer": tokenizer}),
{"bpe_tokenizer": tokenizer},
)
ambiguous_kwargs = {"tokenizer": tokenizer}
self.assertEqual(DualTokenizerProcessor._pop_prebuilt_subprocessors(ambiguous_kwargs), {})
self.assertIn("tokenizer", ambiguous_kwargs)

with tempfile.TemporaryDirectory() as tmp_dir:
SingleTokenizerProcessor(bpe_tokenizer=tokenizer).save_pretrained(tmp_dir)

loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, bpe_tokenizer=tokenizer)
self.assertIs(loaded.bpe_tokenizer, tokenizer)

loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, tokenizer=tokenizer)
self.assertIs(loaded.bpe_tokenizer, tokenizer)

loaded, unused = SingleTokenizerProcessor.from_pretrained(
tmp_dir, tokenizer=tokenizer, return_unused_kwargs=True
)
self.assertIs(loaded.bpe_tokenizer, tokenizer)
self.assertNotIn("tokenizer", unused)

loaded, unused = SingleTokenizerProcessor.from_pretrained(
tmp_dir, bpe_tokenizer=tokenizer, return_unused_kwargs=True
)
self.assertIs(loaded.bpe_tokenizer, tokenizer)
self.assertNotIn("bpe_tokenizer", unused)

def test_processor_with_multiple_image_processors_save_load(self):
"""Test that processors with multiple image processors save and load correctly."""
