diff --git a/nemo_rl/models/generation/vllm/vllm_worker.py b/nemo_rl/models/generation/vllm/vllm_worker.py index 75e3334d4a..d2de455c22 100644 --- a/nemo_rl/models/generation/vllm/vllm_worker.py +++ b/nemo_rl/models/generation/vllm/vllm_worker.py @@ -388,6 +388,14 @@ def _patch_vllm_vit_flash_attn_backend(): ) # disable quantization vllm_kwargs["hf_overrides"]["quantization_config"] = {} + elif "Gemma3ForConditionalGeneration" in getattr( + hf_config, "architectures", [] + ): + if self.cfg["vllm_cfg"]["skip_tokenizer_init"]: + print( + "Gemma3ForConditionalGeneration models may crash when skip_tokenizer_init is True. NeMo-RL is forcing it to False for this architecture. See https://github.com/NVIDIA-NeMo/RL/issues/1681 for more details." + ) + self.cfg["vllm_cfg"]["skip_tokenizer_init"] = False llm_kwargs = dict( model=self.model_name,