From 85b4e7083ae01f400f42224ecd178c7d4900b015 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 7 Apr 2026 11:17:25 +0200 Subject: [PATCH 01/11] start updating tests --- tests/models/gemma4/test_modeling_gemma4.py | 198 ++++---------------- 1 file changed, 36 insertions(+), 162 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index c63e9ba20165..50ea0986fd52 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -20,7 +20,6 @@ from parameterized import parameterized from transformers import ( - AutoModelForCausalLM, AutoTokenizer, Gemma4Config, Gemma4TextConfig, @@ -34,7 +33,6 @@ require_flash_attn, require_torch, require_torch_accelerator, - require_torch_large_accelerator, slow, torch_device, ) @@ -424,7 +422,8 @@ def test_generate_from_random_inputs_embeds(self): @require_torch_accelerator class Gemma4IntegrationTest(unittest.TestCase): def setUp(self): - self.processor = Gemma4Processor.from_pretrained("google/gemma-4-e2b-it", padding_side="left") + self.model_name = "google/gemma-4-E2B-it" + self.processor = Gemma4Processor.from_pretrained(self.model_name, padding_side="left") url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" self.messages = [ @@ -443,9 +442,7 @@ def tearDown(self): @require_deterministic_for_xpu def test_model_4b_bf16(self): - model_id = "google/gemma-4-e2b-it" - - model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) inputs = self.processor.apply_chat_template( self.messages, @@ -455,28 +452,20 @@ def test_model_4b_bf16(self): add_generation_prompt=True, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'], - ("cuda", (8, 0)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'], - ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'], - ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'], - ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant coastline in the background. 
It looks'], + ("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - @require_torch_large_accelerator @require_deterministic_for_xpu def test_model_4b_batch(self): - model_id = "google/gemma-4-e2b-it" - - model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) messages_2 = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, @@ -502,47 +491,23 @@ def test_model_4b_batch(self): add_generation_prompt=True, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("xpu", 3): - [ - 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and', - 'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.', - ], - ("cuda", (8,0)): - [ - 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like', - "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown" - ], - ("cuda", (8,6)): + ("cuda", 8): [ 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like', "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown" ], - ("rocm", (9, 4)): - [ - 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like', - "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow" - ], - ("rocm", (9, 5)): - [ - 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. There are some clouds in the blue', - 'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes. 
\n\n* **Image 1** shows a cow standing on a beach', - ], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - @require_torch_large_accelerator def test_model_4b_crops(self): - model_id = "google/gemma-4-e2b-it" - - model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) crop_config = { "images_kwargs": { @@ -562,19 +527,13 @@ def test_model_4b_crops(self): **crop_config, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images EXPECTED_TEXTS = Expectations( { - ("xpu", 3): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'], - ("cuda", 7): [], - ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a clear blue sky with some white clouds above."], - ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"], - ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"], - ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"] + ("cuda", 8): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. 
There's a blue sky with some white clouds in the background"], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -582,12 +541,9 @@ def test_model_4b_crops(self): print(f"Generated text: {output_text}") self.assertEqual(output_text, EXPECTED_TEXT) - @require_torch_large_accelerator @require_deterministic_for_xpu def test_model_4b_batch_crops(self): - model_id = "google/gemma-4-e2b-it" - - model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) crop_config = { "images_kwargs": { "do_pan_and_scan": True, @@ -621,32 +577,14 @@ def test_model_4b_batch_crops(self): **crop_config, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9 EXPECTED_TEXTS = Expectations( { - ("xpu", 3): [ - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.', - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a', - ], - ("cuda", 7): [], - ("cuda", (8,0)): [ - "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background", - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a' - ], - ("cuda", (8, 6)): [ - "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the", - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. 
\n\nThe first image shows a cow on a beach, while the second image shows a street scene with a' - ], - ("rocm", (9, 4)) : [ + ("cuda", 8): [ "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the", 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a' - ], - ("rocm", (9, 5)) : [ - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.', - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a', ], } ) # fmt: skip @@ -654,11 +592,8 @@ def test_model_4b_batch_crops(self): self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) self.assertEqual(output_text, EXPECTED_TEXT) - @require_torch_large_accelerator def test_model_4b_multiimage(self): - model_id = "google/gemma-4-e2b-it" - - model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) messages = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, @@ -680,17 +615,11 @@ def test_model_4b_multiimage(self): add_generation_prompt=True, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Chinese Arch"], - ("cuda", 7): [], - ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"], - ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"], - ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a vibrant"], - 
("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"], + ("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -704,32 +633,23 @@ def test_model_1b_text_only(self): tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("xpu", 3): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'], - ("cuda", 7): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'], ("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'], - ("rocm", (9, 4)): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'], - ("rocm", (9, 5)): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - # TODO: raushan FA2 generates gibberish for no reason, check later @require_flash_attn - @require_torch_large_accelerator @pytest.mark.flash_attn_test def test_model_4b_flash_attn(self): - model_id = "google/gemma-4-e2b-it" - model = Gemma4ForConditionalGeneration.from_pretrained( - model_id, dtype=torch.bfloat16, attn_implementation="flash_attention_2" - ).to(torch_device) + self.model_name, device_map=torch_device, attn_implementation="flash_attention_2" + ) inputs = self.processor.apply_chat_template( self.messages, @@ -739,16 +659,12 @@ def test_model_4b_flash_attn(self): add_generation_prompt=True, ).to(torch_device) - # cache_implementation="hybrid" an in the original transformers implementation - output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid") + output = model.generate(**inputs, max_new_tokens=30, do_sample=False) output_text = self.processor.batch_decode(output, skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. 
It looks like a sunny day'], - ("cuda", 7): [], ("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'], - ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with a turquoise ocean and a distant island in the background. It looks like a sunny'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -756,11 +672,9 @@ def test_model_4b_flash_attn(self): @parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)]) def test_generation_beyond_sliding_window(self, attn_implementation: str): - """Test that we can correctly generate beyond the sliding window. This is non trivial as - we need to correctly slice the attention mask in all cases (because we use a hybrid cache). - Outputs for every attention functions should be coherent and identical. + """Test that we can correctly generate beyond the sliding window. Outputs for every attention functions + should be coherent and identical. """ - model_id = "google/gemma-3-1b-it" if attn_implementation == "flash_attention_2" and not is_flash_attn_2_available(): self.skipTest("FlashAttention2 is required for this test.") @@ -769,12 +683,14 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): "This is a nice place. " * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens "A list of colors: red, blue", # This will almost all be padding tokens ] - tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left") + tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding="left") inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) - model = AutoModelForCausalLM.from_pretrained( - model_id, attn_implementation=attn_implementation, dtype=torch.float16 - ).to(torch_device) + model = Gemma4ForConditionalGeneration.from_pretrained( + self.model_name, + device_map=torch_device, + attn_implementation=attn_implementation, + ) # Make sure prefill is larger than sliding window input_size = inputs.input_ids.shape[-1] @@ -795,72 +711,30 @@ def test_export_text_only_with_hybrid_cache(self): from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM - model_id = "google/gemma-3-1b-it" - model = AutoModelForCausalLM.from_pretrained(model_id) - self.assertEqual(model.config.cache_implementation, "hybrid") + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name) # Export + hybrid cache - model.eval() exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024) exported_program = exportable_module.export( input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device), ) - logging.info(f"\nExported program: {exported_program}") # Test generation with the exported model prompt = "What is the capital of France?" 
max_new_tokens_to_generate = 20 # Generate text with the exported model - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) export_generated_text = TorchExportableModuleForDecoderOnlyLM.generate( exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate ) logging.info(f"\nExport generated texts: '{export_generated_text}'") input_text = tokenizer(prompt, return_tensors="pt") - with torch.no_grad(): - eager_outputs = model.generate( - **input_text, - max_new_tokens=max_new_tokens_to_generate, - do_sample=False, # Use greedy decoding to match the exported model - cache_implementation="hybrid", - ) + eager_outputs = model.generate( + **input_text, + max_new_tokens=max_new_tokens_to_generate, + do_sample=False, # Use greedy decoding to match the exported model + ) eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True) - logging.info(f"\nEager generated texts: '{eager_generated_text}'") - self.assertEqual(export_generated_text, eager_generated_text) - - def test_dynamic_sliding_window_is_default(self): - """ - Test that the dynamic sliding window cache (added in #40039) is the default cache implementation for Gemma4 - models, despite the fact that Hub checkpoints may have `cache_implementation="hybrid"` (static sliding window). - """ - model_id = "google/gemma-3-1b-it" - model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") - - # the default cache is static sliding window - self.assertEqual(model.config.cache_implementation, "hybrid") - self.assertEqual(model.generation_config.cache_implementation, "hybrid") - - tokenizer = AutoTokenizer.from_pretrained(model_id) - prompt = "What is the capital of France?" - model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - - foward_outputs = model(**model_inputs) - self.assertIn("DynamicSlidingWindowLayer", str(foward_outputs.past_key_values)) - - generate_outputs = model.generate( - **model_inputs, max_new_tokens=2, do_sample=False, return_dict_in_generate=True - ) - self.assertIn("DynamicSlidingWindowLayer", str(generate_outputs.past_key_values)) - - # If we manually specify the cache implementation = "hybrid", it will use the static sliding window cache - generate_outputs = model.generate( - **model_inputs, - max_new_tokens=2, - do_sample=False, - return_dict_in_generate=True, - cache_implementation="hybrid", - ) - self.assertNotIn("DynamicSlidingWindowLayer", str(generate_outputs.past_key_values)) From b8a649e01ea4402668c0dd032bd609ef69957ad9 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 7 Apr 2026 12:19:44 +0200 Subject: [PATCH 02/11] start making them pass --- tests/models/gemma4/test_modeling_gemma4.py | 135 ++++---------------- 1 file changed, 27 insertions(+), 108 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 50ea0986fd52..ece39d5a0d48 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -29,7 +29,6 @@ Expectations, cleanup, is_flash_attn_2_available, - require_deterministic_for_xpu, require_flash_attn, require_torch, require_torch_accelerator, @@ -47,6 +46,7 @@ import torch from transformers import ( + AutoModelForCausalLM, Gemma4ForCausalLM, Gemma4ForConditionalGeneration, Gemma4Model, @@ -417,7 +417,6 @@ def test_generate_from_random_inputs_embeds(self): pass -@unittest.skip("Integration Tests are not up-to-date yet! 
TODO Cyril: update me pretty pretty please!") @slow @require_torch_accelerator class Gemma4IntegrationTest(unittest.TestCase): @@ -440,8 +439,7 @@ def setUp(self): def tearDown(self): cleanup(torch_device, gc_collect=True) - @require_deterministic_for_xpu - def test_model_4b_bf16(self): + def test_model_4b(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) inputs = self.processor.apply_chat_template( @@ -453,17 +451,17 @@ def test_model_4b_bf16(self): ).to(torch_device) output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, skip_special_tokens=True) + input_size = inputs.input_ids.shape[-1] + output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'], + ("cuda", 8): ['This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - @require_deterministic_for_xpu def test_model_4b_batch(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) @@ -492,153 +490,73 @@ def test_model_4b_batch(self): ).to(torch_device) output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, skip_special_tokens=True) + input_size = inputs.input_ids.shape[-1] + output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { ("cuda", 8): [ - 'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like', - "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. 
\n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown" + 'This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background', + 'No, these images are not identical.\n\nThe first image is a photograph of a **cow** standing on a beach under a blue sky.\n\n' ], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - def test_model_4b_crops(self): + def test_model_4b_multiimage(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) - crop_config = { - "images_kwargs": { - "do_pan_and_scan": True, - "pan_and_scan_max_num_crops": 448, - "pan_and_scan_min_crop_size": 32, - "pan_and_scan_min_ratio_to_activate": 0.3, - } - } - - inputs = self.processor.apply_chat_template( - self.messages, - tokenize=True, - return_dict=True, - return_tensors="pt", - add_generation_prompt=True, - **crop_config, - ).to(torch_device) - - output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, skip_special_tokens=True) - - EXPECTED_NUM_IMAGES = 3 # one for the origin image and two crops of images - EXPECTED_TEXTS = Expectations( - { - ("cuda", 8): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"], - } - ) # fmt: skip - EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) - print(f"Generated text: {output_text}") - self.assertEqual(output_text, EXPECTED_TEXT) - - @require_deterministic_for_xpu - def test_model_4b_batch_crops(self): - model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) - crop_config = { - "images_kwargs": { - "do_pan_and_scan": True, - "pan_and_scan_max_num_crops": 448, - "pan_and_scan_min_crop_size": 32, - "pan_and_scan_min_ratio_to_activate": 0.3, - } - } - messages_2 = [ + messages = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, { "role": "user", "content": [ - { - "type": "image", - "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", - }, {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, - {"type": "text", "text": "Are these images identical?"}, + {"type": "text", "text": "What do you see here?"}, ], }, ] inputs = self.processor.apply_chat_template( - [self.messages, messages_2], + messages, tokenize=True, return_dict=True, return_tensors="pt", padding=True, add_generation_prompt=True, - **crop_config, ).to(torch_device) output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, skip_special_tokens=True) - EXPECTED_NUM_IMAGES = 9 # 3 * (one for the origin image and two crops of images) = 9 + input_size = inputs.input_ids.shape[-1] + output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("cuda", 8): [ - "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on 
a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the", - 'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a' - ], + ("cuda", 8): ['Based on the image, here is a description of what I see:\n\n**Foreground & Street Scene:**\n* **Traffic Sign:** The most prominent'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES) self.assertEqual(output_text, EXPECTED_TEXT) - def test_model_4b_multiimage(self): - model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) - - messages = [ - {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, - { - "role": "user", - "content": [ - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, - {"type": "text", "text": "What do you see here?"}, - ], - }, - ] - - inputs = self.processor.apply_chat_template( - messages, + def test_model_text_only(self): + model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map=torch_device) + tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left") + inputs = tokenizer.apply_chat_template( + [{"role": "user", "content": "Write a poem about Machine Learning."}], tokenize=True, return_dict=True, return_tensors="pt", - padding=True, add_generation_prompt=True, ).to(torch_device) output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, skip_special_tokens=True) - EXPECTED_TEXTS = Expectations( - { - ("cuda", 8): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"], - } - ) # fmt: skip - EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - self.assertEqual(output_text, EXPECTED_TEXT) - - @require_deterministic_for_xpu - def test_model_1b_text_only(self): - model_id = "google/gemma-3-1b-it" - - model = Gemma4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device) - tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left") - inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device) - - output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + input_size = inputs.input_ids.shape[-1] + output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { - ("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'], + ("cuda", 8): ['## The Algorithmic Mind\n\nA whisper starts, a seed unseen,\nOf data vast, a vibrant sheen.\nA sea of numbers,'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -660,7 +578,8 @@ def test_model_4b_flash_attn(self): ).to(torch_device) output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - output_text = self.processor.batch_decode(output, 
skip_special_tokens=True) + input_size = inputs.input_ids.shape[-1] + output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) EXPECTED_TEXTS = Expectations( { @@ -705,7 +624,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): self.assertEqual(output_text, EXPECTED_COMPLETIONS) @pytest.mark.torch_export_test - def test_export_text_only_with_hybrid_cache(self): + def test_export_text_only(self): if not is_torch_greater_or_equal("2.6.0"): self.skipTest(reason="This test requires torch >= 2.6 to run.") From ce9520ea31e6bd0ba501204ae16acf39b3f45ee3 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 7 Apr 2026 14:24:13 +0200 Subject: [PATCH 03/11] more --- tests/models/gemma4/test_modeling_gemma4.py | 82 +++++++++------------ 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index ece39d5a0d48..87299aebfa19 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -13,7 +13,6 @@ # limitations under the License. """Testing suite for the PyTorch Gemma4 model.""" -import logging import unittest import pytest @@ -29,7 +28,6 @@ Expectations, cleanup, is_flash_attn_2_available, - require_flash_attn, require_torch, require_torch_accelerator, slow, @@ -53,7 +51,6 @@ Gemma4Processor, Gemma4TextModel, ) - from transformers.pytorch_utils import is_torch_greater_or_equal class Gemma4TextModelTester(CausalLMModelTester): @@ -562,34 +559,7 @@ def test_model_text_only(self): EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - @require_flash_attn - @pytest.mark.flash_attn_test - def test_model_4b_flash_attn(self): - model = Gemma4ForConditionalGeneration.from_pretrained( - self.model_name, device_map=torch_device, attn_implementation="flash_attention_2" - ) - - inputs = self.processor.apply_chat_template( - self.messages, - tokenize=True, - return_dict=True, - return_tensors="pt", - add_generation_prompt=True, - ).to(torch_device) - - output = model.generate(**inputs, max_new_tokens=30, do_sample=False) - input_size = inputs.input_ids.shape[-1] - output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True) - - EXPECTED_TEXTS = Expectations( - { - ("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'], - } - ) # fmt: skip - EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() - self.assertEqual(output_text, EXPECTED_TEXT) - - @parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)]) + @parameterized.expand([("sdpa",), ("eager",)]) def test_generation_beyond_sliding_window(self, attn_implementation: str): """Test that we can correctly generate beyond the sliding window. Outputs for every attention functions should be coherent and identical. 
@@ -603,6 +573,15 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): "A list of colors: red, blue", # This will almost all be padding tokens ] tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding="left") + input_text = [ + tokenizer.apply_chat_template( + [{"role": "user", "content": item}], + tokenize=False, + return_dict=True, + add_generation_prompt=True, + ) + for item in input_text + ] inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device) model = Gemma4ForConditionalGeneration.from_pretrained( @@ -613,40 +592,49 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): # Make sure prefill is larger than sliding window input_size = inputs.input_ids.shape[-1] - self.assertTrue(input_size > model.config.sliding_window) + self.assertTrue(input_size > model.config.get_text_config().sliding_window) - out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static")[ - :, input_size: - ] - output_text = tokenizer.batch_decode(out) + out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static") + output_text = tokenizer.batch_decode(out[:, input_size:]) - EXPECTED_COMPLETIONS = [" and I'm going to take a walk.\n\nI really enjoy the scenery, and I'", ", green, yellow, orange, purple, brown, black, white, gray.\n\nI'"] # fmt: skip - self.assertEqual(output_text, EXPECTED_COMPLETIONS) + EXPECTED_COMPLETIONS = Expectations( + { + ("cuda", 8): [ + "That sounds lovely! It seems like you're really enjoying the place you're in.\n\n", + "Here are a few ways you could use or expand upon that list, depending on what you need:", + ] + } + ) + self.assertEqual(output_text, EXPECTED_COMPLETIONS.get_expectation()) @pytest.mark.torch_export_test def test_export_text_only(self): - if not is_torch_greater_or_equal("2.6.0"): - self.skipTest(reason="This test requires torch >= 2.6 to run.") - from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Export + hybrid cache exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024) exported_program = exportable_module.export( - input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device), + input_ids=torch.tensor([[1]], dtype=torch.long), ) # Test generation with the exported model - prompt = "What is the capital of France?" 
+ prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": "What is the capital of France?"}], + tokenize=False, + return_dict=True, + add_generation_prompt=True, + ) + max_new_tokens_to_generate = 20 # Generate text with the exported model - tokenizer = AutoTokenizer.from_pretrained(self.model_name) export_generated_text = TorchExportableModuleForDecoderOnlyLM.generate( - exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate + exported_program, + tokenizer, + prompt, + max_new_tokens=max_new_tokens_to_generate, ) - logging.info(f"\nExport generated texts: '{export_generated_text}'") input_text = tokenizer(prompt, return_tensors="pt") eager_outputs = model.generate( From cc66380452821d4cc99ed1dcfc7ff0494cd5c6fb Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 7 Apr 2026 14:41:56 +0200 Subject: [PATCH 04/11] fix --- tests/models/gemma4/test_modeling_gemma4.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 87299aebfa19..087acf90156b 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -597,7 +597,7 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static") output_text = tokenizer.batch_decode(out[:, input_size:]) - EXPECTED_COMPLETIONS = Expectations( + EXPECTED_COMPLETIONS_EAGER = Expectations( { ("cuda", 8): [ "That sounds lovely! It seems like you're really enjoying the place you're in.\n\n", @@ -605,7 +605,18 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): ] } ) - self.assertEqual(output_text, EXPECTED_COMPLETIONS.get_expectation()) + EXPECTED_COMPLETIONS_SDPA = Expectations( + { + ("cuda", 8): [ + "That sounds lovely! It seems like you're really enjoying the place you're in.\n\n", + "Here are a few ways you could use or expand upon that list, depending on what you're", + ] + } + ) + if attn_implementation == "eager": + self.assertEqual(output_text, EXPECTED_COMPLETIONS_EAGER.get_expectation()) + elif attn_implementation == "sdpa": + self.assertEqual(output_text, EXPECTED_COMPLETIONS_SDPA.get_expectation()) @pytest.mark.torch_export_test def test_export_text_only(self): From 82b5e87754a6799883ab7d7a9bef9a03293e5164 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 12:39:18 +0200 Subject: [PATCH 05/11] fix export --- src/transformers/cache_utils.py | 34 ++++++++++++++++----- src/transformers/integrations/executorch.py | 19 ++++++++++-- tests/models/gemma4/test_modeling_gemma4.py | 7 +---- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index ac324ebb62b4..d86edfd33426 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -982,18 +982,38 @@ def update_recurrent_state(self, recurrent_states: torch.Tensor, layer_idx: int, return recurrent_states def early_initialization( - self, batch_size: int, num_heads: int, head_dim: int, dtype: torch.dtype, device: torch.device + self, + batch_size: int, + num_heads: int | list[int], + head_dim: int | list[int], + dtype: torch.dtype, + device: torch.device, ): """ Initialize all the layers in advance (it's otherwise lazily initialized on the first `update` call). This is useful for our `export` recipes, as `export` needs everything in advance. 
""" - # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use - # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only - # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical - fake_kv_tensor = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device) - # Init all layers - for layer in self.layers: + # To allow different num_heads and head_dim depending on layers, we accept lists + if isinstance(num_heads, int): + num_heads = [num_heads] * len(self) + if isinstance(head_dim, int): + head_dim = [head_dim] * len(self) + + if len(num_heads) != len(self.layers): + raise ValueError( + f"`num_head` was provided as a list of length {len(num_heads)}, but the Cache currently has {len(self.layers)} layers" + ) + if len(head_dim) != len(self.layers): + raise ValueError( + f"`head_dim` was provided as a list of length {len(num_heads)}, but the Cache currently has {len(self.layers)} layers" + ) + + for layer, layer_num_heads, layer_head_dim in zip(self.layers, num_heads, head_dim): + # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use + # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only + # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical + fake_kv_tensor = torch.zeros((batch_size, layer_num_heads, 0, layer_head_dim), dtype=dtype, device=device) + # Init the layer layer.lazy_initialization(fake_kv_tensor, fake_kv_tensor) def get_seq_length(self, layer_idx: int = 0) -> int: diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index ebc4d64b55f9..0022191a0ea5 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -702,9 +702,22 @@ def __init__( # simple StaticLayer... 
It means that any generation beyond the window is unfortunately unsupported for i, layer in enumerate(self.cache.layers): if isinstance(layer, StaticSlidingWindowLayer): - self.cache.layers[i] = StaticLayer(layer.max_cache_len) - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + self.cache.layers[i] = StaticLayer(max_cache_len) + # Gemma4 has different head_dim and num_heads depending on layer type + if hasattr(config, "global_head_dim"): + head_dim = [ + config.global_head_dim if layer == "full_attention" else config.head_dim + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + num_heads = [ + config.num_global_key_value_heads + if layer == "full_attention" and config.attention_k_eq_v + else config.num_key_value_heads + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + else: + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) dtype = self.model.dtype # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable) self.cache.early_initialization(batch_size, num_heads, head_dim, dtype, device) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 087acf90156b..8593bf1d0791 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -27,7 +27,6 @@ from transformers.testing_utils import ( Expectations, cleanup, - is_flash_attn_2_available, require_torch, require_torch_accelerator, slow, @@ -559,15 +558,13 @@ def test_model_text_only(self): EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) + # Note: we do not test FA2 as the head dim is 512 on some layers, which is not compatible with the kernels @parameterized.expand([("sdpa",), ("eager",)]) def test_generation_beyond_sliding_window(self, attn_implementation: str): """Test that we can correctly generate beyond the sliding window. Outputs for every attention functions should be coherent and identical. """ - if attn_implementation == "flash_attention_2" and not is_flash_attn_2_available(): - self.skipTest("FlashAttention2 is required for this test.") - input_text = [ "This is a nice place. 
" * 800 + "I really enjoy the scenery,", # This is larger than 4096 tokens "A list of colors: red, blue", # This will almost all be padding tokens @@ -577,7 +574,6 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): tokenizer.apply_chat_template( [{"role": "user", "content": item}], tokenize=False, - return_dict=True, add_generation_prompt=True, ) for item in input_text @@ -634,7 +630,6 @@ def test_export_text_only(self): prompt = tokenizer.apply_chat_template( [{"role": "user", "content": "What is the capital of France?"}], tokenize=False, - return_dict=True, add_generation_prompt=True, ) From b3dcf69598ac12c3d13fe80f3902b4b24b172631 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 12:40:53 +0200 Subject: [PATCH 06/11] fix --- src/transformers/integrations/executorch.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 0022191a0ea5..a798a0bf1179 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -524,8 +524,21 @@ def __init__( for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): self.static_cache.layers[i] = StaticLayer(layer.max_cache_len) - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + # Gemma4 has different head_dim and num_heads depending on layer type + if hasattr(config, "global_head_dim"): + head_dim = [ + config.global_head_dim if layer == "full_attention" else config.head_dim + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + num_heads = [ + config.num_global_key_value_heads + if layer == "full_attention" and config.attention_k_eq_v + else config.num_key_value_heads + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + else: + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) dtype = self.model.dtype # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable) self.static_cache.early_initialization(batch_size, num_heads, head_dim, dtype, device) From 32e9eafba8211889690607dcf8e3194057de34f0 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 12:45:45 +0200 Subject: [PATCH 07/11] oupsi --- src/transformers/integrations/executorch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index a798a0bf1179..9b8289d7df73 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -523,7 +523,7 @@ def __init__( # simple StaticLayer... It means that any generation beyond the window is unfortunately unsupported for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): - self.static_cache.layers[i] = StaticLayer(layer.max_cache_len) + self.static_cache.layers[i] = StaticLayer(max_cache_len) # Gemma4 has different head_dim and num_heads depending on layer type if hasattr(config, "global_head_dim"): head_dim = [ @@ -882,7 +882,7 @@ def __init__(self, model, max_static_cache_length, batch_size): # simple StaticLayer... 
It means that any generation beyond the window is unfortunately unsupported for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): - self.static_cache.layers[i] = StaticLayer(layer.max_cache_len) + self.static_cache.layers[i] = StaticLayer(max_static_cache_length) head_dim = getattr(self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads) num_heads = getattr(self.config, "num_key_value_heads", self.config.num_attention_heads) self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device) From fcee22e3ca149dfc4275ce45e406c54561872918 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 14:11:37 +0200 Subject: [PATCH 08/11] review comments --- src/transformers/integrations/executorch.py | 57 +++++++++------------ tests/models/gemma4/test_modeling_gemma4.py | 52 ++++++++----------- utils/fetch_hub_objects_for_ci.py | 2 + 3 files changed, 49 insertions(+), 62 deletions(-) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 9b8289d7df73..2ecc0889898d 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -443,6 +443,28 @@ def generate( return tokenizer.decode(generated_ids[0], skip_special_tokens=True) +def get_head_shapes(config) -> tuple[int | list[int], int | list[int]]: + """Returns a tuple `(num_heads, head_dim)` containing either 2 ints, or a list of int with the value for each + layer.""" + # Gemma4 has different head_dim and num_heads depending on layer type + if hasattr(config, "global_head_dim"): + head_dim = [ + config.global_head_dim if layer == "full_attention" else config.head_dim + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + num_heads = [ + config.num_global_key_value_heads + if layer == "full_attention" and config.attention_k_eq_v + else config.num_key_value_heads + for layer in config.layer_types[: -config.num_kv_shared_layers] + ] + else: + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + + return num_heads, head_dim + + class TorchExportableModuleWithStaticCache(torch.nn.Module): """ A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`, @@ -524,21 +546,7 @@ def __init__( for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): self.static_cache.layers[i] = StaticLayer(max_cache_len) - # Gemma4 has different head_dim and num_heads depending on layer type - if hasattr(config, "global_head_dim"): - head_dim = [ - config.global_head_dim if layer == "full_attention" else config.head_dim - for layer in config.layer_types[: -config.num_kv_shared_layers] - ] - num_heads = [ - config.num_global_key_value_heads - if layer == "full_attention" and config.attention_k_eq_v - else config.num_key_value_heads - for layer in config.layer_types[: -config.num_kv_shared_layers] - ] - else: - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + num_heads, head_dim = get_head_shapes(config) dtype = self.model.dtype # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable) self.static_cache.early_initialization(batch_size, num_heads, head_dim, dtype, device) @@ -716,21 +724,7 @@ def 
__init__( for i, layer in enumerate(self.cache.layers): if isinstance(layer, StaticSlidingWindowLayer): self.cache.layers[i] = StaticLayer(max_cache_len) - # Gemma4 has different head_dim and num_heads depending on layer type - if hasattr(config, "global_head_dim"): - head_dim = [ - config.global_head_dim if layer == "full_attention" else config.head_dim - for layer in config.layer_types[: -config.num_kv_shared_layers] - ] - num_heads = [ - config.num_global_key_value_heads - if layer == "full_attention" and config.attention_k_eq_v - else config.num_key_value_heads - for layer in config.layer_types[: -config.num_kv_shared_layers] - ] - else: - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads) + num_heads, head_dim = get_head_shapes(config) dtype = self.model.dtype # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable) self.cache.early_initialization(batch_size, num_heads, head_dim, dtype, device) @@ -883,8 +877,7 @@ def __init__(self, model, max_static_cache_length, batch_size): for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): self.static_cache.layers[i] = StaticLayer(max_static_cache_length) - head_dim = getattr(self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads) - num_heads = getattr(self.config, "num_key_value_heads", self.config.num_attention_heads) + num_heads, head_dim = get_head_shapes(config) self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device) self.cache = EncoderDecoderCache(self.static_cache, DynamicCache(config=self.config)) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 8593bf1d0791..e9bb1ca747d9 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -37,6 +37,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_processing_common import url_to_local_path if is_torch_available(): @@ -418,15 +419,18 @@ def test_generate_from_random_inputs_embeds(self): class Gemma4IntegrationTest(unittest.TestCase): def setUp(self): self.model_name = "google/gemma-4-E2B-it" - self.processor = Gemma4Processor.from_pretrained(self.model_name, padding_side="left") + self.processor = Gemma4Processor.from_pretrained(self.model_name) - url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + self.url1 = url_to_local_path( + "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png" + ) + self.url2 = url_to_local_path("https://www.ilankelman.org/stopsigns/australia.jpg") self.messages = [ {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]}, { "role": "user", "content": [ - {"type": "image", "url": url}, + {"type": "image", "url": self.url1}, {"type": "text", "text": "What is shown in this image?"}, ], }, @@ -468,9 +472,9 @@ def test_model_4b_batch(self): "content": [ { "type": "image", - "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", + "url": self.url1, }, - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + 
{"type": "image", "url": self.url2}, {"type": "text", "text": "Are these images identical?"}, ], }, @@ -509,7 +513,7 @@ def test_model_4b_multiimage(self): { "role": "user", "content": [ - {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "image", "url": self.url2}, {"type": "text", "text": "What do you see here?"}, ], }, @@ -590,40 +594,31 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): input_size = inputs.input_ids.shape[-1] self.assertTrue(input_size > model.config.get_text_config().sliding_window) - out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static") + out = model.generate(**inputs, max_new_tokens=16, do_sample=False, cache_implementation="static") output_text = tokenizer.batch_decode(out[:, input_size:]) - EXPECTED_COMPLETIONS_EAGER = Expectations( + EXPECTED_COMPLETIONS = Expectations( { ("cuda", 8): [ - "That sounds lovely! It seems like you're really enjoying the place you're in.\n\n", - "Here are a few ways you could use or expand upon that list, depending on what you need:", + "That sounds lovely! It seems like you're really enjoying the place you'", + "Here are a few ways you could use or expand upon that list, depending on", ] } ) - EXPECTED_COMPLETIONS_SDPA = Expectations( - { - ("cuda", 8): [ - "That sounds lovely! It seems like you're really enjoying the place you're in.\n\n", - "Here are a few ways you could use or expand upon that list, depending on what you're", - ] - } - ) - if attn_implementation == "eager": - self.assertEqual(output_text, EXPECTED_COMPLETIONS_EAGER.get_expectation()) - elif attn_implementation == "sdpa": - self.assertEqual(output_text, EXPECTED_COMPLETIONS_SDPA.get_expectation()) + self.assertEqual(output_text, EXPECTED_COMPLETIONS.get_expectation()) @pytest.mark.torch_export_test def test_export_text_only(self): from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM - model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name) + model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_name) - exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024) + exportable_module = TorchExportableModuleForDecoderOnlyLM( + model, batch_size=1, max_cache_len=1024, device=torch_device + ) exported_program = exportable_module.export( - input_ids=torch.tensor([[1]], dtype=torch.long), + input_ids=torch.tensor([[1]], device=torch_device, dtype=torch.long), ) # Test generation with the exported model @@ -636,13 +631,10 @@ def test_export_text_only(self): max_new_tokens_to_generate = 20 # Generate text with the exported model export_generated_text = TorchExportableModuleForDecoderOnlyLM.generate( - exported_program, - tokenizer, - prompt, - max_new_tokens=max_new_tokens_to_generate, + exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate, device=torch_device ) - input_text = tokenizer(prompt, return_tensors="pt") + input_text = tokenizer(prompt, return_tensors="pt").to(torch_device) eager_outputs = model.generate( **input_text, max_new_tokens=max_new_tokens_to_generate, diff --git a/utils/fetch_hub_objects_for_ci.py b/utils/fetch_hub_objects_for_ci.py index fe0afe4c32fd..59cf65117913 100644 --- a/utils/fetch_hub_objects_for_ci.py +++ b/utils/fetch_hub_objects_for_ci.py @@ -18,12 +18,14 @@ "http://images.cocodataset.org/val2017/000000000802.jpg", 
"http://images.cocodataset.org/val2017/000000000872.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg", + "https://www.ilankelman.org/stopsigns/australia.jpg", "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg", "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3", "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png", "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4", "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png", + "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png", "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg", "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav", "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3", From f9e814a7608aeb0b3bfa9298cc4dcb395e26d22d Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 14:49:38 +0200 Subject: [PATCH 09/11] fix expectations for a10 --- src/transformers/integrations/executorch.py | 2 +- tests/models/gemma4/test_modeling_gemma4.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 2ecc0889898d..675a0ea5783a 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -877,7 +877,7 @@ def __init__(self, model, max_static_cache_length, batch_size): for i, layer in enumerate(self.static_cache.layers): if isinstance(layer, StaticSlidingWindowLayer): self.static_cache.layers[i] = StaticLayer(max_static_cache_length) - num_heads, head_dim = get_head_shapes(config) + num_heads, head_dim = get_head_shapes(self.config) self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device) self.cache = EncoderDecoderCache(self.static_cache, DynamicCache(config=self.config)) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index e9bb1ca747d9..d075030d878e 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -495,11 +495,14 @@ def test_model_4b_batch(self): EXPECTED_TEXTS = Expectations( { - ("cuda", 8): + ("cuda", (8, 0)): [ 'This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background', 'No, these images are not identical.\n\nThe first image is a photograph of a **cow** standing on a beach under a blue sky.\n\n' ], + ("cuda", (8, 6)): + ['This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background', + 'No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue'] } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -556,7 +559,8 @@ def test_model_text_only(self): EXPECTED_TEXTS = Expectations( { - ("cuda", 8): ['## The Algorithmic Mind\n\nA whisper starts, a seed unseen,\nOf data vast, a vibrant sheen.\nA sea of numbers,'], + ("cuda", (8, 0)): ['## The Algorithmic Mind\n\nA whisper starts, a seed 
unseen,\nOf data vast, a vibrant sheen.\nA sea of numbers,'], + ("cuda", (8, 6)): ['## The Algorithmic Mind\n\nA tapestry of data, vast and deep,\nWhere silent numbers in their slumber sleep.\nA sea of text'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() From dec020b91bf09761a594d78840db8afe4904f0e9 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 14:51:48 +0200 Subject: [PATCH 10/11] style --- tests/models/gemma4/test_modeling_gemma4.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index d075030d878e..0fc8d60878c3 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -495,16 +495,16 @@ def test_model_4b_batch(self): EXPECTED_TEXTS = Expectations( { - ("cuda", (8, 0)): - [ - 'This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background', - 'No, these images are not identical.\n\nThe first image is a photograph of a **cow** standing on a beach under a blue sky.\n\n' - ], - ("cuda", (8, 6)): - ['This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background', - 'No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue'] + ("cuda", (8, 0)): [ + "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background", + "No, these images are not identical.\n\nThe first image is a photograph of a **cow** standing on a beach under a blue sky.\n\n", + ], + ("cuda", (8, 6)): [ + "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background", + "No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue", + ], } - ) # fmt: skip + ) EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) From 3728c8f433ad9fbf7c88fd77cea81b0b77f0fd6b Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Wed, 8 Apr 2026 14:54:27 +0200 Subject: [PATCH 11/11] rename tests --- tests/models/gemma4/test_modeling_gemma4.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 0fc8d60878c3..8bf16feca731 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -439,7 +439,7 @@ def setUp(self): def tearDown(self): cleanup(torch_device, gc_collect=True) - def test_model_4b(self): + def test_model_with_image(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) inputs = self.processor.apply_chat_template( @@ -462,7 +462,7 @@ def test_model_4b(self): EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - def test_model_4b_batch(self): + def test_model_with_image_batch(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) messages_2 = [ @@ -508,7 +508,7 @@ def test_model_4b_batch(self): EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) - def test_model_4b_multiimage(self): + def test_model_multiimage(self): model = 
Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device)
 
         messages = [