diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index ac324ebb62b4..d86edfd33426 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -982,18 +982,38 @@ def update_recurrent_state(self, recurrent_states: torch.Tensor, layer_idx: int,
         return recurrent_states
 
     def early_initialization(
-        self, batch_size: int, num_heads: int, head_dim: int, dtype: torch.dtype, device: torch.device
+        self,
+        batch_size: int,
+        num_heads: int | list[int],
+        head_dim: int | list[int],
+        dtype: torch.dtype,
+        device: torch.device,
     ):
         """
         Initialize all the layers in advance (it's otherwise lazily initialized on the first `update` call).
         This is useful for our `export` recipes, as `export` needs everything in advance.
         """
-        # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use
-        # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only
-        # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical
-        fake_kv_tensor = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
-        # Init all layers
-        for layer in self.layers:
+        # To allow different num_heads and head_dim depending on layers, we accept lists
+        if isinstance(num_heads, int):
+            num_heads = [num_heads] * len(self)
+        if isinstance(head_dim, int):
+            head_dim = [head_dim] * len(self)
+
+        if len(num_heads) != len(self.layers):
+            raise ValueError(
+                f"`num_heads` was provided as a list of length {len(num_heads)}, but the Cache currently has {len(self.layers)} layers"
+            )
+        if len(head_dim) != len(self.layers):
+            raise ValueError(
+                f"`head_dim` was provided as a list of length {len(head_dim)}, but the Cache currently has {len(self.layers)} layers"
+            )
+
+        for layer, layer_num_heads, layer_head_dim in zip(self.layers, num_heads, head_dim):
+            # Note that the initialization needs all dimensions (except -2), as well as device and dtype, so we use
+            # this fake tensor approach. It has size 0 on the -2 dimension, so it does not allocate any data (it only
+            # creates an empty tensor with correct shape, dtype and device), which is very efficient and practical
+            fake_kv_tensor = torch.zeros((batch_size, layer_num_heads, 0, layer_head_dim), dtype=dtype, device=device)
+            # Init the layer
             layer.lazy_initialization(fake_kv_tensor, fake_kv_tensor)
 
     def get_seq_length(self, layer_idx: int = 0) -> int:
diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py
index ebc4d64b55f9..675a0ea5783a 100644
--- a/src/transformers/integrations/executorch.py
+++ b/src/transformers/integrations/executorch.py
@@ -443,6 +443,28 @@ def generate(
     return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
 
 
+def get_head_shapes(config) -> tuple[int | list[int], int | list[int]]:
+    """Returns a tuple `(num_heads, head_dim)` containing either 2 ints, or a list of int with the value for each
+    layer."""
+    # Gemma4 has different head_dim and num_heads depending on layer type
+    if hasattr(config, "global_head_dim"):
+        head_dim = [
+            config.global_head_dim if layer == "full_attention" else config.head_dim
+            for layer in config.layer_types[: -config.num_kv_shared_layers]
+        ]
+        num_heads = [
+            config.num_global_key_value_heads
+            if layer == "full_attention" and config.attention_k_eq_v
+            else config.num_key_value_heads
+            for layer in config.layer_types[: -config.num_kv_shared_layers]
+        ]
+    else:
+        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+
+    return num_heads, head_dim
+
+
 class TorchExportableModuleWithStaticCache(torch.nn.Module):
     """
     A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
@@ -523,9 +545,8 @@ def __init__(
         # simple StaticLayer... It means that any generation beyond the window is unfortunately unsupported
         for i, layer in enumerate(self.static_cache.layers):
             if isinstance(layer, StaticSlidingWindowLayer):
-                self.static_cache.layers[i] = StaticLayer(layer.max_cache_len)
-        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+                self.static_cache.layers[i] = StaticLayer(max_cache_len)
+        num_heads, head_dim = get_head_shapes(config)
         dtype = self.model.dtype
         # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable)
         self.static_cache.early_initialization(batch_size, num_heads, head_dim, dtype, device)
@@ -702,9 +723,8 @@ def __init__(
         # simple StaticLayer... It means that any generation beyond the window is unfortunately unsupported
         for i, layer in enumerate(self.cache.layers):
             if isinstance(layer, StaticSlidingWindowLayer):
-                self.cache.layers[i] = StaticLayer(layer.max_cache_len)
-        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
+                self.cache.layers[i] = StaticLayer(max_cache_len)
+        num_heads, head_dim = get_head_shapes(config)
         dtype = self.model.dtype
         # We need this call to initialize all the layers (otherwise it's done lazily, which is not exportable)
         self.cache.early_initialization(batch_size, num_heads, head_dim, dtype, device)
@@ -856,9 +876,8 @@ def __init__(self, model, max_static_cache_length, batch_size):
         # simple StaticLayer... It means that any generation beyond the window is unfortunately unsupported
         for i, layer in enumerate(self.static_cache.layers):
             if isinstance(layer, StaticSlidingWindowLayer):
-                self.static_cache.layers[i] = StaticLayer(layer.max_cache_len)
-        head_dim = getattr(self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads)
-        num_heads = getattr(self.config, "num_key_value_heads", self.config.num_attention_heads)
+                self.static_cache.layers[i] = StaticLayer(max_static_cache_length)
+        num_heads, head_dim = get_head_shapes(self.config)
         self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device)
 
         self.cache = EncoderDecoderCache(self.static_cache, DynamicCache(config=self.config))
diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py
index c63e9ba20165..8bf16feca731 100644
--- a/tests/models/gemma4/test_modeling_gemma4.py
+++ b/tests/models/gemma4/test_modeling_gemma4.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 """Testing suite for the PyTorch Gemma4 model."""
 
-import logging
 import unittest
 
 import pytest
 from parameterized import parameterized
 
 from transformers import (
-    AutoModelForCausalLM,
     AutoTokenizer,
     Gemma4Config,
     Gemma4TextConfig,
@@ -29,12 +27,8 @@
 from transformers.testing_utils import (
     Expectations,
     cleanup,
-    is_flash_attn_2_available,
-    require_deterministic_for_xpu,
-    require_flash_attn,
     require_torch,
     require_torch_accelerator,
-    require_torch_large_accelerator,
     slow,
     torch_device,
 )
@@ -43,19 +37,20 @@
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...test_processing_common import url_to_local_path
 
 
 if is_torch_available():
     import torch
 
     from transformers import (
+        AutoModelForCausalLM,
         Gemma4ForCausalLM,
         Gemma4ForConditionalGeneration,
         Gemma4Model,
         Gemma4Processor,
         Gemma4TextModel,
     )
-    from transformers.pytorch_utils import is_torch_greater_or_equal
 
 
 class Gemma4TextModelTester(CausalLMModelTester):
@@ -419,20 +414,23 @@ def test_generate_from_random_inputs_embeds(self):
         pass
 
 
-@unittest.skip("Integration Tests are not up-to-date yet! TODO Cyril: update me pretty pretty please!")
 @slow
 @require_torch_accelerator
 class Gemma4IntegrationTest(unittest.TestCase):
     def setUp(self):
-        self.processor = Gemma4Processor.from_pretrained("google/gemma-4-e2b-it", padding_side="left")
+        self.model_name = "google/gemma-4-E2B-it"
+        self.processor = Gemma4Processor.from_pretrained(self.model_name)
 
-        url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
+        self.url1 = url_to_local_path(
+            "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
+        )
+        self.url2 = url_to_local_path("https://www.ilankelman.org/stopsigns/australia.jpg")
         self.messages = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "url": url},
+                    {"type": "image", "url": self.url1},
                     {"type": "text", "text": "What is shown in this image?"},
                 ],
             },
@@ -441,11 +439,8 @@ def setUp(self):
     def tearDown(self):
         cleanup(torch_device, gc_collect=True)
 
-    @require_deterministic_for_xpu
-    def test_model_4b_bf16(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
+    def test_model_with_image(self):
+        model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device)
 
         inputs = self.processor.apply_chat_template(
             self.messages,
@@ -455,28 +450,20 @@ def test_model_4b_bf16(self):
             add_generation_prompt=True,
         ).to(torch_device)
 
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
+        input_size = inputs.input_ids.shape[-1]
+        output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True)
 
         EXPECTED_TEXTS = Expectations(
             {
-                ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'],
-                ("cuda", (8, 0)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'],
-                ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'],
-                ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'],
-                ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant coastline in the background. It looks'],
+                ("cuda", 8): ['This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background'],
             }
         )  # fmt: skip
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         self.assertEqual(output_text, EXPECTED_TEXT)
 
-    @require_torch_large_accelerator
-    @require_deterministic_for_xpu
-    def test_model_4b_batch(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
+    def test_model_with_image_batch(self):
+        model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device)
 
         messages_2 = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
@@ -485,9 +472,9 @@ def test_model_4b_batch(self):
                 "content": [
                     {
                         "type": "image",
-                        "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
+                        "url": self.url1,
                     },
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                    {"type": "image", "url": self.url2},
                     {"type": "text", "text": "Are these images identical?"},
                 ],
             },
@@ -502,170 +489,34 @@ def test_model_4b_batch(self):
         inputs = self.processor.apply_chat_template(
             [self.messages, messages_2],
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
             padding=True,
             add_generation_prompt=True,
         ).to(torch_device)
 
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
-
-        EXPECTED_TEXTS = Expectations(
-            {
-                ("xpu", 3):
-                [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
-                ],
-                ("cuda", (8,0)):
-                [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
-                    "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown"
-                ],
-                ("cuda", (8,6)):
-                [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like',
-                    "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown"
-                ],
-                ("rocm", (9, 4)):
-                [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like',
-                    "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a cow"
-                ],
-                ("rocm", (9, 5)):
-                [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. There are some clouds in the blue',
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes. \n\n* **Image 1** shows a cow standing on a beach',
-                ],
-            }
-        )  # fmt: skip
-        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
-        self.assertEqual(output_text, EXPECTED_TEXT)
-
-    @require_torch_large_accelerator
-    def test_model_4b_crops(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
-
-        crop_config = {
-            "images_kwargs": {
-                "do_pan_and_scan": True,
-                "pan_and_scan_max_num_crops": 448,
-                "pan_and_scan_min_crop_size": 32,
-                "pan_and_scan_min_ratio_to_activate": 0.3,
-            }
-        }
-
-        inputs = self.processor.apply_chat_template(
-            self.messages,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-            add_generation_prompt=True,
-            **crop_config,
-        ).to(torch_device)
-
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
-
-        EXPECTED_NUM_IMAGES = 3  # one for the origin image and two crops of images
-        EXPECTED_TEXTS = Expectations(
-            {
-                ("xpu", 3): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'],
-                ("cuda", 7): [],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a clear blue sky with some white clouds above."],
-                ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"],
-                ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"],
-                ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"]
-            }
-        )  # fmt: skip
-        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
-        self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
-        print(f"Generated text: {output_text}")
-        self.assertEqual(output_text, EXPECTED_TEXT)
-
-    @require_torch_large_accelerator
-    @require_deterministic_for_xpu
-    def test_model_4b_batch_crops(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
-        crop_config = {
-            "images_kwargs": {
-                "do_pan_and_scan": True,
-                "pan_and_scan_max_num_crops": 448,
-                "pan_and_scan_min_crop_size": 32,
-                "pan_and_scan_min_ratio_to_activate": 0.3,
-            }
-        }
-        messages_2 = [
-            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "image",
-                        "url": "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
-                    },
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-                    {"type": "text", "text": "Are these images identical?"},
-                ],
-            },
-        ]
-
-        inputs = self.processor.apply_chat_template(
-            [self.messages, messages_2],
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-            padding=True,
-            add_generation_prompt=True,
-            **crop_config,
-        ).to(torch_device)
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
+        input_size = inputs.input_ids.shape[-1]
+        output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True)
 
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
-        EXPECTED_NUM_IMAGES = 9  # 3 * (one for the origin image and two crops of images) = 9
         EXPECTED_TEXTS = Expectations(
             {
-                ("xpu", 3): [
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
+                ("cuda", (8, 0)): [
+                    "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background",
+                    "No, these images are not identical.\n\nThe first image is a photograph of a **cow** standing on a beach under a blue sky.\n\n",
                 ],
-                ("cuda", 7): [],
-                ("cuda", (8,0)): [
-                    "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background",
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a'
-                ],
                 ("cuda", (8, 6)): [
-                    "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the",
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a'
+                    "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background",
+                    "No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue",
                 ],
-                ("rocm", (9, 4)) : [
-                    "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the",
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a'
-                ],
-                ("rocm", (9, 5)) : [
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
-                ],
             }
-        )  # fmt: skip
+        )
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
-        self.assertEqual(len(inputs["pixel_values"]), EXPECTED_NUM_IMAGES)
         self.assertEqual(output_text, EXPECTED_TEXT)
 
-    @require_torch_large_accelerator
-    def test_model_4b_multiimage(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
+    def test_model_multiimage(self):
+        model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device)
 
         messages = [
             {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                    {"type": "image", "url": self.url2},
                     {"type": "text", "text": "What do you see here?"},
                 ],
             },
@@ -680,187 +531,119 @@ def test_model_4b_multiimage(self):
             add_generation_prompt=True,
         ).to(torch_device)
 
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
-
-        EXPECTED_TEXTS = Expectations(
-            {
-                ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Chinese Arch"],
-                ("cuda", 7): [],
-                ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"],
-                ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"],
-                ("rocm", (9, 4)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a vibrant"],
-                ("rocm", (9, 5)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Main Features:**\n\n* **Chinese Archway:** The most prominent"],
-            }
-        )  # fmt: skip
-        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
-        self.assertEqual(output_text, EXPECTED_TEXT)
-
-    @require_deterministic_for_xpu
-    def test_model_1b_text_only(self):
-        model_id = "google/gemma-3-1b-it"
-
-        model = Gemma4ForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16).to(torch_device)
-        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
-        inputs = tokenizer("Write a poem about Machine Learning.", return_tensors="pt").to(torch_device)
-
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
+        input_size = inputs.input_ids.shape[-1]
+        output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True)
 
         EXPECTED_TEXTS = Expectations(
             {
-                ("xpu", 3): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'],
-                ("cuda", 7): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
-                ("cuda", 8): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
-                ("rocm", (9, 4)): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a silent stream,\nInto the neural net, a waking dream.\nAlgorithms hum, a coded grace,\n'],
-                ("rocm", (9, 5)): ['Write a poem about Machine Learning.\n\n---\n\nThe data flows, a river deep,\nWith patterns hidden, secrets sleep.\nA neural net, a watchful eye,\nLearning'],
+                ("cuda", 8): ['Based on the image, here is a description of what I see:\n\n**Foreground & Street Scene:**\n* **Traffic Sign:** The most prominent'],
             }
         )  # fmt: skip
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         self.assertEqual(output_text, EXPECTED_TEXT)
 
-    # TODO: raushan FA2 generates gibberish for no reason, check later
-    @require_flash_attn
-    @require_torch_large_accelerator
-    @pytest.mark.flash_attn_test
-    def test_model_4b_flash_attn(self):
-        model_id = "google/gemma-4-e2b-it"
-
-        model = Gemma4ForConditionalGeneration.from_pretrained(
-            model_id, dtype=torch.bfloat16, attn_implementation="flash_attention_2"
-        ).to(torch_device)
-
-        inputs = self.processor.apply_chat_template(
-            self.messages,
+    def test_model_text_only(self):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map=torch_device)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left")
+        inputs = tokenizer.apply_chat_template(
+            [{"role": "user", "content": "Write a poem about Machine Learning."}],
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
             add_generation_prompt=True,
         ).to(torch_device)
 
-        # cache_implementation="hybrid" an in the original transformers implementation
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, cache_implementation="hybrid")
-        output_text = self.processor.batch_decode(output, skip_special_tokens=True)
+        output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
+        input_size = inputs.input_ids.shape[-1]
+        output_text = self.processor.batch_decode(output[:, input_size:], skip_special_tokens=True)
 
         EXPECTED_TEXTS = Expectations(
             {
-                ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
-                ("cuda", 7): [],
-                ("cuda", 8): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with turquoise water and a distant island in the background. It looks like a sunny day'],
-                ("rocm", (9, 5)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown and white cow standing on a sandy beach with a turquoise ocean and a distant island in the background. It looks like a sunny'],
+                ("cuda", (8, 0)): ['## The Algorithmic Mind\n\nA whisper starts, a seed unseen,\nOf data vast, a vibrant sheen.\nA sea of numbers,'],
+                ("cuda", (8, 6)): ['## The Algorithmic Mind\n\nA tapestry of data, vast and deep,\nWhere silent numbers in their slumber sleep.\nA sea of text'],
             }
         )  # fmt: skip
         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
         self.assertEqual(output_text, EXPECTED_TEXT)
 
-    @parameterized.expand([("flash_attention_2",), ("sdpa",), ("eager",)])
+    # Note: we do not test FA2 as the head dim is 512 on some layers, which is not compatible with the kernels
+    @parameterized.expand([("sdpa",), ("eager",)])
     def test_generation_beyond_sliding_window(self, attn_implementation: str):
-        """Test that we can correctly generate beyond the sliding window. This is non trivial as
-        we need to correctly slice the attention mask in all cases (because we use a hybrid cache).
-        Outputs for every attention functions should be coherent and identical.
+        """Test that we can correctly generate beyond the sliding window. Outputs for all attention functions
+        should be coherent and identical.
         """
-        model_id = "google/gemma-3-1b-it"
-
-        if attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
-            self.skipTest("FlashAttention2 is required for this test.")
 
         input_text = [
             "This is a nice place. " * 800 + "I really enjoy the scenery,",  # This is larger than 4096 tokens
             "A list of colors: red, blue",  # This will almost all be padding tokens
         ]
-        tokenizer = AutoTokenizer.from_pretrained(model_id, padding="left")
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left")
+        input_text = [
+            tokenizer.apply_chat_template(
+                [{"role": "user", "content": item}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            for item in input_text
+        ]
         inputs = tokenizer(input_text, padding=True, return_tensors="pt").to(torch_device)
 
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, attn_implementation=attn_implementation, dtype=torch.float16
-        ).to(torch_device)
+        model = Gemma4ForConditionalGeneration.from_pretrained(
+            self.model_name,
+            device_map=torch_device,
+            attn_implementation=attn_implementation,
+        )
 
         # Make sure prefill is larger than sliding window
         input_size = inputs.input_ids.shape[-1]
-        self.assertTrue(input_size > model.config.sliding_window)
+        self.assertTrue(input_size > model.config.get_text_config().sliding_window)
 
-        out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="static")[
-            :, input_size:
-        ]
-        output_text = tokenizer.batch_decode(out)
+        out = model.generate(**inputs, max_new_tokens=16, do_sample=False, cache_implementation="static")
+        output_text = tokenizer.batch_decode(out[:, input_size:])
 
-        EXPECTED_COMPLETIONS = [" and I'm going to take a walk.\n\nI really enjoy the scenery, and I'", ", green, yellow, orange, purple, brown, black, white, gray.\n\nI'"]  # fmt: skip
-        self.assertEqual(output_text, EXPECTED_COMPLETIONS)
+        EXPECTED_COMPLETIONS = Expectations(
+            {
+                ("cuda", 8): [
+                    "That sounds lovely! It seems like you're really enjoying the place you'",
+                    "Here are a few ways you could use or expand upon that list, depending on",
+                ]
+            }
+        )
+        self.assertEqual(output_text, EXPECTED_COMPLETIONS.get_expectation())
 
     @pytest.mark.torch_export_test
-    def test_export_text_only_with_hybrid_cache(self):
-        if not is_torch_greater_or_equal("2.6.0"):
-            self.skipTest(reason="This test requires torch >= 2.6 to run.")
-
+    def test_export_text_only(self):
         from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
 
-        model_id = "google/gemma-3-1b-it"
-        model = AutoModelForCausalLM.from_pretrained(model_id)
-        self.assertEqual(model.config.cache_implementation, "hybrid")
+        model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device)
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
 
-        # Export + hybrid cache
-        model.eval()
-        exportable_module = TorchExportableModuleForDecoderOnlyLM(model, batch_size=1, max_cache_len=1024)
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model, batch_size=1, max_cache_len=1024, device=torch_device
+        )
         exported_program = exportable_module.export(
-            input_ids=torch.tensor([[1]], dtype=torch.long, device=model.device),
+            input_ids=torch.tensor([[1]], device=torch_device, dtype=torch.long),
         )
-        logging.info(f"\nExported program: {exported_program}")
 
         # Test generation with the exported model
-        prompt = "What is the capital of France?"
+        prompt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": "What is the capital of France?"}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
         max_new_tokens_to_generate = 20
         # Generate text with the exported model
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
         export_generated_text = TorchExportableModuleForDecoderOnlyLM.generate(
-            exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate
+            exported_program, tokenizer, prompt, max_new_tokens=max_new_tokens_to_generate, device=torch_device
        )
-        logging.info(f"\nExport generated texts: '{export_generated_text}'")
-
-        input_text = tokenizer(prompt, return_tensors="pt")
-        with torch.no_grad():
-            eager_outputs = model.generate(
-                **input_text,
-                max_new_tokens=max_new_tokens_to_generate,
-                do_sample=False,  # Use greedy decoding to match the exported model
-                cache_implementation="hybrid",
-            )
-        eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True)
-        logging.info(f"\nEager generated texts: '{eager_generated_text}'")
+        input_text = tokenizer(prompt, return_tensors="pt").to(torch_device)
+        eager_outputs = model.generate(
+            **input_text,
+            max_new_tokens=max_new_tokens_to_generate,
+            do_sample=False,  # Use greedy decoding to match the exported model
+        )
+        eager_generated_text = tokenizer.decode(eager_outputs[0], skip_special_tokens=True)
 
         self.assertEqual(export_generated_text, eager_generated_text)
-
-    def test_dynamic_sliding_window_is_default(self):
-        """
-        Test that the dynamic sliding window cache (added in #40039) is the default cache implementation for Gemma4
-        models, despite the fact that Hub checkpoints may have `cache_implementation="hybrid"` (static sliding window).
-        """
-        model_id = "google/gemma-3-1b-it"
-        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
-
-        # the default cache is static sliding window
-        self.assertEqual(model.config.cache_implementation, "hybrid")
-        self.assertEqual(model.generation_config.cache_implementation, "hybrid")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        prompt = "What is the capital of France?"
-        model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-        foward_outputs = model(**model_inputs)
-        self.assertIn("DynamicSlidingWindowLayer", str(foward_outputs.past_key_values))
-
-        generate_outputs = model.generate(
-            **model_inputs, max_new_tokens=2, do_sample=False, return_dict_in_generate=True
-        )
-        self.assertIn("DynamicSlidingWindowLayer", str(generate_outputs.past_key_values))
-
-        # If we manually specify the cache implementation = "hybrid", it will use the static sliding window cache
-        generate_outputs = model.generate(
-            **model_inputs,
-            max_new_tokens=2,
-            do_sample=False,
-            return_dict_in_generate=True,
-            cache_implementation="hybrid",
-        )
-        self.assertNotIn("DynamicSlidingWindowLayer", str(generate_outputs.past_key_values))
diff --git a/utils/fetch_hub_objects_for_ci.py b/utils/fetch_hub_objects_for_ci.py
index fe0afe4c32fd..59cf65117913 100644
--- a/utils/fetch_hub_objects_for_ci.py
+++ b/utils/fetch_hub_objects_for_ci.py
@@ -18,12 +18,14 @@
     "http://images.cocodataset.org/val2017/000000000802.jpg",
     "http://images.cocodataset.org/val2017/000000000872.jpg",
     "http://images.cocodataset.org/val2017/000000039769.jpg",
+    "https://www.ilankelman.org/stopsigns/australia.jpg",
     "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
     "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
     "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
     "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png",
     "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
+    "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png",
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
     "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav",
     "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3",