From facdf82ecb6b10d77596936644085eca6cc630bc Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:32:58 -0400
Subject: [PATCH 01/11] Add PowerVR GPU detection and initial support to Vulkan
 backend

Add PowerVR GPU type detection to the Vulkan backend device enumeration,
PowerVR-specific workgroup size tuning for convolution operators, and
correctness fixes for PowerVR's TBDR architecture.

Changes:
- Add POWERVR to DeviceType enum with string detection
- Add device_is_powervr() convenience method on ComputeGraph
- Add PowerVR-specific workgroup sizes (32 instead of 64) for
  convolution dispatch to match PowerVR execution unit configuration
- Force optimal tiling on PowerVR (linear tiling may produce
  incorrect results in compute shaders on TBDR architecture)
- Enable robustBufferAccess on PowerVR for well-defined OOB behavior

Tested on Pixel 10 Pro (PowerVR D-Series DXT-48-1536 MC1):
- FP32 convolution passes all tests
- Non-conv FP16 ops (add, multiply) pass correctly
- FP16 conv has known bias texture initialization issue (#17299)

Related: #17299
---
 backends/vulkan/runtime/api/Context.cpp       |  6 +-
 backends/vulkan/runtime/graph/ComputeGraph.h  |  3 +
 .../runtime/graph/ops/impl/Convolution.cpp    | 58 ++++++++++++++++++-
 backends/vulkan/runtime/vk_api/Adapter.cpp    | 13 ++++-
 backends/vulkan/runtime/vk_api/Device.cpp     |  2 +
 backends/vulkan/runtime/vk_api/Device.h       |  1 +
 6 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 80aef97fc04..f6740e189b3 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -46,7 +46,11 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config)
       image_clearlist_mutex_{},
       images_to_clear_{},
       preferred_image_tiling_{VK_IMAGE_TILING_OPTIMAL} {
-  if (adapter_p_->linear_tiling_3d_enabled()) {
+  // PowerVR GPUs may report linear tiling support for 3D images but not
+  // handle it correctly in compute shaders (e.g., imageStore may produce
+  // incorrect results). Force optimal tiling on PowerVR for correctness.
+  if (adapter_p_->linear_tiling_3d_enabled() &&
+      adapter_p_->device_type() != vkapi::DeviceType::POWERVR) {
     preferred_image_tiling_ = VK_IMAGE_TILING_LINEAR;
   }
 }
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index a7c8cffffd1..1b44a032850 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -661,6 +661,9 @@ class ComputeGraph final {
   inline bool device_is_adreno() {
     return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO;
   }
+  inline bool device_is_powervr() {
+    return context_->adapter_ptr()->device_type() == vkapi::DeviceType::POWERVR;
+  }
   const std::string& device_name() {
     return context()->adapter_ptr()->device_name();
   }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index efd61848af1..da70d855901 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -18,6 +18,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
+#include <iostream>
+
 namespace vkcompute {
 
 enum class Conv2dMethod : uint8_t {
@@ -401,6 +403,25 @@ utils::uvec3 conv2d_local_wg_size(
     method = Conv2dMethod::SlidingWindow;
   }
 
+  // PowerVR GPUs may not handle large workgroup sizes well. Use smaller,
+  // more conservative workgroup sizes to avoid potential hardware issues
+  // with the TBDR architecture.
+  if (graph->device_is_powervr()) {
+    if (method == Conv2dMethod::Pointwise) {
+      uint32_t local_wg_size_y = 1;
+      if (global_workgroup_size[1] % 4 == 0) {
+        local_wg_size_y = 4;
+      } else if (global_workgroup_size[1] % 2 == 0) {
+        local_wg_size_y = 2;
+      }
+      return {32 / local_wg_size_y, local_wg_size_y, 1};
+    } else if (method == Conv2dMethod::Depthwise) {
+      return {32, 1, 1};
+    } else {
+      return graph->create_local_wg_size(global_workgroup_size);
+    }
+  }
+
   if (method == Conv2dMethod::Pointwise) {
     uint32_t local_wg_size_y = 1;
     if (global_workgroup_size[1] % 8 == 0) {
@@ -523,18 +544,21 @@ void add_conv2d_node(
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
   }
 
+  // Use smaller workgroup sizes on PowerVR to avoid potential hardware issues
+  const uint32_t max_local_size = graph.device_is_powervr() ? 32u : 64u;
+
   if (method == Conv2dMethod::Pointwise) {
     uint32_t local_wg_size_y = 1;
-    if (wg_size[1] % 8 == 0) {
+    if (!graph.device_is_powervr() && wg_size[1] % 8 == 0) {
       local_wg_size_y = 8;
     } else if (wg_size[1] % 4 == 0) {
       local_wg_size_y = 4;
     } else if (wg_size[1] % 2 == 0) {
       local_wg_size_y = 2;
     }
-    local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
+    local_wg_size = {max_local_size / local_wg_size_y, local_wg_size_y, 1};
   } else if (method == Conv2dMethod::Depthwise) {
-    local_wg_size = {64, 1, 1};
+    local_wg_size = {max_local_size, 1, 1};
   } else {
     local_wg_size = graph.create_local_wg_size(wg_size);
   }
@@ -595,6 +619,34 @@ void add_conv2d_node(
     };
   }
 
+  // Diagnostic logging for PowerVR devices to help debug conv2d issues
+#ifndef NDEBUG
+  if (graph.device_is_powervr()) {
+    const auto weight_sizes = graph.sizes_of(weight_data);
+    const auto out_sizes = graph.sizes_of(out);
+    const auto in_sizes_dbg = graph.sizes_of(in);
+    const char* method_str =
+        method == Conv2dMethod::Depthwise    ? "Depthwise"
+        : method == Conv2dMethod::Pointwise  ? "Pointwise"
+        : method == Conv2dMethod::Transposed ? "Transposed"
+                                             : "SlidingWindow";
+    std::cerr << "[PowerVR conv2d] method=" << method_str
+              << " shader=" << shader.kernel_name
+              << " in=[" << in_sizes_dbg[0] << "," << in_sizes_dbg[1] << ","
+              << in_sizes_dbg[2] << "," << in_sizes_dbg[3] << "]"
+              << " weight=[" << weight_sizes[0] << "," << weight_sizes[1] << ","
+              << weight_sizes[2] << "," << weight_sizes[3] << "]"
+              << " out=[" << out_sizes[0] << "," << out_sizes[1] << ","
+              << out_sizes[2] << "," << out_sizes[3] << "]"
+              << " groups=" << groups_val
+              << " global_wg=[" << wg_size[0] << "," << wg_size[1] << ","
+              << wg_size[2] << "]"
+              << " local_wg=[" << local_wg_size[0] << "," << local_wg_size[1]
+              << "," << local_wg_size[2] << "]"
+              << std::endl;
+  }
+#endif
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp
index 0a5b1601dea..a6eea9adcff 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.cpp
+++ b/backends/vulkan/runtime/vk_api/Adapter.cpp
@@ -131,6 +131,15 @@ VkDevice create_logical_device(
       enabled_device_extensions,
       requested_device_extensions);
 
+  // Enable robustBufferAccess on PowerVR devices to ensure well-defined
+  // behavior for out-of-bounds buffer/image accesses. Without this, PowerVR
+  // drivers may return zeros or undefined values for edge cases in compute
+  // shaders. This has a minor performance cost but improves correctness.
+  VkPhysicalDeviceFeatures enabled_features{};
+  if (physical_device.device_type == DeviceType::POWERVR) {
+    enabled_features.robustBufferAccess = VK_TRUE;
+  }
+
   VkDeviceCreateInfo device_create_info{
       VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType
       nullptr, // pNext
@@ -142,7 +151,9 @@ VkDevice create_logical_device(
       static_cast<uint32_t>(
           enabled_device_extensions.size()), // enabledExtensionCount
       enabled_device_extensions.data(), // ppEnabledExtensionNames
-      nullptr, // pEnabledFeatures
+      physical_device.device_type == DeviceType::POWERVR
+          ? &enabled_features
+          : nullptr, // pEnabledFeatures
   };
 
   void* extension_list_top = nullptr;
diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp
index 249038ed51e..7893ea4e312 100644
--- a/backends/vulkan/runtime/vk_api/Device.cpp
+++ b/backends/vulkan/runtime/vk_api/Device.cpp
@@ -126,6 +126,8 @@ PhysicalDevice::PhysicalDevice(
     device_type = DeviceType::NVIDIA;
   } else if (device_name.find("mali") != std::string::npos) {
     device_type = DeviceType::MALI;
+  } else if (device_name.find("powervr") != std::string::npos) {
+    device_type = DeviceType::POWERVR;
   }
 }
 
diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h
index 2bc3075ffb4..dc59367d03f 100644
--- a/backends/vulkan/runtime/vk_api/Device.h
+++ b/backends/vulkan/runtime/vk_api/Device.h
@@ -24,6 +24,7 @@ enum class DeviceType : uint32_t {
   MALI,
   ADRENO,
   SWIFTSHADER,
+  POWERVR,
 };
 
 struct PhysicalDevice final {

From d0f7d20f5260be9a8a7312525c29ae1220880b57 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:42:03 -0400
Subject: [PATCH 02/11] Fix missing vmaFlushAllocation in StagingBuffer

set_staging_zeros() and cast_and_copy_from() write to staging buffers
without flushing, unlike copy_from() which correctly calls
vmaFlushAllocation(). On GPUs where VMA staging memory is not
host-coherent (e.g. PowerVR), CPU writes stay in cache and the GPU
reads garbage, causing incorrect inference results.

This fixes FP16 convolution producing wrong outputs on PowerVR GPUs
where the implicit zero-bias texture reads uninitialized memory.
---
 backends/vulkan/runtime/api/containers/StagingBuffer.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h
index d786d030b39..5a0be19f63b 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.h
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h
@@ -88,6 +88,11 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }
 
   void cast_half_to_float_and_copy_from(
@@ -117,6 +122,11 @@ class StagingBuffer final {
 
   inline void set_staging_zeros() {
     memset(data(), 0, nbytes());
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }
 
   template <typename T>

From a75bec1efa6cc542d6071d6951e70aa6a67398a7 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:51:40 -0400
Subject: [PATCH 03/11] Remove debug logging from Convolution.cpp

Remove PowerVR-specific diagnostic cerr logging and unused iostream
include that were used during development.
---
 .../runtime/graph/ops/impl/Convolution.cpp    | 30 -------------------
 1 file changed, 30 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index da70d855901..31396cd20dd 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -18,8 +18,6 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
-#include <iostream>
-
 namespace vkcompute {
 
 enum class Conv2dMethod : uint8_t {
@@ -619,34 +617,6 @@ void add_conv2d_node(
     };
   }
 
-  // Diagnostic logging for PowerVR devices to help debug conv2d issues
-#ifndef NDEBUG
-  if (graph.device_is_powervr()) {
-    const auto weight_sizes = graph.sizes_of(weight_data);
-    const auto out_sizes = graph.sizes_of(out);
-    const auto in_sizes_dbg = graph.sizes_of(in);
-    const char* method_str =
-        method == Conv2dMethod::Depthwise    ? "Depthwise"
-        : method == Conv2dMethod::Pointwise  ? "Pointwise"
-        : method == Conv2dMethod::Transposed ? "Transposed"
-                                             : "SlidingWindow";
-    std::cerr << "[PowerVR conv2d] method=" << method_str
-              << " shader=" << shader.kernel_name
-              << " in=[" << in_sizes_dbg[0] << "," << in_sizes_dbg[1] << ","
-              << in_sizes_dbg[2] << "," << in_sizes_dbg[3] << "]"
-              << " weight=[" << weight_sizes[0] << "," << weight_sizes[1] << ","
-              << weight_sizes[2] << "," << weight_sizes[3] << "]"
-              << " out=[" << out_sizes[0] << "," << out_sizes[1] << ","
-              << out_sizes[2] << "," << out_sizes[3] << "]"
-              << " groups=" << groups_val
-              << " global_wg=[" << wg_size[0] << "," << wg_size[1] << ","
-              << wg_size[2] << "]"
-              << " local_wg=[" << local_wg_size[0] << "," << local_wg_size[1]
-              << "," << local_wg_size[2] << "]"
-              << std::endl;
-  }
-#endif
-
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,

From c653c2e961aa5ac7f68d8d55ab6882d5461b16e3 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Tue, 10 Feb 2026 18:57:13 -0400
Subject: [PATCH 04/11] Revert "Fix missing vmaFlushAllocation in
 StagingBuffer"

This reverts commit 950906427dcc03255b2af992103aa6509fa7f590.
---
 backends/vulkan/runtime/api/containers/StagingBuffer.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h
index 5a0be19f63b..d786d030b39 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.h
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h
@@ -88,11 +88,6 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
-    vmaFlushAllocation(
-        vulkan_buffer_.vma_allocator(),
-        vulkan_buffer_.allocation(),
-        0u,
-        VK_WHOLE_SIZE);
   }
 
   void cast_half_to_float_and_copy_from(
@@ -122,11 +117,6 @@ class StagingBuffer final {
 
   inline void set_staging_zeros() {
     memset(data(), 0, nbytes());
-    vmaFlushAllocation(
-        vulkan_buffer_.vma_allocator(),
-        vulkan_buffer_.allocation(),
-        0u,
-        VK_WHOLE_SIZE);
   }
 
   template <typename T>

From 8fe504bf1924c949ef052d69cc20583f74cbca29 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:32:58 -0400
Subject: [PATCH 05/11] Add PowerVR GPU detection and initial support to Vulkan
 backend

Add PowerVR GPU type detection to the Vulkan backend device enumeration,
PowerVR-specific workgroup size tuning for convolution operators, and
correctness fixes for PowerVR's TBDR architecture.

Changes:
- Add POWERVR to DeviceType enum with string detection
- Add device_is_powervr() convenience method on ComputeGraph
- Add PowerVR-specific workgroup sizes (32 instead of 64) for
  convolution dispatch to match PowerVR execution unit configuration
- Force optimal tiling on PowerVR (linear tiling may produce
  incorrect results in compute shaders on TBDR architecture)
- Enable robustBufferAccess on PowerVR for well-defined OOB behavior

Tested on Pixel 10 Pro (PowerVR D-Series DXT-48-1536 MC1):
- FP32 convolution passes all tests
- Non-conv FP16 ops (add, multiply) pass correctly
- FP16 conv has known bias texture initialization issue (#17299)

Related: #17299
---
 .../runtime/graph/ops/impl/Convolution.cpp    | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 31396cd20dd..da70d855901 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -18,6 +18,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
+#include <iostream>
+
 namespace vkcompute {
 
 enum class Conv2dMethod : uint8_t {
@@ -617,6 +619,34 @@ void add_conv2d_node(
     };
   }
 
+  // Diagnostic logging for PowerVR devices to help debug conv2d issues
+#ifndef NDEBUG
+  if (graph.device_is_powervr()) {
+    const auto weight_sizes = graph.sizes_of(weight_data);
+    const auto out_sizes = graph.sizes_of(out);
+    const auto in_sizes_dbg = graph.sizes_of(in);
+    const char* method_str =
+        method == Conv2dMethod::Depthwise    ? "Depthwise"
+        : method == Conv2dMethod::Pointwise  ? "Pointwise"
+        : method == Conv2dMethod::Transposed ? "Transposed"
+                                             : "SlidingWindow";
+    std::cerr << "[PowerVR conv2d] method=" << method_str
+              << " shader=" << shader.kernel_name
+              << " in=[" << in_sizes_dbg[0] << "," << in_sizes_dbg[1] << ","
+              << in_sizes_dbg[2] << "," << in_sizes_dbg[3] << "]"
+              << " weight=[" << weight_sizes[0] << "," << weight_sizes[1] << ","
+              << weight_sizes[2] << "," << weight_sizes[3] << "]"
+              << " out=[" << out_sizes[0] << "," << out_sizes[1] << ","
+              << out_sizes[2] << "," << out_sizes[3] << "]"
+              << " groups=" << groups_val
+              << " global_wg=[" << wg_size[0] << "," << wg_size[1] << ","
+              << wg_size[2] << "]"
+              << " local_wg=[" << local_wg_size[0] << "," << local_wg_size[1]
+              << "," << local_wg_size[2] << "]"
+              << std::endl;
+  }
+#endif
+
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,

From 797ecfe86dd3e56de43fac8b2984e3e471586bf0 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:42:03 -0400
Subject: [PATCH 06/11] Fix missing vmaFlushAllocation in StagingBuffer

set_staging_zeros() and cast_and_copy_from() write to staging buffers
without flushing, unlike copy_from() which correctly calls
vmaFlushAllocation(). On GPUs where VMA staging memory is not
host-coherent (e.g. PowerVR), CPU writes stay in cache and the GPU
reads garbage, causing incorrect inference results.

This fixes FP16 convolution producing wrong outputs on PowerVR GPUs
where the implicit zero-bias texture reads uninitialized memory.
---
 backends/vulkan/runtime/api/containers/StagingBuffer.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h
index d786d030b39..5a0be19f63b 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.h
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h
@@ -88,6 +88,11 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }
 
   void cast_half_to_float_and_copy_from(
@@ -117,6 +122,11 @@ class StagingBuffer final {
 
   inline void set_staging_zeros() {
     memset(data(), 0, nbytes());
+    vmaFlushAllocation(
+        vulkan_buffer_.vma_allocator(),
+        vulkan_buffer_.allocation(),
+        0u,
+        VK_WHOLE_SIZE);
   }
 
   template <typename T>

From aebe7fd68921075668715ab76328f6867ed1c970 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Mon, 9 Feb 2026 23:51:40 -0400
Subject: [PATCH 07/11] Remove debug logging from Convolution.cpp

Remove PowerVR-specific diagnostic cerr logging and unused iostream
include that were used during development.
---
 .../runtime/graph/ops/impl/Convolution.cpp    | 30 -------------------
 1 file changed, 30 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index da70d855901..31396cd20dd 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -18,8 +18,6 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
-#include <iostream>
-
 namespace vkcompute {
 
 enum class Conv2dMethod : uint8_t {
@@ -619,34 +617,6 @@ void add_conv2d_node(
     };
   }
 
-  // Diagnostic logging for PowerVR devices to help debug conv2d issues
-#ifndef NDEBUG
-  if (graph.device_is_powervr()) {
-    const auto weight_sizes = graph.sizes_of(weight_data);
-    const auto out_sizes = graph.sizes_of(out);
-    const auto in_sizes_dbg = graph.sizes_of(in);
-    const char* method_str =
-        method == Conv2dMethod::Depthwise    ? "Depthwise"
-        : method == Conv2dMethod::Pointwise  ? "Pointwise"
-        : method == Conv2dMethod::Transposed ? "Transposed"
-                                             : "SlidingWindow";
-    std::cerr << "[PowerVR conv2d] method=" << method_str
-              << " shader=" << shader.kernel_name
-              << " in=[" << in_sizes_dbg[0] << "," << in_sizes_dbg[1] << ","
-              << in_sizes_dbg[2] << "," << in_sizes_dbg[3] << "]"
-              << " weight=[" << weight_sizes[0] << "," << weight_sizes[1] << ","
-              << weight_sizes[2] << "," << weight_sizes[3] << "]"
-              << " out=[" << out_sizes[0] << "," << out_sizes[1] << ","
-              << out_sizes[2] << "," << out_sizes[3] << "]"
-              << " groups=" << groups_val
-              << " global_wg=[" << wg_size[0] << "," << wg_size[1] << ","
-              << wg_size[2] << "]"
-              << " local_wg=[" << local_wg_size[0] << "," << local_wg_size[1]
-              << "," << local_wg_size[2] << "]"
-              << std::endl;
-  }
-#endif
-
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,

From ed41c7a6107332a209fd76961cfec97c7f9dd819 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Tue, 10 Feb 2026 18:57:13 -0400
Subject: [PATCH 08/11] Revert "Fix missing vmaFlushAllocation in
 StagingBuffer"

This reverts commit 950906427dcc03255b2af992103aa6509fa7f590.
---
 backends/vulkan/runtime/api/containers/StagingBuffer.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h
index 5a0be19f63b..d786d030b39 100644
--- a/backends/vulkan/runtime/api/containers/StagingBuffer.h
+++ b/backends/vulkan/runtime/api/containers/StagingBuffer.h
@@ -88,11 +88,6 @@ class StagingBuffer final {
     for (size_t i = 0; i < numel; ++i) {
       dst[i] = static_cast<DST_T>(src[i]);
     }
-    vmaFlushAllocation(
-        vulkan_buffer_.vma_allocator(),
-        vulkan_buffer_.allocation(),
-        0u,
-        VK_WHOLE_SIZE);
   }
 
   void cast_half_to_float_and_copy_from(
@@ -122,11 +117,6 @@ class StagingBuffer final {
 
   inline void set_staging_zeros() {
     memset(data(), 0, nbytes());
-    vmaFlushAllocation(
-        vulkan_buffer_.vma_allocator(),
-        vulkan_buffer_.allocation(),
-        0u,
-        VK_WHOLE_SIZE);
   }
 
   template <typename T>

From bd9b151119eb499727aa82ed34e5687bd70468f7 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Wed, 11 Feb 2026 21:38:26 -0400
Subject: [PATCH 09/11] Remove dead local_wg_size code from add_conv2d_node

The local_wg_size variable was computed but never used since
DynamicDispatchNode uses the conv2d_local_wg_size callback
which already contains the PowerVR-specific logic.
---
 .../runtime/graph/ops/impl/Convolution.cpp    | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 31396cd20dd..32c8bae1b83 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -537,30 +537,10 @@ void add_conv2d_node(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       graph, method, out, weight_data, stride_equals_dilation);
 
-  utils::uvec3 local_wg_size;
   if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
   }
 
-  // Use smaller workgroup sizes on PowerVR to avoid potential hardware issues
-  const uint32_t max_local_size = graph.device_is_powervr() ? 32u : 64u;
-
-  if (method == Conv2dMethod::Pointwise) {
-    uint32_t local_wg_size_y = 1;
-    if (!graph.device_is_powervr() && wg_size[1] % 8 == 0) {
-      local_wg_size_y = 8;
-    } else if (wg_size[1] % 4 == 0) {
-      local_wg_size_y = 4;
-    } else if (wg_size[1] % 2 == 0) {
-      local_wg_size_y = 2;
-    }
-    local_wg_size = {max_local_size / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    local_wg_size = {max_local_size, 1, 1};
-  } else {
-    local_wg_size = graph.create_local_wg_size(wg_size);
-  }
-
   vkapi::ParamsBindList param_buffers;
   std::vector<PushConstantDataInfo> push_constants;
   if (method == Conv2dMethod::Pointwise) {

From 3f26203e3efa541b3444c571e8d6d39a9d7c4637 Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Wed, 11 Feb 2026 21:59:46 -0400
Subject: [PATCH 10/11] Address review: remove dead wg_size code, harden
 robustBufferAccess

- Remove unused wg_size variable left behind after removing inline
  workgroup size calculation (DynamicDispatchNode uses callbacks)
- Fix robustBufferAccess comment to accurately describe buffer-only scope
- Query device feature support before enabling robustBufferAccess
---
 .../vulkan/runtime/graph/ops/impl/Convolution.cpp |  7 -------
 backends/vulkan/runtime/vk_api/Adapter.cpp        | 15 ++++++++++-----
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 32c8bae1b83..79537d72adc 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -534,13 +534,6 @@ void add_conv2d_node(
       stride_equals_dilation,
       stride_1_padding_0);
 
-  utils::uvec3 wg_size = create_conv2d_global_wg_size(
-      graph, method, out, weight_data, stride_equals_dilation);
-
-  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
-    wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-  }
-
   vkapi::ParamsBindList param_buffers;
   std::vector<PushConstantDataInfo> push_constants;
   if (method == Conv2dMethod::Pointwise) {
diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp
index a6eea9adcff..96805d1b391 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.cpp
+++ b/backends/vulkan/runtime/vk_api/Adapter.cpp
@@ -131,13 +131,18 @@ VkDevice create_logical_device(
       enabled_device_extensions,
       requested_device_extensions);
 
-  // Enable robustBufferAccess on PowerVR devices to ensure well-defined
-  // behavior for out-of-bounds buffer/image accesses. Without this, PowerVR
-  // drivers may return zeros or undefined values for edge cases in compute
-  // shaders. This has a minor performance cost but improves correctness.
+  // Enable robustBufferAccess on PowerVR devices to provide more well-defined
+  // behavior for out-of-bounds buffer descriptor accesses. Without this,
+  // PowerVR drivers may return zeros or undefined values for some edge cases
+  // in compute shaders. This has a minor performance cost but improves
+  // correctness.
   VkPhysicalDeviceFeatures enabled_features{};
   if (physical_device.device_type == DeviceType::POWERVR) {
-    enabled_features.robustBufferAccess = VK_TRUE;
+    VkPhysicalDeviceFeatures supported_features{};
+    vkGetPhysicalDeviceFeatures(physical_device.handle, &supported_features);
+    if (supported_features.robustBufferAccess == VK_TRUE) {
+      enabled_features.robustBufferAccess = VK_TRUE;
+    }
   }
 
   VkDeviceCreateInfo device_create_info{

From dc11701b6f70d401f8829cecdbdf26169c5a09fc Mon Sep 17 00:00:00 2001
From: Abdelaziz Mahdy <zezohassam@gmail.com>
Date: Sat, 14 Feb 2026 23:24:57 -0400
Subject: [PATCH 11/11] [ET-VK] Serialize prepack dispatches on PowerVR GPUs

PowerVR corrupts prepacked constant data when multiple prepack compute
dispatches are batched in a single command buffer. Only the first
constant is correct; subsequent constants read as zero.

This caused MobileNet to produce NaN (division-by-zero in Hardswish
decomposition) and FP16 convolution to show a +0.5 bias offset.

Submit and wait after each prepack node on PowerVR to ensure each
constant is fully consumed before the next staging buffer is created.
---
 backends/vulkan/runtime/graph/ComputeGraph.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index bb2df30a174..3552b300d63 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -1115,7 +1115,11 @@ void ComputeGraph::prepack() {
   int i = 0;
   bool submitted = false;
   const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB;
-  // int count = 0;
+  // On PowerVR GPUs, batching multiple prepack compute dispatches in a single
+  // command buffer produces incorrect results for the second and subsequent
+  // constants. Submit and wait after each prepack node to work around this
+  // driver issue.
+  const bool serialize_prepack = device_is_powervr();
   context_->set_cmd();
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     // Do not trigger on the first or last prepack node.
@@ -1142,6 +1146,13 @@ void ComputeGraph::prepack() {
 
     node->encode(this);
     i++;
+
+    if (serialize_prepack && i < static_cast<int>(prepack_nodes_.size())) {
+      submit_current_cmd_and_wait();
+      context_->flush();
+      staging_nbytes_in_cmd_ = 0;
+      context_->set_cmd();
+    }
   }
   submit_current_cmd_and_wait(/*final_use=*/true);
   context_->flush();