diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 80aef97fc04..f6740e189b3 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -46,7 +46,11 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config) image_clearlist_mutex_{}, images_to_clear_{}, preferred_image_tiling_{VK_IMAGE_TILING_OPTIMAL} { - if (adapter_p_->linear_tiling_3d_enabled()) { + // PowerVR GPUs may report linear tiling support for 3D images but not + // handle it correctly in compute shaders (e.g., imageStore may produce + // incorrect results). Force optimal tiling on PowerVR for correctness. + if (adapter_p_->linear_tiling_3d_enabled() && + adapter_p_->device_type() != vkapi::DeviceType::POWERVR) { preferred_image_tiling_ = VK_IMAGE_TILING_LINEAR; } } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index bb2df30a174..3552b300d63 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -1115,7 +1115,11 @@ void ComputeGraph::prepack() { int i = 0; bool submitted = false; const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB; - // int count = 0; + // On PowerVR GPUs, batching multiple prepack compute dispatches in a single + // command buffer produces incorrect results for the second and subsequent + // constants. Submit and wait after each prepack node to work around this + // driver issue. + const bool serialize_prepack = device_is_powervr(); context_->set_cmd(); for (std::unique_ptr& node : prepack_nodes_) { // Do not trigger on the first or last prepack node. @@ -1142,6 +1146,13 @@ void ComputeGraph::prepack() { node->encode(this); i++; + + if (serialize_prepack && i < static_cast(prepack_nodes_.size())) { + submit_current_cmd_and_wait(); + context_->flush(); + staging_nbytes_in_cmd_ = 0; + context_->set_cmd(); + } } submit_current_cmd_and_wait(/*final_use=*/true); context_->flush(); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index a7c8cffffd1..1b44a032850 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -661,6 +661,9 @@ class ComputeGraph final { inline bool device_is_adreno() { return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO; } + inline bool device_is_powervr() { + return context_->adapter_ptr()->device_type() == vkapi::DeviceType::POWERVR; + } const std::string& device_name() { return context()->adapter_ptr()->device_name(); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index efd61848af1..79537d72adc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -401,6 +401,25 @@ utils::uvec3 conv2d_local_wg_size( method = Conv2dMethod::SlidingWindow; } + // PowerVR GPUs may not handle large workgroup sizes well. Use smaller, + // more conservative workgroup sizes to avoid potential hardware issues + // with the TBDR architecture. + if (graph->device_is_powervr()) { + if (method == Conv2dMethod::Pointwise) { + uint32_t local_wg_size_y = 1; + if (global_workgroup_size[1] % 4 == 0) { + local_wg_size_y = 4; + } else if (global_workgroup_size[1] % 2 == 0) { + local_wg_size_y = 2; + } + return {32 / local_wg_size_y, local_wg_size_y, 1}; + } else if (method == Conv2dMethod::Depthwise) { + return {32, 1, 1}; + } else { + return graph->create_local_wg_size(global_workgroup_size); + } + } + if (method == Conv2dMethod::Pointwise) { uint32_t local_wg_size_y = 1; if (global_workgroup_size[1] % 8 == 0) { @@ -515,30 +534,6 @@ void add_conv2d_node( stride_equals_dilation, stride_1_padding_0); - utils::uvec3 wg_size = create_conv2d_global_wg_size( - graph, method, out, weight_data, stride_equals_dilation); - - utils::uvec3 local_wg_size; - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (wg_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (wg_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (wg_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - local_wg_size = {64, 1, 1}; - } else { - local_wg_size = graph.create_local_wg_size(wg_size); - } - vkapi::ParamsBindList param_buffers; std::vector push_constants; if (method == Conv2dMethod::Pointwise) { diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index 0a5b1601dea..96805d1b391 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -131,6 +131,20 @@ VkDevice create_logical_device( enabled_device_extensions, requested_device_extensions); + // Enable robustBufferAccess on PowerVR devices to provide more well-defined + // behavior for out-of-bounds buffer descriptor accesses. Without this, + // PowerVR drivers may return zeros or undefined values for some edge cases + // in compute shaders. This has a minor performance cost but improves + // correctness. + VkPhysicalDeviceFeatures enabled_features{}; + if (physical_device.device_type == DeviceType::POWERVR) { + VkPhysicalDeviceFeatures supported_features{}; + vkGetPhysicalDeviceFeatures(physical_device.handle, &supported_features); + if (supported_features.robustBufferAccess == VK_TRUE) { + enabled_features.robustBufferAccess = VK_TRUE; + } + } + VkDeviceCreateInfo device_create_info{ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType nullptr, // pNext @@ -142,7 +156,9 @@ VkDevice create_logical_device( static_cast( enabled_device_extensions.size()), // enabledExtensionCount enabled_device_extensions.data(), // ppEnabledExtensionNames - nullptr, // pEnabledFeatures + physical_device.device_type == DeviceType::POWERVR + ? &enabled_features + : nullptr, // pEnabledFeatures }; void* extension_list_top = nullptr; diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index 249038ed51e..7893ea4e312 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -126,6 +126,8 @@ PhysicalDevice::PhysicalDevice( device_type = DeviceType::NVIDIA; } else if (device_name.find("mali") != std::string::npos) { device_type = DeviceType::MALI; + } else if (device_name.find("powervr") != std::string::npos) { + device_type = DeviceType::POWERVR; } } diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h index 2bc3075ffb4..dc59367d03f 100644 --- a/backends/vulkan/runtime/vk_api/Device.h +++ b/backends/vulkan/runtime/vk_api/Device.h @@ -24,6 +24,7 @@ enum class DeviceType : uint32_t { MALI, ADRENO, SWIFTSHADER, + POWERVR, }; struct PhysicalDevice final {