From 6974d3214a2dea27baba10de1bdde9d2be60ddd9 Mon Sep 17 00:00:00 2001 From: Abdelaziz Mahdy Date: Sat, 14 Feb 2026 19:01:46 -0400 Subject: [PATCH] [ET-VK] Use direct buffer-to-image copy for 1D tensor prepacking The nchw_to_image compute shader uses axis_map coordinate remapping designed for multi-dimensional tensors. For 1D tensors (e.g., conv2d bias) padded to 4D with three dimensions of size 1, the coordinate math produces out-of-bounds texture writes. Some GPU drivers (PowerVR) strictly validate texture bounds and silently drop these writes, causing bias values to never reach the texture. Other drivers wrap coordinates, masking the bug. For 1D width-packed tensors where the element count is divisible by 4, the staging buffer data is already in the correct layout for the target texture. This change bypasses the compute shader and uses vkCmdCopyBufferToImage to transfer the data directly. Also adds TRANSFER_SRC to staging buffer usage flags (harmless for existing paths) and a copy_buffer_to_image helper to CommandBuffer. --- .../vulkan/runtime/graph/ops/PrepackNode.cpp | 59 +++++++++++++++++++ backends/vulkan/runtime/vk_api/Command.cpp | 43 ++++++++++++++ backends/vulkan/runtime/vk_api/Command.h | 5 ++ .../runtime/vk_api/memory/Allocator.cpp | 7 ++- 4 files changed, 113 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index bb21f4b7c2b..a3e86671ce3 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -118,6 +118,65 @@ void PrepackNode::encode(ComputeGraph* graph) { context->check_device_capabilities(shader_); + // For 1D width-packed tensors (e.g., conv2d bias), use a direct + // vkCmdCopyBufferToImage instead of the nchw_to_image compute shader. + // + // The nchw_to_image shader uses axis_map coordinate remapping that assumes + // multi-dimensional tensors. 
For 1D tensors padded to 4D with three fake
+  // dimensions of size 1, the coordinate math can produce out-of-bounds texture
+  // writes. Some GPU drivers (e.g., PowerVR) strictly validate texture bounds
+  // and silently drop these writes, causing bias values to never reach the
+  // texture. Other drivers wrap coordinates, masking the bug.
+  //
+  // The staging buffer already contains correctly ordered data for 1D
+  // width-packed tensors, so we can copy it directly without remapping.
+  const std::vector<int64_t> packed_sizes = graph->sizes_of(packed_);
+  const int32_t packed_dim = graph->packed_dim_of(packed_);
+  const bool is_1d_width_packed =
+      packed_sizes.size() == 1 && packed_dim == 0 && packed_sizes[0] % 4 == 0;
+
+  if (is_1d_width_packed) {
+    api::StagingBuffer staging = create_staging_buffer(graph);
+
+    graph->create_dedicated_allocation_for(packed_);
+    vTensorPtr tensor(graph, packed_);
+    vkapi::VulkanImage& image = tensor->image();
+    VkExtent3D extents = image.extents();
+
+    std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
+
+    // Transition image layout for transfer destination
+    vkapi::PipelineBarrier transfer_barrier{};
+    transfer_barrier.stage.src = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    transfer_barrier.stage.dst = VK_PIPELINE_STAGE_TRANSFER_BIT;
+    transfer_barrier.images.emplace_back(
+        0,
+        VK_ACCESS_TRANSFER_WRITE_BIT,
+        VK_IMAGE_LAYOUT_UNDEFINED,
+        VK_IMAGE_LAYOUT_GENERAL,
+        image);
+    image.set_layout(VK_IMAGE_LAYOUT_GENERAL);
+
+    vkapi::CommandBuffer& cmd = context->extract_cmd();
+    cmd.insert_barrier(transfer_barrier);
+
+    VkBufferImageCopy region{};
+    region.bufferOffset = 0;
+    region.bufferRowLength = 0;
+    region.bufferImageHeight = 0;
+    region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+    region.imageOffset = {0, 0, 0};
+    region.imageExtent = extents;
+
+    cmd.copy_buffer_to_image(
+        staging.buffer(),
+        image,
+        region,
+        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+
+    return;
+  }
+
   api::StagingBuffer staging = 
create_staging_buffer(graph);
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp
index 84e1f68dc68..a5f4f15f05c 100644
--- a/backends/vulkan/runtime/vk_api/Command.cpp
+++ b/backends/vulkan/runtime/vk_api/Command.cpp
@@ -233,6 +233,49 @@ void CommandBuffer::blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst) {
   state_ = CommandBuffer::State::RECORDING;
 }
 
+void CommandBuffer::copy_buffer_to_image(
+    vkapi::VulkanBuffer& src,
+    vkapi::VulkanImage& dst,
+    const VkBufferImageCopy& region,
+    VkImageLayout dst_final_layout) {
+  VK_CHECK_COND(
+      state_ == CommandBuffer::State::BARRIERS_INSERTED,
+      "Vulkan CommandBuffer: called copy_buffer_to_image() on a command buffer "
+      "whose state is not BARRIERS_INSERTED.");
+
+  vkCmdCopyBufferToImage(
+      handle_,
+      src.handle(),
+      dst.handle(),
+      dst.layout(),
+      1,
+      &region);
+
+  // Transition image to final layout via a post-copy barrier
+  VkImageMemoryBarrier post_barrier{};
+  post_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  post_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  post_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  post_barrier.oldLayout = dst.layout();
+  post_barrier.newLayout = dst_final_layout;
+  post_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  post_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  post_barrier.image = dst.handle();
+  post_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+
+  vkCmdPipelineBarrier(
+      handle_,
+      VK_PIPELINE_STAGE_TRANSFER_BIT,
+      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+      0,
+      0, nullptr,
+      0, nullptr,
+      1, &post_barrier);
+
+  dst.set_layout(dst_final_layout);
+  state_ = CommandBuffer::State::RECORDING;
+}
+
 void CommandBuffer::write_timestamp(VkQueryPool querypool, const uint32_t idx) const {
   VK_CHECK_COND(
diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h
index 
ff1e5934a5c..06d357e752c 100644 --- a/backends/vulkan/runtime/vk_api/Command.h +++ b/backends/vulkan/runtime/vk_api/Command.h @@ -94,6 +94,11 @@ class CommandBuffer final { void insert_barrier(PipelineBarrier& pipeline_barrier); void dispatch(const utils::uvec3&); void blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst); + void copy_buffer_to_image( + vkapi::VulkanBuffer& src, + vkapi::VulkanImage& dst, + const VkBufferImageCopy& region, + VkImageLayout dst_final_layout); void write_timestamp(VkQueryPool, const uint32_t) const; void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 1d814533ede..ee3caccd9e6 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -144,7 +144,12 @@ VulkanImage Allocator::create_image( VulkanBuffer Allocator::create_staging_buffer( const VkDeviceSize size, const CopyDirection direction) { - const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + // TRANSFER_SRC allows staging buffers to be used as source for + // vkCmdCopyBufferToImage, needed for direct buffer-to-image copies + // (e.g., 1D tensor prepacking where compute shader coordinate remapping + // would produce incorrect results on some GPUs). + const VkBufferUsageFlags buffer_usage = + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; VmaAllocationCreateInfo alloc_create_info = {}; alloc_create_info.flags =