diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index bb21f4b7c2b..a3e86671ce3 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -118,6 +118,65 @@ void PrepackNode::encode(ComputeGraph* graph) {
   context->check_device_capabilities(shader_);
 
+  // For 1D width-packed tensors (e.g., conv2d bias), use a direct
+  // vkCmdCopyBufferToImage instead of the nchw_to_image compute shader.
+  //
+  // The nchw_to_image shader uses axis_map coordinate remapping that assumes
+  // multi-dimensional tensors. For 1D tensors padded to 4D with three fake
+  // dimensions of size 1, the coordinate math can produce out-of-bounds texture
+  // writes. Some GPU drivers (e.g., PowerVR) strictly validate texture bounds
+  // and silently drop these writes, causing bias values to never reach the
+  // texture. Other drivers wrap coordinates, masking the bug.
+  //
+  // The staging buffer already contains correctly ordered data for 1D
+  // width-packed tensors, so we can copy it directly without remapping.
+  const std::vector<int64_t> packed_sizes = graph->sizes_of(packed_);
+  const int32_t packed_dim = graph->packed_dim_of(packed_);
+  const bool is_1d_width_packed =
+      packed_sizes.size() == 1 && packed_dim == 0 && packed_sizes[0] % 4 == 0;
+
+  if (is_1d_width_packed) {
+    api::StagingBuffer staging = create_staging_buffer(graph);
+
+    graph->create_dedicated_allocation_for(packed_);
+    vTensorPtr tensor(graph, packed_);
+    vkapi::VulkanImage& image = tensor->image();
+    VkExtent3D extents = image.extents();
+
+    std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
+
+    // Transition to TRANSFER_DST_OPTIMAL, the preferred layout for a copy dst
+    vkapi::PipelineBarrier transfer_barrier{};
+    transfer_barrier.stage.src = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    transfer_barrier.stage.dst = VK_PIPELINE_STAGE_TRANSFER_BIT;
+    transfer_barrier.images.emplace_back(
+        0,
+        VK_ACCESS_TRANSFER_WRITE_BIT,
+        VK_IMAGE_LAYOUT_UNDEFINED,
+        VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+        image);
+    image.set_layout(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+    vkapi::CommandBuffer& cmd = context->extract_cmd();
+    cmd.insert_barrier(transfer_barrier);
+
+    VkBufferImageCopy region{};
+    region.bufferOffset = 0;
+    region.bufferRowLength = 0;
+    region.bufferImageHeight = 0;
+    region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+    region.imageOffset = {0, 0, 0};
+    region.imageExtent = extents;
+
+    cmd.copy_buffer_to_image(
+        staging.buffer(),
+        image,
+        region,
+        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+
+    return;
+  }
+
   api::StagingBuffer staging = create_staging_buffer(graph);
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp
index 84e1f68dc68..a5f4f15f05c 100644
--- a/backends/vulkan/runtime/vk_api/Command.cpp
+++ b/backends/vulkan/runtime/vk_api/Command.cpp
@@ -233,6 +233,49 @@ void CommandBuffer::blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst) {
   state_ = CommandBuffer::State::RECORDING;
 }
 
+void CommandBuffer::copy_buffer_to_image(
+    vkapi::VulkanBuffer& src,
+    vkapi::VulkanImage& dst,
+    const VkBufferImageCopy& region,
+    VkImageLayout dst_final_layout) {
+  VK_CHECK_COND(
+      state_ == CommandBuffer::State::BARRIERS_INSERTED,
+      "Vulkan CommandBuffer: called copy_buffer_to_image() on a command buffer "
+      "whose state is not BARRIERS_INSERTED.");
+
+  vkCmdCopyBufferToImage(
+      handle_,
+      src.handle(),
+      dst.handle(),
+      dst.layout(),
+      1,
+      &region);
+
+  // Transition image to final layout via a post-copy barrier
+  VkImageMemoryBarrier post_barrier{};
+  post_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  post_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  post_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  post_barrier.oldLayout = dst.layout();
+  post_barrier.newLayout = dst_final_layout;
+  post_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  post_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  post_barrier.image = dst.handle();
+  post_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+
+  vkCmdPipelineBarrier(
+      handle_,
+      VK_PIPELINE_STAGE_TRANSFER_BIT,
+      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+      0,
+      0, nullptr,
+      0, nullptr,
+      1, &post_barrier);
+
+  dst.set_layout(dst_final_layout);
+  state_ = CommandBuffer::State::RECORDING;
+}
+
 void CommandBuffer::write_timestamp(VkQueryPool querypool, const uint32_t idx)
     const {
   VK_CHECK_COND(
diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h
index ff1e5934a5c..06d357e752c 100644
--- a/backends/vulkan/runtime/vk_api/Command.h
+++ b/backends/vulkan/runtime/vk_api/Command.h
@@ -94,6 +94,11 @@ class CommandBuffer final {
   void insert_barrier(PipelineBarrier& pipeline_barrier);
   void dispatch(const utils::uvec3&);
   void blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst);
+  void copy_buffer_to_image(
+      vkapi::VulkanBuffer& src,
+      vkapi::VulkanImage& dst,
+      const VkBufferImageCopy& region,
+      VkImageLayout dst_final_layout);
   void write_timestamp(VkQueryPool, const uint32_t) const;
   void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const;
 
diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
index 1d814533ede..ee3caccd9e6 100644
--- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
+++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp
@@ -144,7 +144,12 @@ VulkanImage Allocator::create_image(
 VulkanBuffer Allocator::create_staging_buffer(
     const VkDeviceSize size,
     const CopyDirection direction) {
-  const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+  // TRANSFER_SRC allows staging buffers to be used as source for
+  // vkCmdCopyBufferToImage, needed for direct buffer-to-image copies
+  // (e.g., 1D tensor prepacking where compute shader coordinate remapping
+  // would produce incorrect results on some GPUs).
+  const VkBufferUsageFlags buffer_usage =
+      VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
 
   VmaAllocationCreateInfo alloc_create_info = {};
   alloc_create_info.flags =