Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions backends/vulkan/runtime/graph/ops/PrepackNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,65 @@ void PrepackNode::encode(ComputeGraph* graph) {

context->check_device_capabilities(shader_);

// For 1D width-packed tensors (e.g., conv2d bias), use a direct
// vkCmdCopyBufferToImage instead of the nchw_to_image compute shader.
//
// The nchw_to_image shader uses axis_map coordinate remapping that assumes
// multi-dimensional tensors. For 1D tensors padded to 4D with three fake
// dimensions of size 1, the coordinate math can produce out-of-bounds texture
// writes. Some GPU drivers (e.g., PowerVR) strictly validate texture bounds
// and silently drop these writes, causing bias values to never reach the
// texture. Other drivers wrap coordinates, masking the bug.
//
// The staging buffer already contains correctly ordered data for 1D
// width-packed tensors, so we can copy it directly without remapping.
const std::vector<int64_t> packed_sizes = graph->sizes_of(packed_);
const int32_t packed_dim = graph->packed_dim_of(packed_);
const bool is_1d_width_packed =
packed_sizes.size() == 1 && packed_dim == 0 && packed_sizes[0] % 4 == 0;

if (is_1d_width_packed) {
api::StagingBuffer staging = create_staging_buffer(graph);

graph->create_dedicated_allocation_for(packed_);
vTensorPtr tensor(graph, packed_);
vkapi::VulkanImage& image = tensor->image();
VkExtent3D extents = image.extents();

std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();

// Transition image layout for transfer destination
vkapi::PipelineBarrier transfer_barrier{};
transfer_barrier.stage.src = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
transfer_barrier.stage.dst = VK_PIPELINE_STAGE_TRANSFER_BIT;
transfer_barrier.images.emplace_back(
0,
VK_ACCESS_TRANSFER_WRITE_BIT,
VK_IMAGE_LAYOUT_UNDEFINED,
VK_IMAGE_LAYOUT_GENERAL,
image);
image.set_layout(VK_IMAGE_LAYOUT_GENERAL);

vkapi::CommandBuffer& cmd = context->extract_cmd();
cmd.insert_barrier(transfer_barrier);

VkBufferImageCopy region{};
region.bufferOffset = 0;
region.bufferRowLength = 0;
region.bufferImageHeight = 0;
region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
region.imageOffset = {0, 0, 0};
region.imageExtent = extents;

cmd.copy_buffer_to_image(
staging.buffer(),
image,
region,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);

return;
}

api::StagingBuffer staging = create_staging_buffer(graph);

std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
Expand Down
43 changes: 43 additions & 0 deletions backends/vulkan/runtime/vk_api/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,49 @@ void CommandBuffer::blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst) {
state_ = CommandBuffer::State::RECORDING;
}

// Records a vkCmdCopyBufferToImage that transfers the data described by
// `region` from the staging buffer `src` into `dst`, then records an image
// memory barrier transitioning `dst` to `dst_final_layout` so that subsequent
// compute-shader reads observe the copied data.
//
// Preconditions established by the checks/usage below:
//  - The command buffer must be in BARRIERS_INSERTED state, i.e. the caller
//    has already inserted a barrier that placed `dst` in a layout valid as a
//    transfer destination — the copy uses dst.layout() as its layout argument.
//  - NOTE(review): the post-copy barrier hardcodes COLOR aspect, mip level 0,
//    array layer 0 (count 1), so this assumes a single-mip, single-layer
//    color image — confirm this holds for all callers.
//  - NOTE(review): the destination stage is COMPUTE_SHADER, so the copied
//    image is assumed to be consumed next by a compute dispatch, not by
//    graphics or another transfer — confirm against callers.
void CommandBuffer::copy_buffer_to_image(
    vkapi::VulkanBuffer& src,
    vkapi::VulkanImage& dst,
    const VkBufferImageCopy& region,
    VkImageLayout dst_final_layout) {
  VK_CHECK_COND(
      state_ == CommandBuffer::State::BARRIERS_INSERTED,
      "Vulkan CommandBuffer: called copy_buffer_to_image() on a command buffer "
      "whose state is not BARRIERS_INSERTED.");

  // Record the copy using the image's currently tracked layout; the caller is
  // responsible for having transitioned `dst` to a transfer-writable layout.
  vkCmdCopyBufferToImage(
      handle_,
      src.handle(),
      dst.handle(),
      dst.layout(),
      1,
      &region);

  // Transition image to final layout via a post-copy barrier:
  // make the transfer write visible to shader reads and move the image from
  // its current (tracked) layout to `dst_final_layout`.
  VkImageMemoryBarrier post_barrier{};
  post_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  post_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
  post_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
  post_barrier.oldLayout = dst.layout();
  post_barrier.newLayout = dst_final_layout;
  // No queue-family ownership transfer is performed.
  post_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  post_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  post_barrier.image = dst.handle();
  post_barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};

  // Execution dependency: all transfer work before the barrier must complete
  // before any compute-shader work after it begins.
  vkCmdPipelineBarrier(
      handle_,
      VK_PIPELINE_STAGE_TRANSFER_BIT,
      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
      0,
      0, nullptr,
      0, nullptr,
      1, &post_barrier);

  // Keep the wrapper's tracked layout in sync with the GPU-side transition so
  // later barriers computed from dst.layout() use the correct oldLayout.
  dst.set_layout(dst_final_layout);
  // Return the command buffer to RECORDING, matching the pattern used by
  // blit() above (the next op must insert its own barriers).
  state_ = CommandBuffer::State::RECORDING;
}

void CommandBuffer::write_timestamp(VkQueryPool querypool, const uint32_t idx)
const {
VK_CHECK_COND(
Expand Down
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/vk_api/Command.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ class CommandBuffer final {
void insert_barrier(PipelineBarrier& pipeline_barrier);
void dispatch(const utils::uvec3&);
void blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst);
// Records a buffer-to-image copy of `region` from `src` into `dst`, followed
// by an image barrier transitioning `dst` to `dst_final_layout` for
// subsequent shader reads. Must be called while the command buffer is in the
// BARRIERS_INSERTED state; leaves it in the RECORDING state.
void copy_buffer_to_image(
    vkapi::VulkanBuffer& src,
    vkapi::VulkanImage& dst,
    const VkBufferImageCopy& region,
    VkImageLayout dst_final_layout);

void write_timestamp(VkQueryPool, const uint32_t) const;
void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const;
Expand Down
7 changes: 6 additions & 1 deletion backends/vulkan/runtime/vk_api/memory/Allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,12 @@ VulkanImage Allocator::create_image(
VulkanBuffer Allocator::create_staging_buffer(
const VkDeviceSize size,
const CopyDirection direction) {
const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
// TRANSFER_SRC allows staging buffers to be used as source for
// vkCmdCopyBufferToImage, needed for direct buffer-to-image copies
// (e.g., 1D tensor prepacking where compute shader coordinate remapping
// would produce incorrect results on some GPUs).
const VkBufferUsageFlags buffer_usage =
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

VmaAllocationCreateInfo alloc_create_info = {};
alloc_create_info.flags =
Expand Down
Loading