From adfbc1c3186b01eee1837b358d4eb84fb77dee1f Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 06:29:29 +0900 Subject: [PATCH 01/28] reimport vbo change commit 1a3dbee99c9a2c362373707678d5657e59ea6827 Author: Masahiro Masuda Date: Sat Apr 10 09:19:06 2021 +0900 fix typo commit 706fb3edb65f39f1cbc0c49e784475f3257fd3c0 Author: Masahiro Masuda Date: Thu Apr 8 17:34:18 2021 +0900 doc update commit a75a5b03b22b46b1709373027913c6738f5b0eb0 Author: Masahiro Masuda Date: Thu Apr 8 17:15:48 2021 +0900 let vkmap/unmap allocate and delete host_buf commit 9a67f4a931a747b8b83fe3e919f47b3cf60bcaf8 Author: Masahiro Masuda Date: Thu Apr 8 16:51:58 2021 +0900 query push constant size using runtime API commit 95ec1dbec1b6b1ac9204d717161c7fcdfe766ed3 Author: Masahiro Masuda Date: Mon Mar 22 17:23:32 2021 +0900 fix cpplint and revert float64 change commit bfec9d343461b8cf63afe44bf4220309be45055b Author: Masahiro Masuda Date: Mon Mar 22 17:19:54 2021 +0900 introduce value kind for ubo commit 17597ae6507b24c056ef7aa99087c8ac514826ce Author: Masahiro Masuda Date: Sun Mar 21 13:41:23 2021 +0900 minor fix commit a5a97f4e397ad410dc2bdb5524a9e2388e207294 Author: Masahiro Masuda Date: Sun Mar 21 13:39:40 2021 +0900 refactored codegen commit 69f2d05cd74ec573d92a97e8596474a8e8ed9b18 Author: Masahiro Masuda Date: Sun Mar 21 13:25:07 2021 +0900 revert BufferArgument change commit fb27fbbaba1b4ea978da164d7d1b30c8ce0ca212 Author: Masahiro Masuda Date: Sun Mar 21 13:22:11 2021 +0900 formatting commit c1b1c888f26071a98a20bc4300d9be96676ec0e3 Author: Masahiro Masuda Date: Sun Mar 21 13:19:19 2021 +0900 cleaning up commit 6c046699dcc9a6ef9163a471589a09f15e11640b Author: Masahiro Masuda Date: Sun Mar 21 12:07:25 2021 +0900 remove log commit 436ff80bf07570145468805b1a4c5e76653b2a47 Author: Masahiro Masuda Date: Sun Mar 21 11:58:14 2021 +0900 cumsum and nms test working with ubo commit 7cfea184a5feb28228033752025304620d3d534b Author: Masahiro Masuda Date: Sun Mar 21 08:37:11 2021 
+0900 do not delete ubo when not using it commit 23b1f402471d9c26f615f31eec6f479158a85d41 Author: Masahiro Masuda Date: Sun Mar 21 08:25:48 2021 +0900 add more log commit a8de4593853e30569f2d6fde9c26d90529c57c25 Author: Masahiro Masuda Date: Sun Mar 21 08:20:27 2021 +0900 trying an approach similar to push constant commit 5f9f82dd74e51f316a9aeac5d455d1aec4981bab Author: Masahiro Masuda Date: Sun Mar 21 07:55:30 2021 +0900 do not use float64 commit 432ff24eb0432cee222d6813333049c7c836150e Author: Masahiro Masuda Date: Sat Mar 20 13:18:43 2021 +0900 refactor commit 665d5ff950415d9d407e472c5bd6315888aab79d Author: Masahiro Masuda Date: Sat Mar 20 12:57:39 2021 +0900 query memory type for uniform commit e1788b8d5134dcd6b7bde17b95caca0ed0b24edd Author: Masahiro Masuda Date: Sat Mar 20 04:34:44 2021 +0900 allocate and bind ubo commit 7d2ed2bed107f1dd1aba2bb6697665a9684b1441 Author: Masahiro Masuda Date: Fri Mar 19 11:20:19 2021 +0900 begin runtime change for UBO commit 4f5ca8cda0701b71a1d89369fc3798f92cef4857 Author: Masahiro Masuda Date: Fri Mar 19 11:05:55 2021 +0900 ubo codegen first cut --- src/runtime/vulkan/vulkan.cc | 268 +++++++++++++++++++---------- src/runtime/vulkan/vulkan_common.h | 3 + src/target/spirv/codegen_spirv.cc | 23 ++- src/target/spirv/ir_builder.cc | 31 +++- src/target/spirv/ir_builder.h | 32 +++- 5 files changed, 255 insertions(+), 102 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 5cd4812f41c4..c8a0858ec1bc 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -91,6 +91,11 @@ struct VulkanBuffer { VkDeviceMemory memory{VK_NULL_HANDLE}; }; +struct UniformBuffer { + VulkanBuffer* vk_buf; + void* host_buf; +}; + struct VulkanPipeline { VulkanContext* vctx_{nullptr}; VkShaderModule shader{VK_NULL_HANDLE}; @@ -100,10 +105,105 @@ struct VulkanPipeline { VkPipelineLayout pipeline_layout{VK_NULL_HANDLE}; VkPipeline pipeline{VK_NULL_HANDLE}; VkDescriptorUpdateTemplateKHR 
descriptor_update_template{VK_NULL_HANDLE}; + UniformBuffer ubo; }; typedef dmlc::ThreadLocalStore VulkanThreadStore; +uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, VkBuffer buffer, + VkMemoryPropertyFlags req_prop) { + VkMemoryRequirements mem_reqs; + vkGetBufferMemoryRequirements(logical_device, buffer, &mem_reqs); + uint32_t type_bits = mem_reqs.memoryTypeBits; + VkPhysicalDeviceMemoryProperties phy_mem_prop; + vkGetPhysicalDeviceMemoryProperties(phy_device, &phy_mem_prop); + for (uint32_t i = 0; i < phy_mem_prop.memoryTypeCount; i++) { + if ((type_bits & 1) == 1 && + (phy_mem_prop.memoryTypes[i].propertyFlags & req_prop) == req_prop) { + return i; + } + type_bits >>= 1; + } + LOG(FATAL) << "Requested memory type not found"; + return 0; +} + +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage) { + VkBufferCreateInfo info; + info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + info.pNext = nullptr; + info.flags = 0; + info.size = nbytes; + info.queueFamilyIndexCount = 1; + info.pQueueFamilyIndices = &(vctx.queue_family_index); + info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + info.usage = usage; + // create buffer + VkBuffer buffer; + VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); + + uint32_t mem_type_index = vctx.compute_mtype_index; + + if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + // Find a memory type that supports UBO + auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + mem_type_index = FindMemoryType(vctx.device, vctx.phy_device, buffer, prop); + } + + // bind to memory + bool dedicated_allocation = false; + VkMemoryRequirements2KHR req2; + + if (vctx.get_buffer_memory_requirements_2_functions) { + VkBufferMemoryRequirementsInfo2KHR req_info2; + req_info2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; + req_info2.pNext = 0; + req_info2.buffer = buffer; + + req2.sType = 
VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + req2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR dedicated_req; + dedicated_req.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + dedicated_req.pNext = 0; + req2.pNext = &dedicated_req; + + vctx.get_buffer_memory_requirements_2_functions->vkGetBufferMemoryRequirements2KHR( + vctx.device, &req_info2, &req2); + dedicated_allocation = + dedicated_req.requiresDedicatedAllocation || dedicated_req.prefersDedicatedAllocation; + } + + VkDeviceMemory memory; + if (!dedicated_allocation) { + VkMemoryAllocateInfo minfo; + minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + minfo.pNext = nullptr; + minfo.allocationSize = nbytes; + minfo.memoryTypeIndex = mem_type_index; + VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); + } else { + VkMemoryAllocateInfo minfo; + minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + minfo.pNext = nullptr; + minfo.allocationSize = req2.memoryRequirements.size; + minfo.memoryTypeIndex = mem_type_index; + + VkMemoryDedicatedAllocateInfoKHR mdinfo; + mdinfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; + mdinfo.pNext = 0; + mdinfo.image = 0; + mdinfo.buffer = buffer; + minfo.pNext = &mdinfo; + VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); + } + VULKAN_CALL(vkBindBufferMemory(vctx.device, buffer, memory, 0)); + VulkanBuffer* pbuf = new VulkanBuffer(); + pbuf->memory = memory; + pbuf->buffer = buffer; + return pbuf; +} + class VulkanDeviceAPI final : public DeviceAPI { public: VulkanDeviceAPI(); @@ -124,70 +224,9 @@ class VulkanDeviceAPI final : public DeviceAPI { nbytes = 1; } const auto& vctx = context(dev.device_id); - VkBufferCreateInfo info; - info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - info.pNext = nullptr; - info.flags = 0; - info.size = nbytes; - info.queueFamilyIndexCount = 1; - info.pQueueFamilyIndices = &(vctx.queue_family_index); - info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - info.usage = 
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - // create buffer - VkBuffer buffer; - VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); - // bind to memory - VkBufferMemoryRequirementsInfo2KHR req_info2; - req_info2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; - req_info2.pNext = 0; - req_info2.buffer = buffer; - - VkMemoryRequirements2KHR req2; - req2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; - req2.pNext = 0; - - VkMemoryDedicatedRequirementsKHR dedicated_req; - dedicated_req.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; - dedicated_req.pNext = 0; - req2.pNext = &dedicated_req; - - bool dedicated_allocation = false; - if (vctx.get_buffer_memory_requirements_2_functions) { - vctx.get_buffer_memory_requirements_2_functions->vkGetBufferMemoryRequirements2KHR( - vctx.device, &req_info2, &req2); - dedicated_allocation = - dedicated_req.requiresDedicatedAllocation || dedicated_req.prefersDedicatedAllocation; - } - - VkDeviceMemory memory; - if (!dedicated_allocation) { - VkMemoryAllocateInfo minfo; - minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - minfo.pNext = nullptr; - minfo.allocationSize = nbytes; - minfo.memoryTypeIndex = vctx.compute_mtype_index; - VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); - } else { - VkMemoryAllocateInfo minfo; - minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - minfo.pNext = nullptr; - minfo.allocationSize = req2.memoryRequirements.size; - minfo.memoryTypeIndex = vctx.compute_mtype_index; - - VkMemoryDedicatedAllocateInfoKHR mdinfo; - mdinfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; - mdinfo.pNext = 0; - mdinfo.image = 0; - mdinfo.buffer = buffer; - minfo.pNext = &mdinfo; - VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); - } - 
VULKAN_CALL(vkBindBufferMemory(vctx.device, buffer, memory, 0)); - VulkanBuffer* pbuf = new VulkanBuffer(); - pbuf->memory = memory; - pbuf->buffer = buffer; - return pbuf; + return CreateBuffer(vctx, nbytes, usage); } void FreeDataSpace(Device dev, void* ptr) final { @@ -747,7 +786,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { public: explicit VulkanModuleNode(std::unordered_map smap, std::unordered_map fmap, std::string source) - : smap_(smap), fmap_(fmap), source_(source) {} + : smap_(smap), fmap_(fmap), source_(source), max_push_constants_(GetMaxPushConstantsSize()) {} const char* type_key() const final { return "vulkan"; } @@ -781,6 +820,13 @@ class VulkanModuleNode final : public runtime::ModuleNode { vkDestroyDescriptorPool(vctx.device, pe->descriptor_pool, nullptr); vkDestroyDescriptorSetLayout(vctx.device, pe->descriptor_set_layout, nullptr); vkDestroyShaderModule(vctx.device, pe->shader, nullptr); + // UBO + if (pe->ubo.vk_buf) { + vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); + vkDestroyBuffer(vctx.device, pe->ubo.vk_buf->buffer, nullptr); + vkFreeMemory(vctx.device, pe->ubo.vk_buf->memory, nullptr); + delete pe->ubo.vk_buf; + } } } } @@ -812,30 +858,35 @@ class VulkanModuleNode final : public runtime::ModuleNode { std::vector arg_template; uint32_t num_pod = 0, num_buffer = 0; + auto push_arg_info = [&arg_binding, &arg_template](uint32_t binding, + VkDescriptorType desc_type) { + { + VkDescriptorSetLayoutBinding bd; + bd.binding = binding; + bd.descriptorType = desc_type; + bd.descriptorCount = 1; + bd.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + bd.pImmutableSamplers = nullptr; + arg_binding.push_back(bd); + } + { + VkDescriptorUpdateTemplateEntryKHR tpl; + tpl.dstBinding = binding; + tpl.dstArrayElement = 0; + tpl.descriptorCount = 1; + tpl.descriptorType = desc_type; + tpl.offset = binding * sizeof(VkDescriptorBufferInfo); + tpl.stride = sizeof(VkDescriptorBufferInfo); + arg_template.push_back(tpl); + } + }; + { auto fit = 
fmap_.find(func_name); ICHECK(fit != fmap_.end()); for (DLDataType arg_type : fit->second.arg_types) { if (arg_type.code == kTVMOpaqueHandle) { - { - VkDescriptorSetLayoutBinding bd; - bd.binding = num_buffer; - bd.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - bd.descriptorCount = 1; - bd.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - bd.pImmutableSamplers = nullptr; - arg_binding.push_back(bd); - } - { - VkDescriptorUpdateTemplateEntryKHR tpl; - tpl.dstBinding = num_buffer; - tpl.dstArrayElement = 0; - tpl.descriptorCount = 1; - tpl.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - tpl.offset = num_buffer * sizeof(VkDescriptorBufferInfo); - tpl.stride = sizeof(VkDescriptorBufferInfo); - arg_template.push_back(tpl); - } + push_arg_info(num_buffer, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); ++num_buffer; } else { ++num_pod; @@ -843,6 +894,11 @@ class VulkanModuleNode final : public runtime::ModuleNode { } } + size_t nbytes_scalars = num_pod * sizeof(ArgUnion64); + if (nbytes_scalars > max_push_constants_) { + push_arg_info(num_buffer, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + } + { VkDescriptorSetLayoutCreateInfo descrip_cinfo; descrip_cinfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; @@ -894,7 +950,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { playout_cinfo.setLayoutCount = 1; playout_cinfo.pSetLayouts = &(pe->descriptor_set_layout); - if (num_pack_args != 0) { + if (0 < nbytes_scalars && nbytes_scalars <= max_push_constants_) { playout_cinfo.pushConstantRangeCount = 1; playout_cinfo.pPushConstantRanges = &crange; ICHECK_LE(crange.size, vctx.phy_device_prop.limits.maxPushConstantsSize); @@ -923,6 +979,13 @@ class VulkanModuleNode final : public runtime::ModuleNode { VULKAN_CALL(vkCreateComputePipelines(vctx.device, VK_NULL_HANDLE, 1, &pipeline_cinfo, nullptr, &(pe->pipeline))); + if (nbytes_scalars > max_push_constants_) { + // Allocate, bind and map UBO + UniformBuffer& ubo = pe->ubo; + ubo.vk_buf = CreateBuffer(vctx, 
nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); + } + if (vctx.UseImmediate()) { VkDescriptorUpdateTemplateCreateInfoKHR descrip_template_cinfo; descrip_template_cinfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; @@ -966,6 +1029,8 @@ class VulkanModuleNode final : public runtime::ModuleNode { return source_; } + uint32_t MaxPushConstantsSize() const { return max_push_constants_; } + private: // function information table. std::unordered_map smap_; @@ -975,6 +1040,8 @@ class VulkanModuleNode final : public runtime::ModuleNode { std::string fmt_{"vulkan"}; // The source std::string source_; + // The maximum size of push constants in bytes + const uint32_t max_push_constants_; // Guards accesses to `ecache_` std::mutex mutex_; @@ -1076,6 +1143,17 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, binfo.range = VK_WHOLE_SIZE; descriptor_buffers[i] = binfo; } + const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); + bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > m_->MaxPushConstantsSize(); + if (use_ubo) { + CHECK(pipeline->ubo.host_buf) << "The UBO host buffer is not allocated"; + memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); + VkDescriptorBufferInfo binfo; + binfo.buffer = pipeline->ubo.vk_buf->buffer; + binfo.offset = 0; + binfo.range = VK_WHOLE_SIZE; + descriptor_buffers.push_back(binfo); + } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. 
VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { @@ -1084,7 +1162,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vctx.descriptor_template_khr_functions->vkCmdPushDescriptorSetWithTemplateKHR( state->cmd_buffer_, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptor_buffers.data()); - if (num_pack_args_ != 0) { + if (num_pack_args_ > 0 && num_pack_args_ <= m_->MaxPushConstantsSize()) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64), pack_args); @@ -1105,7 +1183,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, // Otherwise, the more expensive deferred path. std::vector pack_args_storage(pack_args, pack_args + num_pack_args_); - const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers]() { + const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers, use_ubo]() { std::vector write_descriptor_sets; write_descriptor_sets.resize(descriptor_buffers.size()); for (size_t i = 0; i < write_descriptor_sets.size(); i++) { @@ -1115,20 +1193,26 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, write_descriptor_sets[i].dstBinding = i; write_descriptor_sets[i].dstArrayElement = 0; write_descriptor_sets[i].descriptorCount = 1; - write_descriptor_sets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; write_descriptor_sets[i].pImageInfo = 0; write_descriptor_sets[i].pBufferInfo = &(descriptor_buffers[i]); write_descriptor_sets[i].pTexelBufferView = 0; + + if (use_ubo && i == write_descriptor_sets.size() - 1) { + // The last binding is for UBO + write_descriptor_sets[i].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + } else { + write_descriptor_sets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + } } vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - const auto& 
deferred_kernel = [pipeline, wl, pack_args_storage](VulkanStreamState* state) { + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, nullptr); - if (pack_args_storage.size() != 0) { + if (num_pack_args_ > 0 && num_pack_args_ <= m_->MaxPushConstantsSize()) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, pack_args_storage.size() * sizeof(ArgUnion64), pack_args_storage.data()); @@ -1183,6 +1267,12 @@ Module VulkanModuleLoadBinary(void* strm) { return VulkanModuleCreate(smap, fmap, ""); } +uint32_t GetMaxPushConstantsSize() { + int device_id = VulkanThreadEntry::ThreadLocal()->device.device_id; + const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); + return vctx.phy_device_prop.limits.maxPushConstantsSize; +} + TVM_REGISTER_GLOBAL("runtime.module.loadfile_vulkan").set_body_typed(VulkanModuleLoadFile); TVM_REGISTER_GLOBAL("runtime.module.loadbinary_vulkan").set_body_typed(VulkanModuleLoadBinary); diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 3083ba6f9ce4..e94a9fe7fa90 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -142,6 +142,9 @@ struct VulkanContext { bool UseImmediate() const { return descriptor_template_khr_functions.get() != nullptr; } }; +/*! 
\brief returns maximum push constant sizes in bytes for the target platform */ +uint32_t GetMaxPushConstantsSize(); + } // namespace vulkan } // namespace runtime } // namespace tvm diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 24608ebc93f4..4d55f4c49a5f 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -30,6 +30,9 @@ #include +#include "../../runtime/pack_args.h" +#include "../../runtime/vulkan/vulkan_common.h" + namespace tvm { namespace codegen { @@ -66,16 +69,26 @@ std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: spirv::Value func_ptr = builder_->NewFunction(); builder_->StartFunction(func_ptr); - // All the POD arguments are passed in through PushConstant if (pod_args.size() != 0) { std::vector value_types; for (size_t i = 0; i < pod_args.size(); ++i) { value_types.push_back(builder_->GetSType(pod_args[i].dtype())); } - spirv::Value ptr = builder_->DeclarePushConstant(value_types); - for (size_t i = 0; i < pod_args.size(); ++i) { - spirv::Value value = builder_->GetPushConstant(ptr, value_types[i], static_cast(i)); - var_map_[pod_args[i].get()] = value; + const auto max_push_constants = runtime::vulkan::GetMaxPushConstantsSize(); + if (pod_args.size() * sizeof(runtime::ArgUnion64) <= max_push_constants) { + spirv::Value ptr = builder_->DeclarePushConstant(value_types); + for (size_t i = 0; i < pod_args.size(); ++i) { + spirv::Value value = + builder_->GetPushConstant(ptr, value_types[i], static_cast(i)); + var_map_[pod_args[i].get()] = value; + } + } else { + // If we need to pass more arguments than push constants could handle, we use UBO. 
+ spirv::Value ptr = builder_->DeclareUniformBuffer(value_types, num_buffer); + for (size_t i = 0; i < pod_args.size(); ++i) { + spirv::Value value = builder_->GetUniform(ptr, value_types[i], static_cast(i)); + var_map_[pod_args[i].get()] = value; + } } } this->VisitStmt(f->body); diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc index 5a1457387ae5..cd48c93530ec 100644 --- a/src/target/spirv/ir_builder.cc +++ b/src/target/spirv/ir_builder.cc @@ -205,8 +205,8 @@ Value IRBuilder::BufferArgument(const SType& value_type, uint32_t descriptor_set return val; } -Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { - ICHECK_EQ(push_const_.id, 0); +Value IRBuilder::DeclareStorageVariable(const std::vector& value_types, + spv::StorageClass storage_class, ValueKind kind) { SType struct_type; struct_type.id = id_counter_++; struct_type.type = DataType::Handle(); @@ -226,22 +226,26 @@ Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { ICHECK_EQ(nbits % 8, 0); uint32_t bytes = (nbits / 8); if (t.bits() == 32) { - // In our Vulkan runtime, each push constant always occupies 64 bit. + // In our Vulkan runtime, each scalar argument always occupies 64 bit. 
offset += bytes * 2; } else { ICHECK_EQ(t.bits(), 64); offset += bytes; } } - // Decorate push constants as UBO this->Decorate(spv::OpDecorate, struct_type, spv::DecorationBlock); - SType ptr_type = GetPointerType(struct_type, spv::StorageClassPushConstant); - Value val = NewValue(ptr_type, kPushConstantPtr); - ib_.Begin(spv::OpVariable).AddSeq(ptr_type, val, spv::StorageClassPushConstant).Commit(&global_); + SType ptr_type = GetPointerType(struct_type, storage_class); + Value val = NewValue(ptr_type, kind); + ib_.Begin(spv::OpVariable).AddSeq(ptr_type, val, storage_class).Commit(&global_); return val; } +Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { + ICHECK_EQ(push_const_.id, 0); + return DeclareStorageVariable(value_types, spv::StorageClassPushConstant, kPushConstantPtr); +} + Value IRBuilder::GetPushConstant(Value ptr_push_const, const SType& v_type, uint32_t index) { SType ptr_vtype = this->GetPointerType(v_type, spv::StorageClassPushConstant); Value ptr = this->MakeValue(spv::OpAccessChain, ptr_vtype, ptr_push_const, @@ -249,6 +253,19 @@ Value IRBuilder::GetPushConstant(Value ptr_push_const, const SType& v_type, uint return this->MakeValue(spv::OpLoad, v_type, ptr); } +Value IRBuilder::DeclareUniformBuffer(const std::vector& value_types, uint32_t binding) { + Value val = DeclareStorageVariable(value_types, spv::StorageClassUniform, kUniformPtr); + this->Decorate(spv::OpDecorate, val, spv::DecorationBinding, binding); + return val; +} + +Value IRBuilder::GetUniform(Value ptr_push_const, const SType& v_type, uint32_t index) { + SType ptr_vtype = this->GetPointerType(v_type, spv::StorageClassUniform); + Value ptr = this->MakeValue(spv::OpAccessChain, ptr_vtype, ptr_push_const, + IntImm(t_int32_, static_cast(index))); + return this->MakeValue(spv::OpLoad, v_type, ptr); +} + Value IRBuilder::NewFunction() { return NewValue(t_void_func_, kFunction); } void IRBuilder::CommitKernelFunction(const Value& func, const std::string& name) { diff 
--git a/src/target/spirv/ir_builder.h b/src/target/spirv/ir_builder.h index 8a08048e1955..05a2bc631743 100644 --- a/src/target/spirv/ir_builder.h +++ b/src/target/spirv/ir_builder.h @@ -60,7 +60,8 @@ enum ValueKind { kStructArrayPtr, kPushConstantPtr, kFunction, - kExtInst + kExtInst, + kUniformPtr }; /*! \brief Represent the SPIRV Value */ @@ -473,6 +474,7 @@ class IRBuilder { * \param The argument type. */ Value BufferArgument(const SType& value_type, uint32_t descriptor_set, uint32_t binding); + /*! * \brief Declare POD arguments through push constants. * @@ -488,6 +490,23 @@ class IRBuilder { * \return the value of push constant */ Value GetPushConstant(Value ptr_push_const, const SType& v_type, uint32_t index); + + /*! + * \brief Declare POD arguments through uniform buffer. + * + * \note Only call this function once! + * \param value_types The values in the uniform buffer + * \param binding The binding location in descriptor set + * \return The pointer value of the declared uniform buffer. + */ + Value DeclareUniformBuffer(const std::vector& value_types, uint32_t binding); + /*! + * \brief Get i-th uniform constant + * \param v_type The value type + * \param index The uniform index + * \return the value of uniform constant + */ + Value GetUniform(Value ptr_ubo, const SType& v_type, uint32_t index); /*! * \brief Declare a new function * \return The created function ID. @@ -555,6 +574,17 @@ class IRBuilder { val.flag = flag; return val; } + + /*! 
+ * \brief The common function to declare push constants or uniform buffer + * \param value_types The values in the push constants or uniform buffer + * \param storage_class An enum defined by SPIR-V indicating push constant or uniform + * \param kind An enum indicating push constant or uniform + * \return The pointer value of the declared variable + */ + Value DeclareStorageVariable(const std::vector& value_types, + spv::StorageClass storage_class, ValueKind kind); + // get constant given value encoded in uint64_t Value GetConst_(const SType& dtype, const uint64_t* pvalue); // declare type From 6db67afe9f44d1626c608d781077defb216fab38 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 06:41:13 +0900 Subject: [PATCH 02/28] deferred memcpy --- src/runtime/vulkan/vulkan.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index c8a0858ec1bc..89cd98e4c906 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -1147,7 +1147,6 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > m_->MaxPushConstantsSize(); if (use_ubo) { CHECK(pipeline->ubo.host_buf) << "The UBO host buffer is not allocated"; - memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); VkDescriptorBufferInfo binfo; binfo.buffer = pipeline->ubo.vk_buf->buffer; binfo.offset = 0; @@ -1156,6 +1155,9 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. 
+ if (use_ubo) { + memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); + } VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); ICHECK(pipeline->descriptor_update_template != VK_NULL_HANDLE); @@ -1207,7 +1209,11 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage](VulkanStreamState* state) { + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, + nbytes_scalars](VulkanStreamState* state) { + if (use_ubo) { + memcpy(pipeline->ubo.host_buf, pack_args_storage.data(), nbytes_scalars); + } vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, From 510425cfdee665ebd58fbde28bde75f1a893205a Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 07:11:12 +0900 Subject: [PATCH 03/28] refactoring CreateBuffer and FindMemoryType --- src/runtime/vulkan/vulkan.cc | 48 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 89cd98e4c906..8b20eddf63ee 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -110,13 +110,16 @@ struct VulkanPipeline { typedef dmlc::ThreadLocalStore VulkanThreadStore; -uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, VkBuffer buffer, - VkMemoryPropertyFlags req_prop) { +int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, + VkMemoryPropertyFlags req_prop) { + VkBuffer buffer; + VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, 
&buffer)); + VkMemoryRequirements mem_reqs; - vkGetBufferMemoryRequirements(logical_device, buffer, &mem_reqs); + vkGetBufferMemoryRequirements(vctx.device, buffer, &mem_reqs); uint32_t type_bits = mem_reqs.memoryTypeBits; VkPhysicalDeviceMemoryProperties phy_mem_prop; - vkGetPhysicalDeviceMemoryProperties(phy_device, &phy_mem_prop); + vkGetPhysicalDeviceMemoryProperties(vctx.phy_device, &phy_mem_prop); for (uint32_t i = 0; i < phy_mem_prop.memoryTypeCount; i++) { if ((type_bits & 1) == 1 && (phy_mem_prop.memoryTypes[i].propertyFlags & req_prop) == req_prop) { @@ -124,11 +127,12 @@ uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, Vk } type_bits >>= 1; } - LOG(FATAL) << "Requested memory type not found"; - return 0; + LOG(INFO) << "Requested memory type not found"; + return -1; } -VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage) { +VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes, + VkBufferUsageFlags usage) { VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; info.pNext = nullptr; @@ -138,18 +142,15 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsa info.pQueueFamilyIndices = &(vctx.queue_family_index); info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; info.usage = usage; + return info; +} + +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, + uint32_t mem_type_index) { // create buffer VkBuffer buffer; VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); - uint32_t mem_type_index = vctx.compute_mtype_index; - - if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - // Find a memory type that supports UBO - auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - mem_type_index = FindMemoryType(vctx.device, vctx.phy_device, buffer, prop); - } - // bind to memory bool dedicated_allocation = false; VkMemoryRequirements2KHR req2; @@ 
-179,7 +180,7 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsa VkMemoryAllocateInfo minfo; minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; minfo.pNext = nullptr; - minfo.allocationSize = nbytes; + minfo.allocationSize = info.size; minfo.memoryTypeIndex = mem_type_index; VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); } else { @@ -226,7 +227,8 @@ class VulkanDeviceAPI final : public DeviceAPI { const auto& vctx = context(dev.device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - return CreateBuffer(vctx, nbytes, usage); + auto info = MakeBufferCreateInfo(vctx, nbytes, usage); + return CreateBuffer(vctx, info, vctx.compute_mtype_index); } void FreeDataSpace(Device dev, void* ptr) final { @@ -982,8 +984,16 @@ class VulkanModuleNode final : public runtime::ModuleNode { if (nbytes_scalars > max_push_constants_) { // Allocate, bind and map UBO UniformBuffer& ubo = pe->ubo; - ubo.vk_buf = CreateBuffer(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); - vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); + // Find a memory type that supports UBO + auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + auto mem_type_index = FindMemoryType(vctx, info, prop); + if (mem_type_index == -1) { + ubo.vk_buf = CreateBuffer(vctx, info, vctx.compute_mtype_index); + } else { + ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); + vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); + } } if (vctx.UseImmediate()) { From cec63f91a2cde84d3163b3c7a885854c274e1bda Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 07:16:08 +0900 Subject: [PATCH 04/28] add case stub for the case when host visible buffer is not available --- 
src/runtime/vulkan/vulkan.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 8b20eddf63ee..4b69bc4c2145 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -824,7 +824,9 @@ class VulkanModuleNode final : public runtime::ModuleNode { vkDestroyShaderModule(vctx.device, pe->shader, nullptr); // UBO if (pe->ubo.vk_buf) { - vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); + if (pe->ubo.host_buf) { + vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); + } vkDestroyBuffer(vctx.device, pe->ubo.vk_buf->buffer, nullptr); vkFreeMemory(vctx.device, pe->ubo.vk_buf->memory, nullptr); delete pe->ubo.vk_buf; @@ -989,6 +991,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); auto mem_type_index = FindMemoryType(vctx, info, prop); if (mem_type_index == -1) { + // If host visible memory is not found, use a normal storage buffer ubo.vk_buf = CreateBuffer(vctx, info, vctx.compute_mtype_index); } else { ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); @@ -1165,8 +1168,10 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. 
- if (use_ubo) { + if (use_ubo && pipeline->ubo.host_buf) { memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); + } else if (use_ubo) { + // TODO } VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); @@ -1221,9 +1226,12 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, }; const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, nbytes_scalars](VulkanStreamState* state) { - if (use_ubo) { + if (use_ubo && pipeline->ubo.host_buf) { memcpy(pipeline->ubo.host_buf, pack_args_storage.data(), nbytes_scalars); + } else if (use_ubo) { + // TODO } + vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, From 411c780324a6f24cb8d763ebf57e8635c1bf7b64 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 08:00:54 +0900 Subject: [PATCH 05/28] segfault on host to device copy --- src/runtime/vulkan/vulkan.cc | 106 +++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 4b69bc4c2145..77afb89215b6 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -205,6 +205,46 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, return pbuf; } +void CopyFromHostToDevice(VkDevice device, const void* from, size_t from_offset, void* to, + size_t to_offset, size_t size, int vk_device_id, int cpu_device_id, + bool coherent_staging) { + const auto* to_buf = static_cast(to); + VulkanStagingBuffer* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(vk_device_id, size); + memcpy(temp->host_addr, static_cast(from) + from_offset, size); + // host side flush if access is not 
coherent. + // so writes from CPU is visible to GPU + if (!coherent_staging) { + VkMappedMemoryRange mrange; + mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mrange.pNext = nullptr; + mrange.memory = temp->memory; + mrange.offset = 0; + mrange.size = VK_WHOLE_SIZE; // size; + VULKAN_CALL(vkFlushMappedMemoryRanges(device, 1, &mrange)); + } + + VulkanThreadEntry::ThreadLocal()->Stream(cpu_device_id)->Launch([&](VulkanStreamState* state) { + // 0: barrier(host->transfer) + VkMemoryBarrier barrier_info; + barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; + barrier_info.pNext = nullptr; + barrier_info.srcAccessMask = 0; + barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0, + nullptr); + // 1: copy + VkBufferCopy copy_info; + copy_info.srcOffset = 0; + copy_info.dstOffset = to_offset; + copy_info.size = size; + vkCmdCopyBuffer(state->cmd_buffer_, temp->buffer, to_buf->buffer, 1, ©_info); + }); + // TODO(tulloch): should we instead make the staging buffer a property of the + // Stream? This would allow us to elide synchronizations here. + VulkanThreadEntry::ThreadLocal()->Stream(cpu_device_id)->Synchronize(); +} + class VulkanDeviceAPI final : public DeviceAPI { public: VulkanDeviceAPI(); @@ -308,44 +348,8 @@ class VulkanDeviceAPI final : public DeviceAPI { memcpy(static_cast(to) + to_offset, static_cast(temp->host_addr), size); } else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) { const auto& vctx = context(dev_to.device_id); - const auto* to_buf = static_cast(to); - VulkanStagingBuffer* temp = - VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_to.device_id, size); - memcpy(temp->host_addr, static_cast(from) + from_offset, size); - // host side flush if access is not coherent. 
- // so writes from CPU is visible to GPU - if (!vctx.coherent_staging) { - VkMappedMemoryRange mrange; - mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - mrange.pNext = nullptr; - mrange.memory = temp->memory; - mrange.offset = 0; - mrange.size = VK_WHOLE_SIZE; // size; - VULKAN_CALL(vkFlushMappedMemoryRanges(vctx.device, 1, &mrange)); - } - - VulkanThreadEntry::ThreadLocal() - ->Stream(dev_from.device_id) - ->Launch([&](VulkanStreamState* state) { - // 0: barrier(host->transfer) - VkMemoryBarrier barrier_info; - barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - barrier_info.pNext = nullptr; - barrier_info.srcAccessMask = 0; - barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0, - nullptr); - // 1: copy - VkBufferCopy copy_info; - copy_info.srcOffset = 0; - copy_info.dstOffset = to_offset; - copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, temp->buffer, to_buf->buffer, 1, ©_info); - }); - // TODO(tulloch): should we instead make the staging buffer a property of the - // Stream? This would allow us to elide synchronizations here. 
- VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); + CopyFromHostToDevice(vctx.device, from, from_offset, to, to_offset, size, dev_to.device_id, + dev_from.device_id, vctx.coherent_staging); } else { LOG(FATAL) << "Expect copy from/to Vulkan or between Vulkan" << ", from=" << from_dev_type << ", to=" << to_dev_type; @@ -824,9 +828,9 @@ class VulkanModuleNode final : public runtime::ModuleNode { vkDestroyShaderModule(vctx.device, pe->shader, nullptr); // UBO if (pe->ubo.vk_buf) { - if (pe->ubo.host_buf) { - vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); - } + if (pe->ubo.host_buf) { + vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); + } vkDestroyBuffer(vctx.device, pe->ubo.vk_buf->buffer, nullptr); vkFreeMemory(vctx.device, pe->ubo.vk_buf->memory, nullptr); delete pe->ubo.vk_buf; @@ -990,9 +994,11 @@ class VulkanModuleNode final : public runtime::ModuleNode { auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); auto mem_type_index = FindMemoryType(vctx, info, prop); - if (mem_type_index == -1) { - // If host visible memory is not found, use a normal storage buffer - ubo.vk_buf = CreateBuffer(vctx, info, vctx.compute_mtype_index); + if (true || mem_type_index == -1) { + // If host visible memory is not found, use a normal storage buffer + auto usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + ubo.vk_buf = CreateBuffer(vctx, MakeBufferCreateInfo(vctx, nbytes_scalars, usage), + vctx.compute_mtype_index); } else { ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); @@ -1159,7 +1165,6 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > 
m_->MaxPushConstantsSize(); if (use_ubo) { - CHECK(pipeline->ubo.host_buf) << "The UBO host buffer is not allocated"; VkDescriptorBufferInfo binfo; binfo.buffer = pipeline->ubo.vk_buf->buffer; binfo.offset = 0; @@ -1224,12 +1229,17 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, - nbytes_scalars](VulkanStreamState* state) { + bool coherent_staging = vctx.coherent_staging; + VkDevice device = vctx.device; + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, nbytes_scalars, + device_id, coherent_staging, device](VulkanStreamState* state) { if (use_ubo && pipeline->ubo.host_buf) { memcpy(pipeline->ubo.host_buf, pack_args_storage.data(), nbytes_scalars); } else if (use_ubo) { - // TODO + // TODO(masahi): Is this ok + int cpu_device_id = 0; + CopyFromHostToDevice(device, pack_args_storage.data(), 0, pipeline->ubo.vk_buf, 0, + nbytes_scalars, cpu_device_id, device_id, coherent_staging); } vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); From f9deb3e19a43a1ce6e35615a6abb82e8f5f4f9f3 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 10:03:11 +0900 Subject: [PATCH 06/28] Revert changes for storage buffer backed uniform This reverts commit 3a3d06818a56a5df339dfdd214486dd7623f3897. 
--- src/runtime/vulkan/vulkan.cc | 154 ++++++++++++++--------------------- 1 file changed, 63 insertions(+), 91 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 77afb89215b6..89cd98e4c906 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -110,16 +110,13 @@ struct VulkanPipeline { typedef dmlc::ThreadLocalStore VulkanThreadStore; -int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, - VkMemoryPropertyFlags req_prop) { - VkBuffer buffer; - VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); - +uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, VkBuffer buffer, + VkMemoryPropertyFlags req_prop) { VkMemoryRequirements mem_reqs; - vkGetBufferMemoryRequirements(vctx.device, buffer, &mem_reqs); + vkGetBufferMemoryRequirements(logical_device, buffer, &mem_reqs); uint32_t type_bits = mem_reqs.memoryTypeBits; VkPhysicalDeviceMemoryProperties phy_mem_prop; - vkGetPhysicalDeviceMemoryProperties(vctx.phy_device, &phy_mem_prop); + vkGetPhysicalDeviceMemoryProperties(phy_device, &phy_mem_prop); for (uint32_t i = 0; i < phy_mem_prop.memoryTypeCount; i++) { if ((type_bits & 1) == 1 && (phy_mem_prop.memoryTypes[i].propertyFlags & req_prop) == req_prop) { @@ -127,12 +124,11 @@ int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, } type_bits >>= 1; } - LOG(INFO) << "Requested memory type not found"; - return -1; + LOG(FATAL) << "Requested memory type not found"; + return 0; } -VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes, - VkBufferUsageFlags usage) { +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage) { VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; info.pNext = nullptr; @@ -142,15 +138,18 @@ VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes info.pQueueFamilyIndices = &(vctx.queue_family_index); 
info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; info.usage = usage; - return info; -} - -VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, - uint32_t mem_type_index) { // create buffer VkBuffer buffer; VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); + uint32_t mem_type_index = vctx.compute_mtype_index; + + if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + // Find a memory type that supports UBO + auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + mem_type_index = FindMemoryType(vctx.device, vctx.phy_device, buffer, prop); + } + // bind to memory bool dedicated_allocation = false; VkMemoryRequirements2KHR req2; @@ -180,7 +179,7 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, VkMemoryAllocateInfo minfo; minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; minfo.pNext = nullptr; - minfo.allocationSize = info.size; + minfo.allocationSize = nbytes; minfo.memoryTypeIndex = mem_type_index; VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); } else { @@ -205,46 +204,6 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, return pbuf; } -void CopyFromHostToDevice(VkDevice device, const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, int vk_device_id, int cpu_device_id, - bool coherent_staging) { - const auto* to_buf = static_cast(to); - VulkanStagingBuffer* temp = VulkanThreadEntry::ThreadLocal()->StagingBuffer(vk_device_id, size); - memcpy(temp->host_addr, static_cast(from) + from_offset, size); - // host side flush if access is not coherent. 
- // so writes from CPU is visible to GPU - if (!coherent_staging) { - VkMappedMemoryRange mrange; - mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - mrange.pNext = nullptr; - mrange.memory = temp->memory; - mrange.offset = 0; - mrange.size = VK_WHOLE_SIZE; // size; - VULKAN_CALL(vkFlushMappedMemoryRanges(device, 1, &mrange)); - } - - VulkanThreadEntry::ThreadLocal()->Stream(cpu_device_id)->Launch([&](VulkanStreamState* state) { - // 0: barrier(host->transfer) - VkMemoryBarrier barrier_info; - barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - barrier_info.pNext = nullptr; - barrier_info.srcAccessMask = 0; - barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0, - nullptr); - // 1: copy - VkBufferCopy copy_info; - copy_info.srcOffset = 0; - copy_info.dstOffset = to_offset; - copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, temp->buffer, to_buf->buffer, 1, ©_info); - }); - // TODO(tulloch): should we instead make the staging buffer a property of the - // Stream? This would allow us to elide synchronizations here. 
- VulkanThreadEntry::ThreadLocal()->Stream(cpu_device_id)->Synchronize(); -} - class VulkanDeviceAPI final : public DeviceAPI { public: VulkanDeviceAPI(); @@ -267,8 +226,7 @@ class VulkanDeviceAPI final : public DeviceAPI { const auto& vctx = context(dev.device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - auto info = MakeBufferCreateInfo(vctx, nbytes, usage); - return CreateBuffer(vctx, info, vctx.compute_mtype_index); + return CreateBuffer(vctx, nbytes, usage); } void FreeDataSpace(Device dev, void* ptr) final { @@ -348,8 +306,44 @@ class VulkanDeviceAPI final : public DeviceAPI { memcpy(static_cast(to) + to_offset, static_cast(temp->host_addr), size); } else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) { const auto& vctx = context(dev_to.device_id); - CopyFromHostToDevice(vctx.device, from, from_offset, to, to_offset, size, dev_to.device_id, - dev_from.device_id, vctx.coherent_staging); + const auto* to_buf = static_cast(to); + VulkanStagingBuffer* temp = + VulkanThreadEntry::ThreadLocal()->StagingBuffer(dev_to.device_id, size); + memcpy(temp->host_addr, static_cast(from) + from_offset, size); + // host side flush if access is not coherent. 
+ // so writes from CPU is visible to GPU + if (!vctx.coherent_staging) { + VkMappedMemoryRange mrange; + mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mrange.pNext = nullptr; + mrange.memory = temp->memory; + mrange.offset = 0; + mrange.size = VK_WHOLE_SIZE; // size; + VULKAN_CALL(vkFlushMappedMemoryRanges(vctx.device, 1, &mrange)); + } + + VulkanThreadEntry::ThreadLocal() + ->Stream(dev_from.device_id) + ->Launch([&](VulkanStreamState* state) { + // 0: barrier(host->transfer) + VkMemoryBarrier barrier_info; + barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; + barrier_info.pNext = nullptr; + barrier_info.srcAccessMask = 0; + barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT, + VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0, + nullptr); + // 1: copy + VkBufferCopy copy_info; + copy_info.srcOffset = 0; + copy_info.dstOffset = to_offset; + copy_info.size = size; + vkCmdCopyBuffer(state->cmd_buffer_, temp->buffer, to_buf->buffer, 1, ©_info); + }); + // TODO(tulloch): should we instead make the staging buffer a property of the + // Stream? This would allow us to elide synchronizations here. 
+ VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); } else { LOG(FATAL) << "Expect copy from/to Vulkan or between Vulkan" << ", from=" << from_dev_type << ", to=" << to_dev_type; @@ -828,9 +822,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { vkDestroyShaderModule(vctx.device, pe->shader, nullptr); // UBO if (pe->ubo.vk_buf) { - if (pe->ubo.host_buf) { - vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); - } + vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); vkDestroyBuffer(vctx.device, pe->ubo.vk_buf->buffer, nullptr); vkFreeMemory(vctx.device, pe->ubo.vk_buf->memory, nullptr); delete pe->ubo.vk_buf; @@ -990,19 +982,8 @@ class VulkanModuleNode final : public runtime::ModuleNode { if (nbytes_scalars > max_push_constants_) { // Allocate, bind and map UBO UniformBuffer& ubo = pe->ubo; - // Find a memory type that supports UBO - auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); - auto mem_type_index = FindMemoryType(vctx, info, prop); - if (true || mem_type_index == -1) { - // If host visible memory is not found, use a normal storage buffer - auto usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - ubo.vk_buf = CreateBuffer(vctx, MakeBufferCreateInfo(vctx, nbytes_scalars, usage), - vctx.compute_mtype_index); - } else { - ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); - vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); - } + ubo.vk_buf = CreateBuffer(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); } if (vctx.UseImmediate()) { @@ -1165,6 +1146,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); bool use_ubo = num_pack_args_ != 0 && 
nbytes_scalars > m_->MaxPushConstantsSize(); if (use_ubo) { + CHECK(pipeline->ubo.host_buf) << "The UBO host buffer is not allocated"; VkDescriptorBufferInfo binfo; binfo.buffer = pipeline->ubo.vk_buf->buffer; binfo.offset = 0; @@ -1173,10 +1155,8 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. - if (use_ubo && pipeline->ubo.host_buf) { + if (use_ubo) { memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); - } else if (use_ubo) { - // TODO } VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); @@ -1229,19 +1209,11 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - bool coherent_staging = vctx.coherent_staging; - VkDevice device = vctx.device; - const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, nbytes_scalars, - device_id, coherent_staging, device](VulkanStreamState* state) { - if (use_ubo && pipeline->ubo.host_buf) { + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, + nbytes_scalars](VulkanStreamState* state) { + if (use_ubo) { memcpy(pipeline->ubo.host_buf, pack_args_storage.data(), nbytes_scalars); - } else if (use_ubo) { - // TODO(masahi): Is this ok - int cpu_device_id = 0; - CopyFromHostToDevice(device, pack_args_storage.data(), 0, pipeline->ubo.vk_buf, 0, - nbytes_scalars, cpu_device_id, device_id, coherent_staging); } - vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, From 4ff52ffc9efa2858be4d4bc0732e0cf765bae2d5 Mon 
Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 10:30:49 +0900 Subject: [PATCH 07/28] move VulkanStagingBuffer out of header --- src/runtime/vulkan/vulkan.cc | 8 ++++++++ src/runtime/vulkan/vulkan_common.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 89cd98e4c906..16c0656eca57 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -45,6 +45,14 @@ static constexpr const int kVulkanMaxNumDevice = 8; /*! \brief TVM Vulkan binary pack magic number */ static constexpr const int kVulkanModuleMagic = 0x02700027; +struct VulkanStagingBuffer { + VkDevice device{nullptr}; + VkBuffer buffer{VK_NULL_HANDLE}; + VkDeviceMemory memory{VK_NULL_HANDLE}; + void* host_addr{nullptr}; + size_t size{0}; +}; + class VulkanThreadEntry { public: VulkanThreadEntry(); diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index e94a9fe7fa90..422ec4d36aa4 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -105,14 +105,6 @@ struct VulkanGetBufferMemoryRequirements2Functions { PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR{nullptr}; }; -struct VulkanStagingBuffer { - VkDevice device{nullptr}; - VkBuffer buffer{VK_NULL_HANDLE}; - VkDeviceMemory memory{VK_NULL_HANDLE}; - void* host_addr{nullptr}; - size_t size{0}; -}; - struct VulkanContext { // phyiscal device VkPhysicalDevice phy_device{nullptr}; From 79a39355627841b2270331ae08577db530c4ae37 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 10:37:29 +0900 Subject: [PATCH 08/28] introduce VulkanHostvisibleBuffer --- src/runtime/vulkan/vulkan.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 16c0656eca57..f397da8caf55 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -45,7 
+45,7 @@ static constexpr const int kVulkanMaxNumDevice = 8; /*! \brief TVM Vulkan binary pack magic number */ static constexpr const int kVulkanModuleMagic = 0x02700027; -struct VulkanStagingBuffer { +struct VulkanHostVisibleBuffer { VkDevice device{nullptr}; VkBuffer buffer{VK_NULL_HANDLE}; VkDeviceMemory memory{VK_NULL_HANDLE}; @@ -53,6 +53,9 @@ struct VulkanStagingBuffer { size_t size{0}; }; +using VulkanStagingBuffer = VulkanHostVisibleBuffer; +using VulkanUniformBuffer = VulkanHostVisibleBuffer; + class VulkanThreadEntry { public: VulkanThreadEntry(); @@ -92,6 +95,7 @@ class VulkanThreadEntry { private: std::unordered_map> streams_; std::unordered_map> staging_buffers_; + std::unordered_map> uniform_buffers_; }; struct VulkanBuffer { @@ -848,7 +852,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { return cp; } // Create new pipeline - auto pe = std::shared_ptr(new VulkanPipeline()); + auto pe = std::make_shared(); { // create shader auto sit = smap_.find(func_name); @@ -1067,7 +1071,7 @@ VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return VulkanThreadStore:: VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size) { if (!staging_buffers_[device_id]) { - staging_buffers_[device_id] = std::unique_ptr(new VulkanStagingBuffer()); + staging_buffers_[device_id] = std::make_unique(); } auto& buf = *(staging_buffers_[device_id]); if (buf.device != nullptr && buf.size < size) { From a1b57b51d1f4d10b364ff738d6c47d2ada099141 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 10:45:06 +0900 Subject: [PATCH 09/28] refactor to use VulkanBuffer --- src/runtime/vulkan/vulkan.cc | 65 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index f397da8caf55..144618a89ac3 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -45,10 +45,20 @@ static constexpr const int 
kVulkanMaxNumDevice = 8; /*! \brief TVM Vulkan binary pack magic number */ static constexpr const int kVulkanModuleMagic = 0x02700027; -struct VulkanHostVisibleBuffer { - VkDevice device{nullptr}; +struct VulkanBuffer { VkBuffer buffer{VK_NULL_HANDLE}; VkDeviceMemory memory{VK_NULL_HANDLE}; +}; + +// To remove +struct UniformBuffer { + VulkanBuffer* vk_buf; + void* host_buf; +}; + +struct VulkanHostVisibleBuffer { + VkDevice device{nullptr}; + VulkanBuffer vk_buf; void* host_addr{nullptr}; size_t size{0}; }; @@ -76,13 +86,13 @@ class VulkanThreadEntry { } auto& buf = *(kv.second); if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.memory); + vkUnmapMemory(buf.device, buf.vk_buf.memory); } - if (buf.memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.memory, nullptr); + if (buf.vk_buf.memory != VK_NULL_HANDLE) { + vkFreeMemory(buf.device, buf.vk_buf.memory, nullptr); } - if (buf.buffer != VK_NULL_HANDLE) { - vkDestroyBuffer(buf.device, buf.buffer, nullptr); + if (buf.vk_buf.buffer != VK_NULL_HANDLE) { + vkDestroyBuffer(buf.device, buf.vk_buf.buffer, nullptr); } } } @@ -98,16 +108,6 @@ class VulkanThreadEntry { std::unordered_map> uniform_buffers_; }; -struct VulkanBuffer { - VkBuffer buffer{VK_NULL_HANDLE}; - VkDeviceMemory memory{VK_NULL_HANDLE}; -}; - -struct UniformBuffer { - VulkanBuffer* vk_buf; - void* host_buf; -}; - struct VulkanPipeline { VulkanContext* vctx_{nullptr}; VkShaderModule shader{VK_NULL_HANDLE}; @@ -303,14 +303,14 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.srcOffset = from_offset; copy_info.dstOffset = 0; copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->buffer, 1, ©_info); + vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->vk_buf.buffer, 1, ©_info); }); VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); if (!vctx.coherent_staging) { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mrange.pNext = 
nullptr; - mrange.memory = temp->memory; + mrange.memory = temp->vk_buf.memory; mrange.offset = 0; mrange.size = VK_WHOLE_SIZE; // size; VULKAN_CALL(vkInvalidateMappedMemoryRanges(vctx.device, 1, &mrange)); @@ -328,7 +328,7 @@ class VulkanDeviceAPI final : public DeviceAPI { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mrange.pNext = nullptr; - mrange.memory = temp->memory; + mrange.memory = temp->vk_buf.memory; mrange.offset = 0; mrange.size = VK_WHOLE_SIZE; // size; VULKAN_CALL(vkFlushMappedMemoryRanges(vctx.device, 1, &mrange)); @@ -351,7 +351,7 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.srcOffset = 0; copy_info.dstOffset = to_offset; copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, temp->buffer, to_buf->buffer, 1, ©_info); + vkCmdCopyBuffer(state->cmd_buffer_, temp->vk_buf.buffer, to_buf->buffer, 1, ©_info); }); // TODO(tulloch): should we instead make the staging buffer a property of the // Stream? This would allow us to elide synchronizations here. 
@@ -1077,24 +1077,23 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size if (buf.device != nullptr && buf.size < size) { // free previous buffer if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.memory); + vkUnmapMemory(buf.device, buf.vk_buf.memory); } - if (buf.memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.memory, nullptr); + if (buf.vk_buf.memory != VK_NULL_HANDLE) { + vkFreeMemory(buf.device, buf.vk_buf.memory, nullptr); } - if (buf.buffer != VK_NULL_HANDLE) { - vkDestroyBuffer(buf.device, buf.buffer, nullptr); + if (buf.vk_buf.buffer != VK_NULL_HANDLE) { + vkDestroyBuffer(buf.device, buf.vk_buf.buffer, nullptr); } buf.host_addr = nullptr; - buf.memory = VK_NULL_HANDLE; - buf.buffer = VK_NULL_HANDLE; + buf.vk_buf.memory = VK_NULL_HANDLE; } const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (buf.device == nullptr) { buf.device = vctx.device; } - if (buf.memory == VK_NULL_HANDLE) { + if (buf.vk_buf.memory == VK_NULL_HANDLE) { // allocate the stagging buffer memory if necessary VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; @@ -1105,15 +1104,15 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size info.pQueueFamilyIndices = &(vctx.queue_family_index); info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &(buf.buffer))); + VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &(buf.vk_buf.buffer))); VkMemoryAllocateInfo minfo; minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; minfo.pNext = nullptr; minfo.allocationSize = size; minfo.memoryTypeIndex = vctx.staging_mtype_index; - VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &(buf.memory))); - VULKAN_CALL(vkBindBufferMemory(vctx.device, (buf.buffer), buf.memory, 0)); - VULKAN_CALL(vkMapMemory(vctx.device, buf.memory, 0, size, 0, 
&(buf.host_addr))); + VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &(buf.vk_buf.memory))); + VULKAN_CALL(vkBindBufferMemory(vctx.device, (buf.vk_buf.buffer), buf.vk_buf.memory, 0)); + VULKAN_CALL(vkMapMemory(vctx.device, buf.vk_buf.memory, 0, size, 0, &(buf.host_addr))); buf.size = size; } memset(buf.host_addr, 0, size); From a72638623afb71a66ae6be7b131240702cd06eae Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 11:00:48 +0900 Subject: [PATCH 10/28] refactoring CreateBuffer and FindMemoryType --- src/runtime/vulkan/vulkan.cc | 48 ++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 144618a89ac3..7357bc234415 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -122,13 +122,16 @@ struct VulkanPipeline { typedef dmlc::ThreadLocalStore VulkanThreadStore; -uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, VkBuffer buffer, - VkMemoryPropertyFlags req_prop) { +int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, + VkMemoryPropertyFlags req_prop) { + VkBuffer buffer; + VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); + VkMemoryRequirements mem_reqs; - vkGetBufferMemoryRequirements(logical_device, buffer, &mem_reqs); + vkGetBufferMemoryRequirements(vctx.device, buffer, &mem_reqs); uint32_t type_bits = mem_reqs.memoryTypeBits; VkPhysicalDeviceMemoryProperties phy_mem_prop; - vkGetPhysicalDeviceMemoryProperties(phy_device, &phy_mem_prop); + vkGetPhysicalDeviceMemoryProperties(vctx.phy_device, &phy_mem_prop); for (uint32_t i = 0; i < phy_mem_prop.memoryTypeCount; i++) { if ((type_bits & 1) == 1 && (phy_mem_prop.memoryTypes[i].propertyFlags & req_prop) == req_prop) { @@ -136,11 +139,12 @@ uint32_t FindMemoryType(VkDevice logical_device, VkPhysicalDevice phy_device, Vk } type_bits >>= 1; } - LOG(FATAL) << "Requested memory type not found"; - 
return 0; + LOG(INFO) << "Requested memory type not found"; + return -1; } -VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage) { +VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes, + VkBufferUsageFlags usage) { VkBufferCreateInfo info; info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; info.pNext = nullptr; @@ -150,18 +154,15 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsa info.pQueueFamilyIndices = &(vctx.queue_family_index); info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; info.usage = usage; + return info; +} + +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, + uint32_t mem_type_index) { // create buffer VkBuffer buffer; VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); - uint32_t mem_type_index = vctx.compute_mtype_index; - - if (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - // Find a memory type that supports UBO - auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - mem_type_index = FindMemoryType(vctx.device, vctx.phy_device, buffer, prop); - } - // bind to memory bool dedicated_allocation = false; VkMemoryRequirements2KHR req2; @@ -191,7 +192,7 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsa VkMemoryAllocateInfo minfo; minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; minfo.pNext = nullptr; - minfo.allocationSize = nbytes; + minfo.allocationSize = info.size; minfo.memoryTypeIndex = mem_type_index; VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &memory)); } else { @@ -238,7 +239,8 @@ class VulkanDeviceAPI final : public DeviceAPI { const auto& vctx = context(dev.device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - return CreateBuffer(vctx, nbytes, usage); + auto info = MakeBufferCreateInfo(vctx, nbytes, usage); + return 
CreateBuffer(vctx, info, vctx.compute_mtype_index); } void FreeDataSpace(Device dev, void* ptr) final { @@ -994,8 +996,16 @@ class VulkanModuleNode final : public runtime::ModuleNode { if (nbytes_scalars > max_push_constants_) { // Allocate, bind and map UBO UniformBuffer& ubo = pe->ubo; - ubo.vk_buf = CreateBuffer(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); - vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); + // Find a memory type that supports UBO + auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + auto mem_type_index = FindMemoryType(vctx, info, prop); + if (mem_type_index == -1) { + ubo.vk_buf = CreateBuffer(vctx, info, vctx.compute_mtype_index); + } else { + ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); + vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); + } } if (vctx.UseImmediate()) { From 9441b7e58df47e8dd9bc15c45788f72751616daf Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 11:28:33 +0900 Subject: [PATCH 11/28] refactor staging buffer alloc --- src/runtime/vulkan/vulkan.cc | 81 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 7357bc234415..3a0749f22f17 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -58,7 +58,7 @@ struct UniformBuffer { struct VulkanHostVisibleBuffer { VkDevice device{nullptr}; - VulkanBuffer vk_buf; + VulkanBuffer* vk_buf{nullptr}; void* host_addr{nullptr}; size_t size{0}; }; @@ -85,14 +85,17 @@ class VulkanThreadEntry { continue; } auto& buf = *(kv.second); - if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.vk_buf.memory); - } - if (buf.vk_buf.memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.vk_buf.memory, nullptr); - } 
- if (buf.vk_buf.buffer != VK_NULL_HANDLE) { - vkDestroyBuffer(buf.device, buf.vk_buf.buffer, nullptr); + if (buf.vk_buf) { + if (buf.host_addr != nullptr) { + vkUnmapMemory(buf.device, buf.vk_buf->memory); + } + if (buf.vk_buf->memory != VK_NULL_HANDLE) { + vkFreeMemory(buf.device, buf.vk_buf->memory, nullptr); + } + if (buf.vk_buf->buffer != VK_NULL_HANDLE) { + vkDestroyBuffer(buf.device, buf.vk_buf->buffer, nullptr); + } + delete buf.vk_buf; } } } @@ -217,6 +220,12 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, return pbuf; } +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage, + uint32_t mem_type_index) { + auto info = MakeBufferCreateInfo(vctx, nbytes, usage); + return CreateBuffer(vctx, info, mem_type_index); +} + class VulkanDeviceAPI final : public DeviceAPI { public: VulkanDeviceAPI(); @@ -239,8 +248,7 @@ class VulkanDeviceAPI final : public DeviceAPI { const auto& vctx = context(dev.device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - auto info = MakeBufferCreateInfo(vctx, nbytes, usage); - return CreateBuffer(vctx, info, vctx.compute_mtype_index); + return CreateBuffer(vctx, nbytes, usage, vctx.staging_mtype_index); } void FreeDataSpace(Device dev, void* ptr) final { @@ -305,14 +313,15 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.srcOffset = from_offset; copy_info.dstOffset = 0; copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->vk_buf.buffer, 1, ©_info); + vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, temp->vk_buf->buffer, 1, + ©_info); }); VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); if (!vctx.coherent_staging) { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mrange.pNext = nullptr; - mrange.memory = temp->vk_buf.memory; + mrange.memory = temp->vk_buf->memory; mrange.offset 
= 0; mrange.size = VK_WHOLE_SIZE; // size; VULKAN_CALL(vkInvalidateMappedMemoryRanges(vctx.device, 1, &mrange)); @@ -330,7 +339,7 @@ class VulkanDeviceAPI final : public DeviceAPI { VkMappedMemoryRange mrange; mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mrange.pNext = nullptr; - mrange.memory = temp->vk_buf.memory; + mrange.memory = temp->vk_buf->memory; mrange.offset = 0; mrange.size = VK_WHOLE_SIZE; // size; VULKAN_CALL(vkFlushMappedMemoryRanges(vctx.device, 1, &mrange)); @@ -353,7 +362,8 @@ class VulkanDeviceAPI final : public DeviceAPI { copy_info.srcOffset = 0; copy_info.dstOffset = to_offset; copy_info.size = size; - vkCmdCopyBuffer(state->cmd_buffer_, temp->vk_buf.buffer, to_buf->buffer, 1, ©_info); + vkCmdCopyBuffer(state->cmd_buffer_, temp->vk_buf->buffer, to_buf->buffer, 1, + ©_info); }); // TODO(tulloch): should we instead make the staging buffer a property of the // Stream? This would allow us to elide synchronizations here. @@ -1085,46 +1095,35 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size } auto& buf = *(staging_buffers_[device_id]); if (buf.device != nullptr && buf.size < size) { + ICHECK(buf.vk_buf); // free previous buffer if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.vk_buf.memory); + vkUnmapMemory(buf.device, buf.vk_buf->memory); } - if (buf.vk_buf.memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.vk_buf.memory, nullptr); + if (buf.vk_buf->memory != VK_NULL_HANDLE) { + vkFreeMemory(buf.device, buf.vk_buf->memory, nullptr); } - if (buf.vk_buf.buffer != VK_NULL_HANDLE) { - vkDestroyBuffer(buf.device, buf.vk_buf.buffer, nullptr); + if (buf.vk_buf->buffer != VK_NULL_HANDLE) { + vkDestroyBuffer(buf.device, buf.vk_buf->buffer, nullptr); } buf.host_addr = nullptr; - buf.vk_buf.memory = VK_NULL_HANDLE; + delete buf.vk_buf; } + const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); if (buf.device == nullptr) { buf.device = vctx.device; } - if (buf.vk_buf.memory == 
VK_NULL_HANDLE) { - // allocate the stagging buffer memory if necessary - VkBufferCreateInfo info; - info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - info.pNext = nullptr; - info.flags = 0; - info.size = size; - info.queueFamilyIndexCount = 1; - info.pQueueFamilyIndices = &(vctx.queue_family_index); - info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &(buf.vk_buf.buffer))); - VkMemoryAllocateInfo minfo; - minfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; - minfo.pNext = nullptr; - minfo.allocationSize = size; - minfo.memoryTypeIndex = vctx.staging_mtype_index; - VULKAN_CALL(vkAllocateMemory(vctx.device, &minfo, nullptr, &(buf.vk_buf.memory))); - VULKAN_CALL(vkBindBufferMemory(vctx.device, (buf.vk_buf.buffer), buf.vk_buf.memory, 0)); - VULKAN_CALL(vkMapMemory(vctx.device, buf.vk_buf.memory, 0, size, 0, &(buf.host_addr))); + if (buf.host_addr == nullptr) { + // allocate the staging buffer memory if necessary + auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + buf.vk_buf = CreateBuffer(vctx, size, usage, vctx.staging_mtype_index); + VULKAN_CALL(vkMapMemory(vctx.device, buf.vk_buf->memory, 0, size, 0, &(buf.host_addr))); buf.size = size; } + + ICHECK(buf.size >= size); memset(buf.host_addr, 0, size); return &buf; } From dad2a177145412a1051c5e444e871e318178d9e9 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 11:51:26 +0900 Subject: [PATCH 12/28] refactor staging buffer realloc --- src/runtime/vulkan/vulkan.cc | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 3a0749f22f17..053b44e261c4 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -104,6 +104,7 @@ class VulkanThreadEntry { std::unique_ptr pool; VulkanStream* Stream(size_t 
device_id); VulkanStagingBuffer* StagingBuffer(int device_id, size_t size); + void AllocateUniformBuffer(int device_id, size_t size); private: std::unordered_map> streams_; @@ -1089,11 +1090,13 @@ Module VulkanModuleCreate(std::unordered_map smap, VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return VulkanThreadStore::Get(); } -VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size) { - if (!staging_buffers_[device_id]) { - staging_buffers_[device_id] = std::make_unique(); +VulkanHostVisibleBuffer* GetOrAllocate( + int device_id, size_t size, VkBufferUsageFlags usage, uint32_t mem_type_index, + std::unordered_map>& buffers) { + if (!buffers[device_id]) { + buffers[device_id] = std::make_unique(); } - auto& buf = *(staging_buffers_[device_id]); + auto& buf = *(buffers[device_id]); if (buf.device != nullptr && buf.size < size) { ICHECK(buf.vk_buf); // free previous buffer @@ -1116,18 +1119,21 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size buf.device = vctx.device; } if (buf.host_addr == nullptr) { - // allocate the staging buffer memory if necessary - auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - buf.vk_buf = CreateBuffer(vctx, size, usage, vctx.staging_mtype_index); + buf.vk_buf = CreateBuffer(vctx, size, usage, mem_type_index); VULKAN_CALL(vkMapMemory(vctx.device, buf.vk_buf->memory, 0, size, 0, &(buf.host_addr))); buf.size = size; } - - ICHECK(buf.size >= size); - memset(buf.host_addr, 0, size); return &buf; } +VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size) { + const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); + auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + auto buf = GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, staging_buffers_); + memset(buf->host_addr, 0, size); + return buf; +} + VulkanThreadEntry::VulkanThreadEntry() : 
pool(std::make_unique(static_cast(kDLVulkan), VulkanDeviceAPI::Global())) { From cd0ae060d9f0b912bce3e878d511ed4de96382ed Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 11:56:23 +0900 Subject: [PATCH 13/28] add AllocateUniformBuffer --- src/runtime/vulkan/vulkan.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 053b44e261c4..0e1fd9df9353 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -1134,6 +1134,15 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size return buf; } +void VulkanThreadEntry::AllocateUniformBuffer(int device_id, size_t size) { + const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); + auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + auto info = MakeBufferCreateInfo(vctx, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + auto mem_type_index = FindMemoryType(vctx, info, prop); + GetOrAllocate(device_id, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, mem_type_index, + uniform_buffers_); +} + VulkanThreadEntry::VulkanThreadEntry() : pool(std::make_unique(static_cast(kDLVulkan), VulkanDeviceAPI::Global())) { From 2f156101a479ec9b01b42e2bbe21ad4f164f843b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 12:07:41 +0900 Subject: [PATCH 14/28] working --- src/runtime/vulkan/vulkan.cc | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 0e1fd9df9353..3feb714248ea 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -105,6 +105,7 @@ class VulkanThreadEntry { VulkanStream* Stream(size_t device_id); VulkanStagingBuffer* StagingBuffer(int device_id, size_t size); void AllocateUniformBuffer(int device_id, size_t size); + VulkanUniformBuffer* GetUniformBuffer(int device_id, size_t size); private: 
std::unordered_map> streams_; @@ -1006,17 +1007,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { if (nbytes_scalars > max_push_constants_) { // Allocate, bind and map UBO - UniformBuffer& ubo = pe->ubo; - // Find a memory type that supports UBO - auto prop = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - auto info = MakeBufferCreateInfo(vctx, nbytes_scalars, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); - auto mem_type_index = FindMemoryType(vctx, info, prop); - if (mem_type_index == -1) { - ubo.vk_buf = CreateBuffer(vctx, info, vctx.compute_mtype_index); - } else { - ubo.vk_buf = CreateBuffer(vctx, info, mem_type_index); - vkMapMemory(vctx.device, ubo.vk_buf->memory, 0, nbytes_scalars, 0, &(ubo.host_buf)); - } + VulkanThreadEntry::ThreadLocal()->AllocateUniformBuffer(device_id, nbytes_scalars); } if (vctx.UseImmediate()) { @@ -1143,6 +1134,13 @@ void VulkanThreadEntry::AllocateUniformBuffer(int device_id, size_t size) { uniform_buffers_); } +VulkanUniformBuffer* VulkanThreadEntry::GetUniformBuffer(int device_id, size_t size) { + auto& buf = uniform_buffers_[device_id]; + ICHECK(buf); + ICHECK_GE(buf->size, size); + return buf.get(); +} + VulkanThreadEntry::VulkanThreadEntry() : pool(std::make_unique(static_cast(kDLVulkan), VulkanDeviceAPI::Global())) { @@ -1181,9 +1179,10 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > m_->MaxPushConstantsSize(); if (use_ubo) { - CHECK(pipeline->ubo.host_buf) << "The UBO host buffer is not allocated"; + auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); + CHECK(ubo->host_addr) << "The UBO host buffer is not allocated"; VkDescriptorBufferInfo binfo; - binfo.buffer = pipeline->ubo.vk_buf->buffer; + binfo.buffer = ubo->vk_buf->buffer; binfo.offset = 0; binfo.range = VK_WHOLE_SIZE; descriptor_buffers.push_back(binfo); 
@@ -1191,7 +1190,8 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. if (use_ubo) { - memcpy(pipeline->ubo.host_buf, pack_args, nbytes_scalars); + auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); + memcpy(ubo->host_addr, pack_args, nbytes_scalars); } VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); @@ -1245,9 +1245,10 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, 0, 0); }; const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, - nbytes_scalars](VulkanStreamState* state) { + nbytes_scalars, device_id](VulkanStreamState* state) { if (use_ubo) { - memcpy(pipeline->ubo.host_buf, pack_args_storage.data(), nbytes_scalars); + auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); + memcpy(ubo->host_addr, pack_args_storage.data(), nbytes_scalars); } vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, From 895f9650e5ad866bdd57e951327ca5fa8819012a Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 12:10:30 +0900 Subject: [PATCH 15/28] remove UniformBuffer from pipeline --- src/runtime/vulkan/vulkan.cc | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 3feb714248ea..fd6b9a27dd3d 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -50,12 +50,6 @@ struct VulkanBuffer { VkDeviceMemory memory{VK_NULL_HANDLE}; }; -// To remove -struct UniformBuffer { - VulkanBuffer* vk_buf; - void* host_buf; -}; - struct VulkanHostVisibleBuffer { 
VkDevice device{nullptr}; VulkanBuffer* vk_buf{nullptr}; @@ -122,7 +116,6 @@ struct VulkanPipeline { VkPipelineLayout pipeline_layout{VK_NULL_HANDLE}; VkPipeline pipeline{VK_NULL_HANDLE}; VkDescriptorUpdateTemplateKHR descriptor_update_template{VK_NULL_HANDLE}; - UniformBuffer ubo; }; typedef dmlc::ThreadLocalStore VulkanThreadStore; @@ -846,13 +839,6 @@ class VulkanModuleNode final : public runtime::ModuleNode { vkDestroyDescriptorPool(vctx.device, pe->descriptor_pool, nullptr); vkDestroyDescriptorSetLayout(vctx.device, pe->descriptor_set_layout, nullptr); vkDestroyShaderModule(vctx.device, pe->shader, nullptr); - // UBO - if (pe->ubo.vk_buf) { - vkUnmapMemory(vctx.device, pe->ubo.vk_buf->memory); - vkDestroyBuffer(vctx.device, pe->ubo.vk_buf->buffer, nullptr); - vkFreeMemory(vctx.device, pe->ubo.vk_buf->memory, nullptr); - delete pe->ubo.vk_buf; - } } } } @@ -1244,8 +1230,8 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, - nbytes_scalars, device_id](VulkanStreamState* state) { + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, nbytes_scalars, + device_id](VulkanStreamState* state) { if (use_ubo) { auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); memcpy(ubo->host_addr, pack_args_storage.data(), nbytes_scalars); From 04f14924f6556d1350ce265d432a071fa94639d5 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 12:12:32 +0900 Subject: [PATCH 16/28] remove CreateBuffer overload --- src/runtime/vulkan/vulkan.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index fd6b9a27dd3d..8b4ec38052f8 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -155,8 +155,9 @@ 
VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes return info; } -VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, +VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage, uint32_t mem_type_index) { + auto info = MakeBufferCreateInfo(vctx, nbytes, usage); // create buffer VkBuffer buffer; VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); @@ -215,12 +216,6 @@ VulkanBuffer* CreateBuffer(const VulkanContext& vctx, VkBufferCreateInfo info, return pbuf; } -VulkanBuffer* CreateBuffer(const VulkanContext& vctx, size_t nbytes, VkBufferUsageFlags usage, - uint32_t mem_type_index) { - auto info = MakeBufferCreateInfo(vctx, nbytes, usage); - return CreateBuffer(vctx, info, mem_type_index); -} - class VulkanDeviceAPI final : public DeviceAPI { public: VulkanDeviceAPI(); From ff0f2659ebe4fca00457f312f920864786030aaf Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 12:36:27 +0900 Subject: [PATCH 17/28] clean up delete --- src/runtime/vulkan/vulkan.cc | 53 ++++++++++++++---------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 8b4ec38052f8..c79e5aeddf07 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -60,6 +60,22 @@ struct VulkanHostVisibleBuffer { using VulkanStagingBuffer = VulkanHostVisibleBuffer; using VulkanUniformBuffer = VulkanHostVisibleBuffer; +void DeleteHostVisibleBuffer(VulkanHostVisibleBuffer* buf) { + if (buf && buf->vk_buf) { + if (buf->host_addr != nullptr) { + vkUnmapMemory(buf->device, buf->vk_buf->memory); + } + if (buf->vk_buf->memory != VK_NULL_HANDLE) { + vkFreeMemory(buf->device, buf->vk_buf->memory, nullptr); + } + if (buf->vk_buf->buffer != VK_NULL_HANDLE) { + vkDestroyBuffer(buf->device, buf->vk_buf->buffer, nullptr); + } + buf->host_addr = nullptr; + delete buf->vk_buf; + } +} + class 
VulkanThreadEntry { public: VulkanThreadEntry(); @@ -75,22 +91,7 @@ class VulkanThreadEntry { pool.reset(); streams_.clear(); for (const auto& kv : staging_buffers_) { - if (!kv.second) { - continue; - } - auto& buf = *(kv.second); - if (buf.vk_buf) { - if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.vk_buf->memory); - } - if (buf.vk_buf->memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.vk_buf->memory, nullptr); - } - if (buf.vk_buf->buffer != VK_NULL_HANDLE) { - vkDestroyBuffer(buf.device, buf.vk_buf->buffer, nullptr); - } - delete buf.vk_buf; - } + DeleteHostVisibleBuffer(kv.second.get()); } } @@ -903,7 +904,9 @@ class VulkanModuleNode final : public runtime::ModuleNode { size_t nbytes_scalars = num_pod * sizeof(ArgUnion64); if (nbytes_scalars > max_push_constants_) { + // Use UBO instead of push constants push_arg_info(num_buffer, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); + VulkanThreadEntry::ThreadLocal()->AllocateUniformBuffer(device_id, nbytes_scalars); } { @@ -986,11 +989,6 @@ class VulkanModuleNode final : public runtime::ModuleNode { VULKAN_CALL(vkCreateComputePipelines(vctx.device, VK_NULL_HANDLE, 1, &pipeline_cinfo, nullptr, &(pe->pipeline))); - if (nbytes_scalars > max_push_constants_) { - // Allocate, bind and map UBO - VulkanThreadEntry::ThreadLocal()->AllocateUniformBuffer(device_id, nbytes_scalars); - } - if (vctx.UseImmediate()) { VkDescriptorUpdateTemplateCreateInfoKHR descrip_template_cinfo; descrip_template_cinfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; @@ -1070,19 +1068,8 @@ VulkanHostVisibleBuffer* GetOrAllocate( } auto& buf = *(buffers[device_id]); if (buf.device != nullptr && buf.size < size) { - ICHECK(buf.vk_buf); // free previous buffer - if (buf.host_addr != nullptr) { - vkUnmapMemory(buf.device, buf.vk_buf->memory); - } - if (buf.vk_buf->memory != VK_NULL_HANDLE) { - vkFreeMemory(buf.device, buf.vk_buf->memory, nullptr); - } - if (buf.vk_buf->buffer != VK_NULL_HANDLE) { - 
vkDestroyBuffer(buf.device, buf.vk_buf->buffer, nullptr); - } - buf.host_addr = nullptr; - delete buf.vk_buf; + DeleteHostVisibleBuffer(&buf); } const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); From c01d760801debf58158da241f6eb084a42d93bd7 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 21:13:09 +0900 Subject: [PATCH 18/28] return VulkanShader from codegen --- src/target/spirv/build_vulkan.cc | 5 ++--- src/target/spirv/codegen_spirv.cc | 8 +++++--- src/target/spirv/codegen_spirv.h | 3 ++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/target/spirv/build_vulkan.cc b/src/target/spirv/build_vulkan.cc index a0f0b76eefbd..9f9718bef18e 100644 --- a/src/target/spirv/build_vulkan.cc +++ b/src/target/spirv/build_vulkan.cc @@ -88,10 +88,9 @@ runtime::Module BuildSPIRV(IRModule mod, Target target, bool webgpu_restriction) << "CodeGenSPIRV: Expect PrimFunc to have the global_symbol attribute"; std::string f_name = global_symbol.value(); - - VulkanShader shader; std::string entry = webgpu_restriction ? 
"main" : f_name; - shader.data = cg.BuildFunction(f, entry); + + VulkanShader shader = cg.BuildFunction(f, entry); if (webgpu_restriction) { for (auto param : f->params) { diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 4d55f4c49a5f..4ef3edadc195 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -32,11 +32,12 @@ #include "../../runtime/pack_args.h" #include "../../runtime/vulkan/vulkan_common.h" +#include "../../runtime/vulkan/vulkan_shader.h" namespace tvm { namespace codegen { -std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::string& name) { +runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::string& name) { this->InitFuncState(); ICHECK(f->HasNonzeroAttr(tir::attr::kNoAlias)) << "SPIRV only takes restricted memory model"; std::vector pod_args; @@ -97,8 +98,9 @@ std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: builder_->MakeInst(spv::OpFunctionEnd); builder_->CommitKernelFunction(func_ptr, name); - - return builder_->Finalize(); + runtime::VulkanShader shader; + shader.data = builder_->Finalize(); + return shader; } void CodeGenSPIRV::InitFuncState() { diff --git a/src/target/spirv/codegen_spirv.h b/src/target/spirv/codegen_spirv.h index 1e80fcc4a931..e3d6c153d06f 100644 --- a/src/target/spirv/codegen_spirv.h +++ b/src/target/spirv/codegen_spirv.h @@ -36,6 +36,7 @@ #include #include "../../runtime/thread_storage_scope.h" +#include "../../runtime/vulkan/vulkan_shader.h" #include "ir_builder.h" namespace tvm { @@ -55,7 +56,7 @@ class CodeGenSPIRV : public ExprFunctor, * \param name The name of the target function. * \return The final spirv module. */ - virtual std::vector BuildFunction(const PrimFunc& f, const std::string& name); + virtual runtime::VulkanShader BuildFunction(const PrimFunc& f, const std::string& name); /*! * \brief Create Value for expression e * \param e The expression to be created value for. 
From 822b232baeddd8b87dc7e428cbcfe3bb853eb70d Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 21:21:26 +0900 Subject: [PATCH 19/28] remove GetMaxPushConstantsSize() --- src/runtime/vulkan/vulkan.cc | 22 ++++++---------------- src/runtime/vulkan/vulkan_common.h | 5 ++--- src/target/spirv/codegen_spirv.cc | 7 ++++--- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index c79e5aeddf07..956369240e71 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -801,7 +801,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { public: explicit VulkanModuleNode(std::unordered_map smap, std::unordered_map fmap, std::string source) - : smap_(smap), fmap_(fmap), source_(source), max_push_constants_(GetMaxPushConstantsSize()) {} + : smap_(smap), fmap_(fmap), source_(source) {} const char* type_key() const final { return "vulkan"; } @@ -903,7 +903,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { } size_t nbytes_scalars = num_pod * sizeof(ArgUnion64); - if (nbytes_scalars > max_push_constants_) { + if (nbytes_scalars > kMaxPushConstantsBytes) { // Use UBO instead of push constants push_arg_info(num_buffer, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); VulkanThreadEntry::ThreadLocal()->AllocateUniformBuffer(device_id, nbytes_scalars); @@ -960,7 +960,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { playout_cinfo.setLayoutCount = 1; playout_cinfo.pSetLayouts = &(pe->descriptor_set_layout); - if (0 < nbytes_scalars && nbytes_scalars <= max_push_constants_) { + if (0 < nbytes_scalars && nbytes_scalars <= kMaxPushConstantsBytes) { playout_cinfo.pushConstantRangeCount = 1; playout_cinfo.pPushConstantRanges = &crange; ICHECK_LE(crange.size, vctx.phy_device_prop.limits.maxPushConstantsSize); @@ -1032,8 +1032,6 @@ class VulkanModuleNode final : public runtime::ModuleNode { return source_; } - uint32_t MaxPushConstantsSize() const { return 
max_push_constants_; } - private: // function information table. std::unordered_map smap_; @@ -1043,8 +1041,6 @@ class VulkanModuleNode final : public runtime::ModuleNode { std::string fmt_{"vulkan"}; // The source std::string source_; - // The maximum size of push constants in bytes - const uint32_t max_push_constants_; // Guards accesses to `ecache_` std::mutex mutex_; @@ -1145,7 +1141,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, descriptor_buffers[i] = binfo; } const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); - bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > m_->MaxPushConstantsSize(); + bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > kMaxPushConstantsBytes; if (use_ubo) { auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); CHECK(ubo->host_addr) << "The UBO host buffer is not allocated"; @@ -1167,7 +1163,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vctx.descriptor_template_khr_functions->vkCmdPushDescriptorSetWithTemplateKHR( state->cmd_buffer_, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptor_buffers.data()); - if (num_pack_args_ > 0 && num_pack_args_ <= m_->MaxPushConstantsSize()) { + if (num_pack_args_ > 0 && num_pack_args_ <= kMaxPushConstantsBytes) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64), pack_args); @@ -1222,7 +1218,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, nullptr); - if (num_pack_args_ > 0 && num_pack_args_ <= m_->MaxPushConstantsSize()) { + if (num_pack_args_ > 0 && num_pack_args_ <= kMaxPushConstantsBytes) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, pack_args_storage.size() * 
sizeof(ArgUnion64), pack_args_storage.data()); @@ -1277,12 +1273,6 @@ Module VulkanModuleLoadBinary(void* strm) { return VulkanModuleCreate(smap, fmap, ""); } -uint32_t GetMaxPushConstantsSize() { - int device_id = VulkanThreadEntry::ThreadLocal()->device.device_id; - const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); - return vctx.phy_device_prop.limits.maxPushConstantsSize; -} - TVM_REGISTER_GLOBAL("runtime.module.loadfile_vulkan").set_body_typed(VulkanModuleLoadFile); TVM_REGISTER_GLOBAL("runtime.module.loadbinary_vulkan").set_body_typed(VulkanModuleLoadBinary); diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 422ec4d36aa4..fbd8aaf7cd04 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -35,6 +35,8 @@ namespace tvm { namespace runtime { namespace vulkan { +#define kMaxPushConstantsBytes 128 + inline const char* VKGetErrorString(VkResult error) { switch (error) { case VK_SUCCESS: @@ -134,9 +136,6 @@ struct VulkanContext { bool UseImmediate() const { return descriptor_template_khr_functions.get() != nullptr; } }; -/*! 
\brief returns maximum push constant sizes in bytes for the target platform */ -uint32_t GetMaxPushConstantsSize(); - } // namespace vulkan } // namespace runtime } // namespace tvm diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 4ef3edadc195..6e0d6cb60f56 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -70,13 +70,14 @@ runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: spirv::Value func_ptr = builder_->NewFunction(); builder_->StartFunction(func_ptr); + runtime::VulkanShader shader; + if (pod_args.size() != 0) { std::vector value_types; for (size_t i = 0; i < pod_args.size(); ++i) { value_types.push_back(builder_->GetSType(pod_args[i].dtype())); } - const auto max_push_constants = runtime::vulkan::GetMaxPushConstantsSize(); - if (pod_args.size() * sizeof(runtime::ArgUnion64) <= max_push_constants) { + if (pod_args.size() * sizeof(runtime::ArgUnion64) <= kMaxPushConstantsBytes) { spirv::Value ptr = builder_->DeclarePushConstant(value_types); for (size_t i = 0; i < pod_args.size(); ++i) { spirv::Value value = @@ -98,7 +99,7 @@ runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: builder_->MakeInst(spv::OpFunctionEnd); builder_->CommitKernelFunction(func_ptr, name); - runtime::VulkanShader shader; + shader.data = builder_->Finalize(); return shader; } From 8ed7d74ec688ad9fe2d37c37f0055f890062e841 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Mon, 12 Apr 2021 21:37:19 +0900 Subject: [PATCH 20/28] check use_ubo flag in runtime --- src/runtime/vulkan/vulkan.cc | 23 ++++++++++++----------- src/runtime/vulkan/vulkan_common.h | 2 ++ src/target/spirv/codegen_spirv.cc | 1 + 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 956369240e71..90ce0c6855a1 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -117,6 +117,7 @@ struct 
VulkanPipeline { VkPipelineLayout pipeline_layout{VK_NULL_HANDLE}; VkPipeline pipeline{VK_NULL_HANDLE}; VkDescriptorUpdateTemplateKHR descriptor_update_template{VK_NULL_HANDLE}; + bool use_ubo{false}; }; typedef dmlc::ThreadLocalStore VulkanThreadStore; @@ -853,6 +854,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { // create shader auto sit = smap_.find(func_name); ICHECK(sit != smap_.end()); + pe->use_ubo = sit->second.flag & (1 << kUSE_UBO); const std::vector& data = sit->second.data; VkShaderModuleCreateInfo shader_cinfo; shader_cinfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; @@ -903,7 +905,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { } size_t nbytes_scalars = num_pod * sizeof(ArgUnion64); - if (nbytes_scalars > kMaxPushConstantsBytes) { + if (pe->use_ubo) { // Use UBO instead of push constants push_arg_info(num_buffer, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER); VulkanThreadEntry::ThreadLocal()->AllocateUniformBuffer(device_id, nbytes_scalars); @@ -960,7 +962,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { playout_cinfo.setLayoutCount = 1; playout_cinfo.pSetLayouts = &(pe->descriptor_set_layout); - if (0 < nbytes_scalars && nbytes_scalars <= kMaxPushConstantsBytes) { + if (0 < nbytes_scalars && !pe->use_ubo) { playout_cinfo.pushConstantRangeCount = 1; playout_cinfo.pPushConstantRanges = &crange; ICHECK_LE(crange.size, vctx.phy_device_prop.limits.maxPushConstantsSize); @@ -1141,8 +1143,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, descriptor_buffers[i] = binfo; } const size_t nbytes_scalars = num_pack_args_ * sizeof(ArgUnion64); - bool use_ubo = num_pack_args_ != 0 && nbytes_scalars > kMaxPushConstantsBytes; - if (use_ubo) { + if (pipeline->use_ubo) { auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); CHECK(ubo->host_addr) << "The UBO host buffer is not allocated"; VkDescriptorBufferInfo binfo; @@ -1153,7 +1154,7 @@ void 
VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. - if (use_ubo) { + if (pipeline->use_ubo) { auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); memcpy(ubo->host_addr, pack_args, nbytes_scalars); } @@ -1163,7 +1164,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vctx.descriptor_template_khr_functions->vkCmdPushDescriptorSetWithTemplateKHR( state->cmd_buffer_, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptor_buffers.data()); - if (num_pack_args_ > 0 && num_pack_args_ <= kMaxPushConstantsBytes) { + if (num_pack_args_ > 0 && !pipeline->use_ubo) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64), pack_args); @@ -1184,7 +1185,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, // Otherwise, the more expensive deferred path. 
std::vector pack_args_storage(pack_args, pack_args + num_pack_args_); - const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers, use_ubo]() { + const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers]() { std::vector write_descriptor_sets; write_descriptor_sets.resize(descriptor_buffers.size()); for (size_t i = 0; i < write_descriptor_sets.size(); i++) { @@ -1198,7 +1199,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, write_descriptor_sets[i].pBufferInfo = &(descriptor_buffers[i]); write_descriptor_sets[i].pTexelBufferView = 0; - if (use_ubo && i == write_descriptor_sets.size() - 1) { + if (pipeline->use_ubo && i == write_descriptor_sets.size() - 1) { // The last binding is for UBO write_descriptor_sets[i].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; } else { @@ -1208,9 +1209,9 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkUpdateDescriptorSets(vctx.device, write_descriptor_sets.size(), write_descriptor_sets.data(), 0, 0); }; - const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, use_ubo, nbytes_scalars, + const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, nbytes_scalars, device_id](VulkanStreamState* state) { - if (use_ubo) { + if (pipeline->use_ubo) { auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); memcpy(ubo->host_addr, pack_args_storage.data(), nbytes_scalars); } @@ -1218,7 +1219,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, nullptr); - if (num_pack_args_ > 0 && num_pack_args_ <= kMaxPushConstantsBytes) { + if (num_pack_args_ > 0 && !pipeline->use_ubo) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, pack_args_storage.size() * sizeof(ArgUnion64), pack_args_storage.data()); diff --git 
a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index fbd8aaf7cd04..c0105d45695b 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -37,6 +37,8 @@ namespace vulkan { #define kMaxPushConstantsBytes 128 +enum ShaderMetaDataKind { kUSE_UBO = 0 }; + inline const char* VKGetErrorString(VkResult error) { switch (error) { case VK_SUCCESS: diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 6e0d6cb60f56..72ce1debe533 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -85,6 +85,7 @@ runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: var_map_[pod_args[i].get()] = value; } } else { + shader.flag |= 1 << runtime::vulkan::kUSE_UBO; // If we need to pass more arguments than push constants could handle, we use UBO. spirv::Value ptr = builder_->DeclareUniformBuffer(value_types, num_buffer); for (size_t i = 0; i < pod_args.size(); ++i) { From 3dfafddb505f11a2caddf4cad466779c66e55045 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 13 Apr 2021 07:35:31 +0900 Subject: [PATCH 21/28] remove memset --- src/runtime/vulkan/vulkan.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 90ce0c6855a1..fdd8d6b1789b 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -1087,7 +1087,6 @@ VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; auto buf = GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, staging_buffers_); - memset(buf->host_addr, 0, size); return buf; } From 8e1d68a397e28bdc62d2749aeed57fb03e31a718 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 13 Apr 2021 13:04:29 +0900 Subject: [PATCH 22/28] fix cpplint --- 
src/runtime/vulkan/vulkan.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index fdd8d6b1789b..123ce256c801 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -1060,7 +1060,8 @@ VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return VulkanThreadStore:: VulkanHostVisibleBuffer* GetOrAllocate( int device_id, size_t size, VkBufferUsageFlags usage, uint32_t mem_type_index, - std::unordered_map>& buffers) { + std::unordered_map>* buffers_ptr) { + auto& buffers = *buffers_ptr; if (!buffers[device_id]) { buffers[device_id] = std::make_unique(); } @@ -1086,7 +1087,7 @@ VulkanHostVisibleBuffer* GetOrAllocate( VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size) { const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - auto buf = GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, staging_buffers_); + auto buf = GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, &staging_buffers_); return buf; } @@ -1096,7 +1097,7 @@ void VulkanThreadEntry::AllocateUniformBuffer(int device_id, size_t size) { auto info = MakeBufferCreateInfo(vctx, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); auto mem_type_index = FindMemoryType(vctx, info, prop); GetOrAllocate(device_id, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, mem_type_index, - uniform_buffers_); + &uniform_buffers_); } VulkanUniformBuffer* VulkanThreadEntry::GetUniformBuffer(int device_id, size_t size) { From 056e1444fcaea818e17ad9f189a8d95bdb55f62c Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 13 Apr 2021 16:55:59 +0900 Subject: [PATCH 23/28] LOG FATAL if no ubo capable memory is found --- src/runtime/vulkan/vulkan.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 
123ce256c801..af416606efd8 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -122,8 +122,8 @@ struct VulkanPipeline { typedef dmlc::ThreadLocalStore VulkanThreadStore; -int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, - VkMemoryPropertyFlags req_prop) { +uint32_t FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, + VkMemoryPropertyFlags req_prop) { VkBuffer buffer; VULKAN_CALL(vkCreateBuffer(vctx.device, &info, nullptr, &buffer)); @@ -139,8 +139,8 @@ int FindMemoryType(const VulkanContext& vctx, VkBufferCreateInfo info, } type_bits >>= 1; } - LOG(INFO) << "Requested memory type not found"; - return -1; + LOG(FATAL) << "Requested memory type not found"; + return 0; } VkBufferCreateInfo MakeBufferCreateInfo(const VulkanContext& vctx, size_t nbytes, @@ -1087,8 +1087,7 @@ VulkanHostVisibleBuffer* GetOrAllocate( VulkanStagingBuffer* VulkanThreadEntry::StagingBuffer(int device_id, size_t size) { const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - auto buf = GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, &staging_buffers_); - return buf; + return GetOrAllocate(device_id, size, usage, vctx.staging_mtype_index, &staging_buffers_); } void VulkanThreadEntry::AllocateUniformBuffer(int device_id, size_t size) { From e8de7f29d1ba1babf1c6eed8223df350c307b9a3 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 14 Apr 2021 03:49:17 +0900 Subject: [PATCH 24/28] Make ubo and push constant handling logic closer for readability --- src/runtime/vulkan/vulkan.cc | 22 ++++++++++++---------- src/runtime/vulkan/vulkan_common.h | 2 +- src/target/spirv/codegen_spirv.cc | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index af416606efd8..d9e23702e924 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc 
@@ -1153,21 +1153,22 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, } if (vctx.UseImmediate()) { // Can safely capture by reference as this lambda is immediately executed on the calling thread. - if (pipeline->use_ubo) { - auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); - memcpy(ubo->host_addr, pack_args, nbytes_scalars); - } VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Launch([&](VulkanStreamState* state) { vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); ICHECK(pipeline->descriptor_update_template != VK_NULL_HANDLE); vctx.descriptor_template_khr_functions->vkCmdPushDescriptorSetWithTemplateKHR( state->cmd_buffer_, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptor_buffers.data()); - if (num_pack_args_ > 0 && !pipeline->use_ubo) { + + if (pipeline->use_ubo) { + auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); + memcpy(ubo->host_addr, pack_args, nbytes_scalars); + } else if (num_pack_args_ > 0) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64), pack_args); } + vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); VkMemoryBarrier barrier_info; barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; @@ -1210,19 +1211,20 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, }; const auto& deferred_kernel = [this, pipeline, wl, pack_args_storage, nbytes_scalars, device_id](VulkanStreamState* state) { - if (pipeline->use_ubo) { - auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); - memcpy(ubo->host_addr, pack_args_storage.data(), nbytes_scalars); - } vkCmdBindPipeline(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); vkCmdBindDescriptorSets(state->cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, 
pipeline->pipeline_layout, 0, 1, &(pipeline->descriptor_set), 0, nullptr); - if (num_pack_args_ > 0 && !pipeline->use_ubo) { + + if (pipeline->use_ubo) { + auto ubo = VulkanThreadEntry::ThreadLocal()->GetUniformBuffer(device_id, nbytes_scalars); + memcpy(ubo->host_addr, pack_args_storage.data(), nbytes_scalars); + } else if (num_pack_args_ > 0) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, pack_args_storage.size() * sizeof(ArgUnion64), pack_args_storage.data()); } + vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); VkMemoryBarrier barrier_info; barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index c0105d45695b..6c0f76b13bba 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -35,7 +35,7 @@ namespace tvm { namespace runtime { namespace vulkan { -#define kMaxPushConstantsBytes 128 +const int kMaxPushConstantsBytes = 128; enum ShaderMetaDataKind { kUSE_UBO = 0 }; diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 72ce1debe533..64d50926cc9f 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -77,7 +77,7 @@ runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: for (size_t i = 0; i < pod_args.size(); ++i) { value_types.push_back(builder_->GetSType(pod_args[i].dtype())); } - if (pod_args.size() * sizeof(runtime::ArgUnion64) <= kMaxPushConstantsBytes) { + if (pod_args.size() * sizeof(runtime::ArgUnion64) <= runtime::vulkan::kMaxPushConstantsBytes) { spirv::Value ptr = builder_->DeclarePushConstant(value_types); for (size_t i = 0; i < pod_args.size(); ++i) { spirv::Value value = From 33fb8e6666491c020924f6d43d3bcab9be9c25ef Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 14 Apr 2021 05:13:40 +0900 Subject: [PATCH 25/28] update Flag mask --- 
src/runtime/vulkan/vulkan.cc | 2 +- src/runtime/vulkan/vulkan_common.h | 3 ++- src/target/spirv/codegen_spirv.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index d9e23702e924..a76208e0613b 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -854,7 +854,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { // create shader auto sit = smap_.find(func_name); ICHECK(sit != smap_.end()); - pe->use_ubo = sit->second.flag & (1 << kUSE_UBO); + pe->use_ubo = sit->second.flag & (1 << ShaderMetaDataFlagMask::kUseUBO); const std::vector& data = sit->second.data; VkShaderModuleCreateInfo shader_cinfo; shader_cinfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index 6c0f76b13bba..2ef879a487a6 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -37,7 +37,8 @@ namespace vulkan { const int kMaxPushConstantsBytes = 128; -enum ShaderMetaDataKind { kUSE_UBO = 0 }; +/*! \brief A mask used when we attach additional information to shaders */ +enum ShaderMetaDataFlagMask { kUseUBO = 0 }; inline const char* VKGetErrorString(VkResult error) { switch (error) { diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index 64d50926cc9f..5b26e9acf5a2 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -85,7 +85,7 @@ runtime::VulkanShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: var_map_[pod_args[i].get()] = value; } } else { - shader.flag |= 1 << runtime::vulkan::kUSE_UBO; + shader.flag |= 1 << runtime::vulkan::ShaderMetaDataFlagMask::kUseUBO; // If we need to pass more arguments than push constants could handle, we use UBO. 
spirv::Value ptr = builder_->DeclareUniformBuffer(value_types, num_buffer); for (size_t i = 0; i < pod_args.size(); ++i) { From 9df431ff4480fac3edd9422bdb4af2c34a12f5f8 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 14 Apr 2021 10:02:02 +0900 Subject: [PATCH 26/28] add sync after UBO realloc. Bugfix using CPU dev id on stream sync --- src/runtime/vulkan/vulkan.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index a76208e0613b..939c977cf1a3 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -338,7 +338,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } VulkanThreadEntry::ThreadLocal() - ->Stream(dev_from.device_id) + ->Stream(dev_to.device_id) ->Launch([&](VulkanStreamState* state) { // 0: barrier(host->transfer) VkMemoryBarrier barrier_info; @@ -359,7 +359,7 @@ class VulkanDeviceAPI final : public DeviceAPI { }); // TODO(tulloch): should we instead make the staging buffer a property of the // Stream? This would allow us to elide synchronizations here. 
- VulkanThreadEntry::ThreadLocal()->Stream(dev_from.device_id)->Synchronize(); + VulkanThreadEntry::ThreadLocal()->Stream(dev_to.device_id)->Synchronize(); } else { LOG(FATAL) << "Expect copy from/to Vulkan or between Vulkan" << ", from=" << from_dev_type << ", to=" << to_dev_type; @@ -1060,15 +1060,20 @@ VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return VulkanThreadStore:: VulkanHostVisibleBuffer* GetOrAllocate( int device_id, size_t size, VkBufferUsageFlags usage, uint32_t mem_type_index, - std::unordered_map>* buffers_ptr) { + std::unordered_map>* buffers_ptr, + bool sync_after_realloc = false) { auto& buffers = *buffers_ptr; if (!buffers[device_id]) { buffers[device_id] = std::make_unique(); } + auto& buf = *(buffers[device_id]); if (buf.device != nullptr && buf.size < size) { // free previous buffer DeleteHostVisibleBuffer(&buf); + if (sync_after_realloc) { + VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Synchronize(); + } } const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); @@ -1096,7 +1101,7 @@ void VulkanThreadEntry::AllocateUniformBuffer(int device_id, size_t size) { auto info = MakeBufferCreateInfo(vctx, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); auto mem_type_index = FindMemoryType(vctx, info, prop); GetOrAllocate(device_id, size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, mem_type_index, - &uniform_buffers_); + &uniform_buffers_, true); } VulkanUniformBuffer* VulkanThreadEntry::GetUniformBuffer(int device_id, size_t size) { From 98e90166f6d1375c1101882139d26a3c7061866a Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Thu, 15 Apr 2021 12:59:03 +0900 Subject: [PATCH 27/28] add doc --- src/runtime/vulkan/vulkan.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 939c977cf1a3..b7fc1eef1627 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -50,10 +50,15 @@ struct VulkanBuffer { VkDeviceMemory 
memory{VK_NULL_HANDLE}; }; +/*! \brief A struct to represent Vulkan buffers backed by host visible memory */ struct VulkanHostVisibleBuffer { + // A device where the buffer is allocated VkDevice device{nullptr}; + // Vulkan buffer and memory VulkanBuffer* vk_buf{nullptr}; + // The corresponding pointer to the host memory void* host_addr{nullptr}; + // The size of the buffer in bytes size_t size{0}; }; @@ -1061,7 +1066,7 @@ VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return VulkanThreadStore:: VulkanHostVisibleBuffer* GetOrAllocate( int device_id, size_t size, VkBufferUsageFlags usage, uint32_t mem_type_index, std::unordered_map>* buffers_ptr, - bool sync_after_realloc = false) { + bool sync_after_delete = false) { auto& buffers = *buffers_ptr; if (!buffers[device_id]) { buffers[device_id] = std::make_unique(); @@ -1071,7 +1076,12 @@ VulkanHostVisibleBuffer* GetOrAllocate( if (buf.device != nullptr && buf.size < size) { // free previous buffer DeleteHostVisibleBuffer(&buf); - if (sync_after_realloc) { + if (sync_after_delete) { + // For the deferred execution mode, we need to make sure that old tasks that use + // the older, smaller buffer get finished + // Synchronization on staging buffers is done after host to device memory copy + // For UBO, we sync here before we allocate a larger buffer, to minimize synchronization + // points VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Synchronize(); } } From 3b6fa8177b79436025cf53923e4b20c8807d869b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 16 Apr 2021 01:18:27 +0900 Subject: [PATCH 28/28] Bugfix sync should be before realloc --- src/runtime/vulkan/vulkan.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index b7fc1eef1627..2fe5b30330aa 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -1066,7 +1066,7 @@ VulkanThreadEntry* VulkanThreadEntry::ThreadLocal() { return 
VulkanThreadStore:: VulkanHostVisibleBuffer* GetOrAllocate( int device_id, size_t size, VkBufferUsageFlags usage, uint32_t mem_type_index, std::unordered_map>* buffers_ptr, - bool sync_after_delete = false) { + bool sync_before_realloc = false) { auto& buffers = *buffers_ptr; if (!buffers[device_id]) { buffers[device_id] = std::make_unique(); @@ -1075,15 +1075,15 @@ VulkanHostVisibleBuffer* GetOrAllocate( auto& buf = *(buffers[device_id]); if (buf.device != nullptr && buf.size < size) { // free previous buffer - DeleteHostVisibleBuffer(&buf); - if (sync_after_delete) { + if (sync_before_realloc) { // For the deferred execution mode, we need to make sure that old tasks that use // the older, smaller buffer get finished // Synchronization on staging buffers is done after host to device memory copy - // For UBO, we sync here before we allocate a larger buffer, to minimize synchronization + // For UBO, we sync here before we reallocate a larger buffer, to minimize synchronization // points VulkanThreadEntry::ThreadLocal()->Stream(device_id)->Synchronize(); } + DeleteHostVisibleBuffer(&buf); } const auto& vctx = VulkanDeviceAPI::Global()->context(device_id);