From 873b9619a73de8e7a9014d4e64311cf51c475c63 Mon Sep 17 00:00:00 2001 From: Ishi Tatsuyuki Date: Sun, 11 Oct 2020 18:07:03 +0900 Subject: [PATCH 1/3] Adjust Vulkan queue selection and creation logic - The queue selection logic rewrite addresses a bug in the old implementation where the code would pass an invalid queue index when selecting any queue other than number 0. - The new implementation will attempt to use compute-only queues, which are common on AMD GPUs. It's not clear how much difference this will make, but hopefully it will lead to better scheduling. The queue changes were made because, with the old configuration, autotvm caused my system to hang, stutter, or otherwise become unstable and crash. With the change I'm able to run autotvm tuning inside a desktop environment. --- src/runtime/vulkan/vulkan.cc | 70 +++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 568672591497..08d12951dfe9 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -117,6 +117,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; } void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; + uint32_t FindComputeQueue(VkPhysicalDevice phy_dev); void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { const auto& vctx = context(ctx.device_id); @@ -490,33 +491,17 @@ VulkanDeviceAPI::VulkanDeviceAPI() { std::vector all_phy_devs(phy_dev_count); VULKAN_CALL(vkEnumeratePhysicalDevices(instance_, &phy_dev_count, dmlc::BeginPtr(all_phy_devs))); for (VkPhysicalDevice phy_dev : all_phy_devs) { - uint32_t queue_prop_count = 0; - vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr); - std::vector queue_props(queue_prop_count); - vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, 
- dmlc::BeginPtr(queue_props)); - uint32_t queue_family_index = 0; - std::vector queue_create_info; + uint32_t queue_family_index = FindComputeQueue(phy_dev); + if (queue_family_index == -1U) continue; float priority = 1.0f; - for (uint32_t i = 0; i < queue_props.size(); i++) { - // find queues that support compute - if (VK_QUEUE_COMPUTE_BIT & queue_props[i].queueFlags) { - VkDeviceQueueCreateInfo info; - info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - info.pNext = nullptr; - info.flags = 0; - info.queueFamilyIndex = i; - info.queueCount = 1; - info.pQueuePriorities = &priority; - - queue_create_info.push_back(info); - // only use the first available queue for now - if (queue_create_info.size() == 0) { - queue_family_index = i; - } - } - } - if (queue_create_info.size() == 0) continue; + + VkDeviceQueueCreateInfo queue_create_info; + queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queue_create_info.pNext = nullptr; + queue_create_info.flags = 0; + queue_create_info.queueFamilyIndex = queue_family_index; + queue_create_info.queueCount = 1; + queue_create_info.pQueuePriorities = &priority; VulkanContext ctx; // setup context @@ -554,8 +539,8 @@ VulkanDeviceAPI::VulkanDeviceAPI() { device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; device_create_info.pNext = nullptr; device_create_info.flags = 0; - device_create_info.queueCreateInfoCount = static_cast(queue_create_info.size()); - device_create_info.pQueueCreateInfos = queue_create_info.data(); + device_create_info.queueCreateInfoCount = 1; + device_create_info.pQueueCreateInfos = &queue_create_info; device_create_info.enabledLayerCount = 0; device_create_info.ppEnabledLayerNames = nullptr; device_create_info.enabledExtensionCount = extensions.size(); @@ -677,7 +662,34 @@ VulkanDeviceAPI::VulkanDeviceAPI() { << "\' phy_dev_id=" << context_[i].phy_device << " use_immediate=" << context_[i].UseImmediate(); } -} // namespace vulkan +} + +uint32_t 
VulkanDeviceAPI::FindComputeQueue(VkPhysicalDevice phy_dev) { + uint32_t queue_prop_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr); + std::vector queue_props(queue_prop_count); + vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, dmlc::BeginPtr(queue_props)); + // Prefer compute-only queues. On cerain devices supporting this (e.g. Mesa RADV), using + // compute-only queues gives better responsiveness for other graphics workload (e.g. desktop). + auto compute_dedicated = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) { + return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0 && + (VK_QUEUE_GRAPHICS_BIT & prop.queueFlags) == 0; + }); + if (compute_dedicated == queue_props.end()) { + auto compute = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) { + return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0; + }); + if (compute == queue_props.end()) { + return -1; + } else { + return std::distance(queue_props.begin(), compute); + } + } else { + return std::distance(queue_props.begin(), compute_dedicated); + } +} + +// namespace vulkan class VulkanModuleNode; // a wrapped function class to get packed func. 
From b9c4ff2e5441f6027a1d000450857ae7ab413596 Mon Sep 17 00:00:00 2001 From: Ishi Tatsuyuki Date: Tue, 13 Oct 2020 12:54:30 +0900 Subject: [PATCH 2/3] Return multiple queue family indexes from GetComputeQueueFamilies --- src/runtime/vulkan/vulkan.cc | 61 ++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 08d12951dfe9..57ce2e650ab6 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -117,7 +117,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; } void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; - uint32_t FindComputeQueue(VkPhysicalDevice phy_dev); + std::vector GetComputeQueueFamilies(VkPhysicalDevice phy_dev); void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { const auto& vctx = context(ctx.device_id); @@ -491,17 +491,24 @@ VulkanDeviceAPI::VulkanDeviceAPI() { std::vector all_phy_devs(phy_dev_count); VULKAN_CALL(vkEnumeratePhysicalDevices(instance_, &phy_dev_count, dmlc::BeginPtr(all_phy_devs))); for (VkPhysicalDevice phy_dev : all_phy_devs) { - uint32_t queue_family_index = FindComputeQueue(phy_dev); - if (queue_family_index == -1U) continue; + // Get a list of queue families supporting compute, in order of preference. We currently only + // make use of the most preferred one family. 
+ std::vector queue_family_indexes = GetComputeQueueFamilies(phy_dev); + if (queue_family_indexes.empty()) continue; + uint32_t queue_family_index = queue_family_indexes[0]; float priority = 1.0f; - VkDeviceQueueCreateInfo queue_create_info; - queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - queue_create_info.pNext = nullptr; - queue_create_info.flags = 0; - queue_create_info.queueFamilyIndex = queue_family_index; - queue_create_info.queueCount = 1; - queue_create_info.pQueuePriorities = &priority; + std::vector queue_create_info; + for (const auto& index : queue_family_indexes) { + struct VkDeviceQueueCreateInfo info {}; + info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + info.pNext = nullptr; + info.flags = 0; + info.queueFamilyIndex = index; + info.queueCount = 1; + info.pQueuePriorities = &priority; + queue_create_info.push_back(info); + } VulkanContext ctx; // setup context @@ -539,8 +546,8 @@ VulkanDeviceAPI::VulkanDeviceAPI() { device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; device_create_info.pNext = nullptr; device_create_info.flags = 0; - device_create_info.queueCreateInfoCount = 1; - device_create_info.pQueueCreateInfos = &queue_create_info; + device_create_info.queueCreateInfoCount = queue_create_info.size(); + device_create_info.pQueueCreateInfos = queue_create_info.data(); device_create_info.enabledLayerCount = 0; device_create_info.ppEnabledLayerNames = nullptr; device_create_info.enabledExtensionCount = extensions.size(); @@ -664,29 +671,29 @@ VulkanDeviceAPI::VulkanDeviceAPI() { } } -uint32_t VulkanDeviceAPI::FindComputeQueue(VkPhysicalDevice phy_dev) { +std::vector VulkanDeviceAPI::GetComputeQueueFamilies(VkPhysicalDevice phy_dev) { uint32_t queue_prop_count = 0; vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr); std::vector queue_props(queue_prop_count); vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, dmlc::BeginPtr(queue_props)); + + std::vector 
result; // Prefer compute-only queues. On cerain devices supporting this (e.g. Mesa RADV), using // compute-only queues gives better responsiveness for other graphics workload (e.g. desktop). - auto compute_dedicated = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) { - return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0 && - (VK_QUEUE_GRAPHICS_BIT & prop.queueFlags) == 0; - }); - if (compute_dedicated == queue_props.end()) { - auto compute = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) { - return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0; - }); - if (compute == queue_props.end()) { - return -1; - } else { - return std::distance(queue_props.begin(), compute); + for (uint32_t i = 0; i != queue_prop_count; ++i) { + if ((VK_QUEUE_COMPUTE_BIT & queue_props[i].queueFlags) != 0 && + (VK_QUEUE_GRAPHICS_BIT & queue_props[i].queueFlags) == 0) { + result.push_back(i); + } + } + // Now, push the compute queues that we skipped above into the list. + for (uint32_t i = 0; i != queue_prop_count; ++i) { + if ((VK_QUEUE_COMPUTE_BIT & queue_props[i].queueFlags) != 0 && + (VK_QUEUE_GRAPHICS_BIT & queue_props[i].queueFlags) != 0) { + result.push_back(i); } - } else { - return std::distance(queue_props.begin(), compute_dedicated); } + return result; } // namespace vulkan From 5c8beab72c06fd86c151016fad5b0bffe03acc45 Mon Sep 17 00:00:00 2001 From: Ishi Tatsuyuki Date: Tue, 13 Oct 2020 22:22:32 +0900 Subject: [PATCH 3/3] Only create one queue --- src/runtime/vulkan/vulkan.cc | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index 57ce2e650ab6..5b630337acbb 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -498,17 +498,13 @@ VulkanDeviceAPI::VulkanDeviceAPI() { uint32_t queue_family_index = queue_family_indexes[0]; float priority = 1.0f; - std::vector queue_create_info; - for (const auto& index : 
queue_family_indexes) { - struct VkDeviceQueueCreateInfo info {}; - info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - info.pNext = nullptr; - info.flags = 0; - info.queueFamilyIndex = index; - info.queueCount = 1; - info.pQueuePriorities = &priority; - queue_create_info.push_back(info); - } + struct VkDeviceQueueCreateInfo queue_create_info; + queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queue_create_info.pNext = nullptr; + queue_create_info.flags = 0; + queue_create_info.queueFamilyIndex = queue_family_index; + queue_create_info.queueCount = 1; + queue_create_info.pQueuePriorities = &priority; VulkanContext ctx; // setup context @@ -546,8 +542,8 @@ VulkanDeviceAPI::VulkanDeviceAPI() { device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; device_create_info.pNext = nullptr; device_create_info.flags = 0; - device_create_info.queueCreateInfoCount = queue_create_info.size(); - device_create_info.pQueueCreateInfos = queue_create_info.data(); + device_create_info.queueCreateInfoCount = 1; + device_create_info.pQueueCreateInfos = &queue_create_info; device_create_info.enabledLayerCount = 0; device_create_info.ppEnabledLayerNames = nullptr; device_create_info.enabledExtensionCount = extensions.size();