From 81286c85c48ca28587cf7a432dd62776aad98bf4 Mon Sep 17 00:00:00 2001
From: avulisha
Date: Mon, 21 Mar 2022 22:44:22 +0530
Subject: [PATCH] [Runtime][Vulkan] Add RGP support to TVM for vulkan device

RGP (Radeon GPU Profiler) is a tool used to analyze applications running on
AMD GPUs. RGP captures data based on VKPresent and provides hardware-specific
information, allowing the developer to optimize the application.

To add RGP support to TVM, the debug labels "AmdFrameBegin" and "AmdFrameEnd"
need to be inserted into the Vulkan queue. These labels help the RGP tool
identify the start and end of a frame when no present is available, enabling
it to capture and analyze the data.

At runtime, set the environment variable "TVM_USE_AMD_RGP=1" to start
inserting the debug labels into the Vulkan queue.

Signed-off-by: Wilkin Chau
Signed-off-by: Anurag Kumar Vulisha
---
 src/runtime/vulkan/vulkan_amdrgp.cc       | 53 +++++++++++++++++++
 src/runtime/vulkan/vulkan_amdrgp.h        | 63 +++++++++++++++++++++++
 src/runtime/vulkan/vulkan_device.cc       | 13 +++++
 src/runtime/vulkan/vulkan_device.h        | 12 +++++
 src/runtime/vulkan/vulkan_device_api.cc   |  3 ++
 src/runtime/vulkan/vulkan_instance.cc     |  7 +++
 src/runtime/vulkan/vulkan_stream.cc       | 13 +++++
 src/runtime/vulkan/vulkan_stream.h        | 16 ++++++
 src/runtime/vulkan/vulkan_wrapped_func.cc | 18 +++++++
 9 files changed, 198 insertions(+)
 create mode 100644 src/runtime/vulkan/vulkan_amdrgp.cc
 create mode 100644 src/runtime/vulkan/vulkan_amdrgp.h

diff --git a/src/runtime/vulkan/vulkan_amdrgp.cc b/src/runtime/vulkan/vulkan_amdrgp.cc
new file mode 100644
index 000000000000..54e566410f49
--- /dev/null
+++ b/src/runtime/vulkan/vulkan_amdrgp.cc
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "vulkan_device.h"
+
+namespace tvm {
+namespace runtime {
+namespace vulkan {
+
+// The profiler starts READY and is available only if the device reports UseDebugUtilsLabel().
+VulkanStreamProfiler::VulkanStreamProfiler(const VulkanDevice* device)
+    : device_(device), curr_state_(READY), available_(device->UseDebugUtilsLabel()) {}
+
+// Inserts the "AmdFrameEnd" and "AmdFrameBegin" labels into the device queue. Acts only when
+// the profiler is available and READY, and leaves it in the RUNNING state.
+void AmdRgpProfiler::capture() {
+  if (!available_ || curr_state_ != READY) {
+    return;
+  }
+
+  // The two labels differ only by name, so build and insert them through one helper.
+  const auto insert_label = [this](const char* label_name) {
+    VkDebugUtilsLabelEXT label = {
+        VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr, label_name, {0.0f, 0.0f, 0.0f, 0.0f}};
+    device_->queue_insert_debug_utils_label_functions->vkQueueInsertDebugUtilsLabelEXT(
+        device_->Queue(), &label);
+  };
+
+  // Trigger RGP capture by using dummy present and switch state from READY to RUNNING
+  insert_label("AmdFrameEnd");
+  insert_label("AmdFrameBegin");
+  curr_state_ = RUNNING;
+}
+
+}  // namespace vulkan
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/vulkan/vulkan_amdrgp.h b/src/runtime/vulkan/vulkan_amdrgp.h
new file mode 100644
index 000000000000..aa090eeaa829
--- /dev/null
+++ b/src/runtime/vulkan/vulkan_amdrgp.h
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license
agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_VULKAN_VULKAN_AMDRGP_H_
+#define TVM_RUNTIME_VULKAN_VULKAN_AMDRGP_H_
+
+namespace tvm {
+namespace runtime {
+namespace vulkan {
+
+class VulkanDevice;
+
+// Tracks a READY/RUNNING/RESET state for a profiler attached to a Vulkan stream.
+class VulkanStreamProfiler {
+ public:
+  enum state { READY = 0, RUNNING, RESET };
+
+  explicit VulkanStreamProfiler(const VulkanDevice* device);
+  virtual ~VulkanStreamProfiler() = default;
+  // Unconditionally switch to RESET.
+  virtual void reset() { curr_state_ = RESET; }
+  // Switch to READY, but only from RESET; any other state is left unchanged.
+  virtual void ready() {
+    if (curr_state_ == RESET) {
+      curr_state_ = READY;
+    }
+  }
+  // Profiler-specific capture hook, implemented by subclasses.
+  virtual void capture() = 0;
+
+ protected:
+  const VulkanDevice* device_;  // not deleted by this class
+  state curr_state_;
+  bool available_;
+};
+
+// Profiler for AMD RGP; capture() is defined in vulkan_amdrgp.cc.
+class AmdRgpProfiler : public VulkanStreamProfiler {
+ public:
+  explicit AmdRgpProfiler(const VulkanDevice* device) : VulkanStreamProfiler(device) {}
+  void capture() override;
+};
+
+}  // namespace vulkan
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_VULKAN_VULKAN_AMDRGP_H_
diff --git a/src/runtime/vulkan/vulkan_device.cc b/src/runtime/vulkan/vulkan_device.cc
index 29908bed8189..7a6b92943c90 100644
--- a/src/runtime/vulkan/vulkan_device.cc
+++ b/src/runtime/vulkan/vulkan_device.cc
@@ -228,6 +228,14 @@ VulkanGetBufferMemoryRequirements2Functions::VulkanGetBufferMemoryRequirements2F
       vkGetDeviceProcAddr(device, "vkGetBufferMemoryRequirements2KHR"));
 }
 
+// Resolves vkQueueInsertDebugUtilsLabelEXT via vkGetInstanceProcAddr; the result is passed
+// through ICHECK_NOTNULL before being stored.
+VulkanQueueInsertDebugUtilsLabelFunctions::VulkanQueueInsertDebugUtilsLabelFunctions(
+    VkInstance instance) {
+  vkQueueInsertDebugUtilsLabelEXT = (PFN_vkQueueInsertDebugUtilsLabelEXT)ICHECK_NOTNULL(
+      vkGetInstanceProcAddr(instance, "vkQueueInsertDebugUtilsLabelEXT"));
+}
+
 VulkanDevice::VulkanDevice(const VulkanInstance& instance, VkPhysicalDevice phy_device)
     : physical_device_(phy_device) {
   queue_family_index = SelectComputeQueueFamily();
@@ -325,6 +333,11 @@ VulkanDevice::VulkanDevice(const VulkanInstance& instance, VkPhysicalDevice phy_
     get_buffer_memory_requirements_2_functions =
         std::make_unique<VulkanGetBufferMemoryRequirements2Functions>(device_);
   }
+
+  if (instance.HasExtension("VK_EXT_debug_utils")) {
+    queue_insert_debug_utils_label_functions =
+        std::make_unique<VulkanQueueInsertDebugUtilsLabelFunctions>(instance);
+  }
 }
 
 VulkanDevice::~VulkanDevice() {
@@ -363,6 +376,8 @@ void VulkanDevice::do_swap(VulkanDevice&& other) {
   std::swap(descriptor_template_khr_functions, other.descriptor_template_khr_functions);
   std::swap(get_buffer_memory_requirements_2_functions,
             other.get_buffer_memory_requirements_2_functions);
+  std::swap(queue_insert_debug_utils_label_functions,
+            other.queue_insert_debug_utils_label_functions);
   std::swap(compute_mtype_index, other.compute_mtype_index);
   std::swap(queue, other.queue);
   std::swap(queue_family_index, other.queue_family_index);
diff --git a/src/runtime/vulkan/vulkan_device.h b/src/runtime/vulkan/vulkan_device.h
index 3ca2d093bf1d..a1257a732aff 100644
--- a/src/runtime/vulkan/vulkan_device.h
+++ b/src/runtime/vulkan/vulkan_device.h
@@ -57,6 +57,12 @@ struct VulkanGetBufferMemoryRequirements2Functions {
   PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR{nullptr};
 };
 
+struct VulkanQueueInsertDebugUtilsLabelFunctions {
+  explicit VulkanQueueInsertDebugUtilsLabelFunctions(VkInstance instance);
+  // Entry point looked up through vkGetInstanceProcAddr by the constructor.
+  PFN_vkQueueInsertDebugUtilsLabelEXT vkQueueInsertDebugUtilsLabelEXT{nullptr};
+};
+
 /*!
* \brief Stores the capabilities/limits queried from the physical device.
  *
@@ -212,6 +218,8 @@ class VulkanDevice {
   std::unique_ptr<VulkanDescriptorTemplateKHRFunctions> descriptor_template_khr_functions{nullptr};
   std::unique_ptr<VulkanGetBufferMemoryRequirements2Functions>
       get_buffer_memory_requirements_2_functions{nullptr};
+  std::unique_ptr<VulkanQueueInsertDebugUtilsLabelFunctions>
+      queue_insert_debug_utils_label_functions{nullptr};
 
   // Memory type index for compute
   uint32_t compute_mtype_index{0};
@@ -220,6 +228,12 @@ class VulkanDevice {
 
   bool UseImmediate() const { return descriptor_template_khr_functions != nullptr; }
 
+  // True when queue_insert_debug_utils_label_functions has been created for this device.
+  bool UseDebugUtilsLabel() const { return queue_insert_debug_utils_label_functions != nullptr; }
+
+  // Read-only accessor for the `queue` member.
+  VkQueue Queue() const { return queue; }
+
  private:
   /*! \brief Helper function for move assignment/construction
    *
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 676f14667d70..93f017a5aa66 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -367,6 +367,7 @@ void VulkanDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void*
                         &copy_info);
       });
       stream.Synchronize();
+      stream.ProfilerReset();
       if (!device.coherent_staging) {
         VkMappedMemoryRange mrange;
         mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@@ -413,6 +414,8 @@ void VulkanDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void*
         vkCmdCopyBuffer(state->cmd_buffer_, staging_buffer.vk_buf.buffer, to_buf->buffer, 1,
                         &copy_info);
       });
+
+      stream.ProfilerReady();
       // TODO(tulloch): should we instead make the staging buffer a property of the
       // Stream? This would allow us to elide synchronizations here.
stream.Synchronize();
diff --git a/src/runtime/vulkan/vulkan_instance.cc b/src/runtime/vulkan/vulkan_instance.cc
index b8295d2cd605..a77531a5214f 100644
--- a/src/runtime/vulkan/vulkan_instance.cc
+++ b/src/runtime/vulkan/vulkan_instance.cc
@@ -59,6 +59,13 @@ VulkanInstance::VulkanInstance() {
     std::vector<const char*> required_extensions{};
     std::vector<const char*> optional_extensions{"VK_KHR_get_physical_device_properties2"};
 
+    // Check if RGP support is needed. If needed, enable VK_EXT_debug_utils extension for
+    // inserting debug labels into the queue.
+    if (support::BoolEnvironmentVar("TVM_USE_AMD_RGP")) {
+      LOG(INFO) << "Push VK_EXT_debug_utils";
+      required_extensions.push_back("VK_EXT_debug_utils");
+    }
+
     uint32_t inst_extension_prop_count;
     VULKAN_CALL(
         vkEnumerateInstanceExtensionProperties(nullptr, &inst_extension_prop_count, nullptr));
diff --git a/src/runtime/vulkan/vulkan_stream.cc b/src/runtime/vulkan/vulkan_stream.cc
index 3eff112a6eea..5cdb5768924b 100644
--- a/src/runtime/vulkan/vulkan_stream.cc
+++ b/src/runtime/vulkan/vulkan_stream.cc
@@ -19,6 +19,7 @@
 
 #include "vulkan_stream.h"
 
+#include "../../support/utils.h"
 #include "vulkan_device.h"
 
 namespace tvm {
@@ -55,11 +56,19 @@ VulkanStream::VulkanStream(const VulkanDevice* device)
   cb_begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   cb_begin.pInheritanceInfo = 0;
   VULKAN_CALL(vkBeginCommandBuffer(state_->cmd_buffer_, &cb_begin));
+
+  if (support::BoolEnvironmentVar("TVM_USE_AMD_RGP")) {
+    profiler_ = new AmdRgpProfiler(device_);
+  }
 }
 
 VulkanStream::~VulkanStream() {
   vkDestroyFence(*device_, state_->fence_, nullptr);
   vkDestroyCommandPool(*device_, cmd_pool_, nullptr);
+
+  // Free the profiler allocated in the constructor. `delete` on a null pointer is a no-op,
+  // so no guard is needed.
+  delete profiler_;
 }
 
 void VulkanStream::Launch(const std::function<void(VulkanStreamState*)>& kernel) {
@@ -132,6 +141,10 @@ void VulkanStream::Synchronize() {
   cb_submit.signalSemaphoreCount = 0;
   cb_submit.pSignalSemaphores = nullptr;
 
+  if (profiler_) {
+    profiler_->capture();
+  }
+
   device_->QueueSubmit(cb_submit, state_->fence_);
 
   uint64_t timeout = 1UL
<< 30UL;
diff --git a/src/runtime/vulkan/vulkan_stream.h b/src/runtime/vulkan/vulkan_stream.h
index fb4e447c15e1..742a66f15dd4 100644
--- a/src/runtime/vulkan/vulkan_stream.h
+++ b/src/runtime/vulkan/vulkan_stream.h
@@ -25,6 +25,7 @@
 #include <unordered_map>
 #include <vector>
 
+#include "vulkan_amdrgp.h"
 #include "vulkan_common.h"
 
 namespace tvm {
@@ -99,6 +100,20 @@ class VulkanStream {
                       const std::function<void(VulkanStreamState*)>& deferred_kernel,
                       const VulkanStreamToken& deferred_token);
 
+  // Put the profiler, if one was created, into the RESET state.
+  void ProfilerReset() {
+    if (profiler_) {
+      profiler_->reset();
+    }
+  }
+
+  // Move the profiler, if one was created, from RESET back to READY.
+  void ProfilerReady() {
+    if (profiler_) {
+      profiler_->ready();
+    }
+  }
+
   // Synchronize the current stream `state_` with respect to the host.
   void Synchronize();
 
@@ -110,6 +125,7 @@ class VulkanStream {
   std::unordered_map<VkDescriptorSet, std::vector<VulkanStreamToken>> deferred_tokens_;
   std::vector<std::function<void(VulkanStreamState*)>> deferred_kernels_;
   VkCommandPool cmd_pool_;
+  VulkanStreamProfiler* profiler_ = nullptr;
 };
 
 }  // namespace vulkan
diff --git a/src/runtime/vulkan/vulkan_wrapped_func.cc b/src/runtime/vulkan/vulkan_wrapped_func.cc
index 0712f723bb64..f06ca5043b01 100644
--- a/src/runtime/vulkan/vulkan_wrapped_func.cc
+++ b/src/runtime/vulkan/vulkan_wrapped_func.cc
@@ -98,6 +98,15 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv,
       vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                            VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                            1, &barrier_info, 0, nullptr, 0, nullptr);
+
+      // Insert a debug label carrying this function's name into the device queue
+      // (only when the debug-utils label entry point is available).
+      if (device.UseDebugUtilsLabel()) {
+        VkDebugUtilsLabelEXT dispatch_label = {VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr,
+                                               func_name_.c_str(), {0.0f, 0.0f, 0.0f, 0.0f}};
+        device.queue_insert_debug_utils_label_functions->vkQueueInsertDebugUtilsLabelEXT(
+            device.Queue(), &dispatch_label);
+      }
     });
     return;
   }
@@ -164,6 +173,15 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv,
     deferred_token.buffers_[i] = descriptor_buffers[i].buffer;
   }
device.ThreadLocalStream().LaunchDeferred(deferred_initializer, deferred_kernel, deferred_token);
+
+  // Insert a debug label carrying this function's name into the device queue
+  // (only when the debug-utils label entry point is available).
+  if (device.UseDebugUtilsLabel()) {
+    VkDebugUtilsLabelEXT dispatch_label = {VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, nullptr,
+                                           func_name_.c_str(), {0.0f, 0.0f, 0.0f, 0.0f}};
+    device.queue_insert_debug_utils_label_functions->vkQueueInsertDebugUtilsLabelEXT(
+        device.Queue(), &dispatch_label);
+  }
 }
 
 VulkanModuleNode::~VulkanModuleNode() {