From 2244a1a9dfe96db078e4fda9d970bcb410194efd Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 31 Oct 2025 16:40:29 +0800 Subject: [PATCH 1/2] feat(tracy): integrate Tracy performance profiling support - Add Tracy profiling support with new CMake option `MLLM_TRACY_ENABLE` - Integrate Tracy memory tracking in CPU allocator (`TracyAlloc`/`TracyFree`) - Add Tracy zones to key engine components (Dispatcher, Context, MemoryManager) - Update Tracy CMakeLists to link against `Tracy::TracyClient` - Include Tracy headers and zone macros in relevant source files - Disable Tracy by default in macOS build configuration - Install `MllmTracy` target when Tracy is enabled - Link `MllmRT` with `MllmTracy` when Tracy support is active - Refactor Tracy header guards and include paths for better compatibility --- CMakeLists.txt | 9 +++++++++ mllm/CMakeLists.txt | 1 + mllm/backends/cpu/CPUAllocator.cpp | 11 ++++++++++- mllm/backends/cpu/CPUDispatcher.cpp | 2 ++ mllm/engine/Context.cpp | 2 ++ mllm/engine/DispatcherManager.cpp | 6 +++++- mllm/engine/MemoryManager.cpp | 3 +++ mllm/tracy_perf/CMakeLists.txt | 3 ++- mllm/tracy_perf/Tracy.cpp | 8 ++++++-- mllm/tracy_perf/Tracy.hpp | 10 +++++----- tasks/build_osx_apple_silicon_accelerate.yaml | 1 + 11 files changed, 46 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 40004729a..49b263bfe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,6 +300,15 @@ if(MLLM_BUILD_SDK_C_BINDING) RUNTIME DESTINATION bin) endif() +if(MLLM_TRACY_ENABLE) + install( + TARGETS MllmTracy + EXPORT MllmTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin) +endif() + install( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/mllm/ DESTINATION include/mllm diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt index c75928605..8d0dbca38 100644 --- a/mllm/CMakeLists.txt +++ b/mllm/CMakeLists.txt @@ -80,6 +80,7 @@ endif() if(MLLM_TRACY_ENABLE) add_subdirectory(tracy_perf) + 
target_link_libraries(MllmRT PUBLIC MllmTracy) endif() # Host backend will be build by default diff --git a/mllm/backends/cpu/CPUAllocator.cpp b/mllm/backends/cpu/CPUAllocator.cpp index 0358b85c3..2b12c7cdd 100644 --- a/mllm/backends/cpu/CPUAllocator.cpp +++ b/mllm/backends/cpu/CPUAllocator.cpp @@ -3,6 +3,7 @@ #include "mllm/backends/cpu/CPUAllocator.hpp" #include "mllm/backends/cpu/kernels/Kernels.hpp" +#include "mllm/tracy_perf/Tracy.hpp" namespace mllm::cpu { @@ -18,12 +19,20 @@ void align_alloc(void** ptr, size_t required_bytes, size_t align) { *ptr = nullptr; return; } +#if defined(MLLM_TRACY_ENABLE) && MLLM_TRACY_ENABLE == 1 + TracyAlloc(p1, required_bytes + offset); +#endif p2 = (void**)(((size_t)(p1) + offset) & ~(align - 1)); // NOLINT p2[-1] = p1; *ptr = p2; } -void align_free(void* ptr) { free(((void**)ptr)[-1]); } +void align_free(void* ptr) { + free(((void**)ptr)[-1]); +#if defined(MLLM_TRACY_ENABLE) && MLLM_TRACY_ENABLE == 1 + TracyFree(((void**)ptr)[-1]); +#endif +} bool CPUAllocator::alloc(Storage* storage) { void* ptr; diff --git a/mllm/backends/cpu/CPUDispatcher.cpp b/mllm/backends/cpu/CPUDispatcher.cpp index 1316ff258..65c71a47a 100644 --- a/mllm/backends/cpu/CPUDispatcher.cpp +++ b/mllm/backends/cpu/CPUDispatcher.cpp @@ -5,6 +5,7 @@ #include "mllm/engine/Dispatcher.hpp" #include "mllm/utils/Common.hpp" #include "mllm/nn/Module.hpp" +#include "mllm/tracy_perf/Tracy.hpp" #ifdef MLLM_PERFETTO_ENABLE #include "mllm/engine/Perf.hpp" @@ -39,6 +40,7 @@ TaskResult::sender_t CPUDispatcher::asyncReceive(const Task::ptr_t& task) { } void CPUDispatcher::process(const Task::ptr_t& task) { + MLLM_TRACY_ZONE_SCOPED; switch (task->type) { case TaskTypes::kExecuteOp: { #ifdef MLLM_PERFETTO_ENABLE diff --git a/mllm/engine/Context.cpp b/mllm/engine/Context.cpp index a16bb4b7d..c4e178b55 100644 --- a/mllm/engine/Context.cpp +++ b/mllm/engine/Context.cpp @@ -7,6 +7,7 @@ #include "mllm/engine/Context.hpp" #include "mllm/engine/SessionTCB.hpp" #include 
"mllm/engine/DispatcherManager.hpp" +#include "mllm/tracy_perf/Tracy.hpp" namespace mllm { @@ -42,6 +43,7 @@ Backend::ptr_t Context::getBackend(const DeviceTypes& device) { std::vector Context::buildOpAndSubmitTask(OpTypes op_type, const BaseOpOptionsBase& base_options, const std::vector& inputs, DeviceTypes special_device) { + MLLM_TRACY_ZONE_SCOPED; auto device = special_device != kDeviceTypes_End ? special_device : inputs[0].device(); // If input device and special device are different, prefer non-CPU device diff --git a/mllm/engine/DispatcherManager.cpp b/mllm/engine/DispatcherManager.cpp index 4e905e935..6ff0e62cf 100644 --- a/mllm/engine/DispatcherManager.cpp +++ b/mllm/engine/DispatcherManager.cpp @@ -5,6 +5,7 @@ #include "exec/static_thread_pool.hpp" #include "mllm/utils/Common.hpp" #include "mllm/engine/Context.hpp" +#include "mllm/tracy_perf/Tracy.hpp" namespace mllm { @@ -14,7 +15,10 @@ DispatcherManager::DispatcherManager(const DispatcherManagerOptions& options) exec::numa_policy numa{exec::no_numa_policy{}}; } -void DispatcherManager::submit(dispatcher_id_t id, const Task::ptr_t& task) { dispatchers_[id]->receive(task); } +void DispatcherManager::submit(dispatcher_id_t id, const Task::ptr_t& task) { + MLLM_TRACY_ZONE_SCOPED; + dispatchers_[id]->receive(task); +} TaskResult::sender_t DispatcherManager::asyncSubmit(dispatcher_id_t id, const Task::ptr_t& task) { return dispatchers_[id]->asyncReceive(task); diff --git a/mllm/engine/MemoryManager.cpp b/mllm/engine/MemoryManager.cpp index 0e02a19a0..5372ef0d7 100644 --- a/mllm/engine/MemoryManager.cpp +++ b/mllm/engine/MemoryManager.cpp @@ -3,6 +3,7 @@ #include "mllm/utils/Common.hpp" #include "mllm/engine/MemoryManager.hpp" +#include "mllm/tracy_perf/Tracy.hpp" #ifdef MLLM_PERFETTO_ENABLE #include "mllm/engine/Perf.hpp" @@ -25,6 +26,7 @@ void MemoryManager::registerAllocator(const DeviceTypes& device, const Allocator } void MemoryManager::alloc(Storage* s) { + MLLM_TRACY_ZONE_SCOPED; auto& allocator = 
allocators_[s->device_]; auto try_to_alloc_size = allocator->allocSize(s); @@ -58,6 +60,7 @@ void MemoryManager::alloc(Storage* s) { void MemoryManager::alloc(const std::shared_ptr& s) { alloc(s.get()); } void MemoryManager::free(Storage* s) { + MLLM_TRACY_ZONE_SCOPED; auto& allocator = allocators_[s->device_]; auto try_to_alloc_size = allocator->allocSize(s); diff --git a/mllm/tracy_perf/CMakeLists.txt b/mllm/tracy_perf/CMakeLists.txt index 9e418cbc5..1030682c8 100644 --- a/mllm/tracy_perf/CMakeLists.txt +++ b/mllm/tracy_perf/CMakeLists.txt @@ -1,5 +1,6 @@ if(MLLM_TRACY_ENABLE) add_library(MllmTracy SHARED Tracy.cpp) - target_link_libraries(MllmTracy PUBLIC tracy) + target_link_libraries(MllmTracy PUBLIC Tracy::TracyClient) + target_include_directories(MllmTracy PUBLIC ${MLLM_INCLUDE_DIR}) target_compile_definitions(MllmTracy PUBLIC MLLM_TRACY_ENABLE) endif() diff --git a/mllm/tracy_perf/Tracy.cpp b/mllm/tracy_perf/Tracy.cpp index 948a7a09b..f37e100b9 100644 --- a/mllm/tracy_perf/Tracy.cpp +++ b/mllm/tracy_perf/Tracy.cpp @@ -1,3 +1,7 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + #ifdef MLLM_TRACY_ENABLE -#include "tracy/Tracy.cpp" -#endif \ No newline at end of file +#include +#include "mllm/tracy_perf/Tracy.hpp" +#endif diff --git a/mllm/tracy_perf/Tracy.hpp b/mllm/tracy_perf/Tracy.hpp index 72928f8ac..c35e0ab74 100644 --- a/mllm/tracy_perf/Tracy.hpp +++ b/mllm/tracy_perf/Tracy.hpp @@ -1,8 +1,10 @@ -#ifndef MLLM_TRACY_HPP -#define MLLM_TRACY_HPP +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once #ifdef MLLM_TRACY_ENABLE -#include "tracy/Tracy.hpp" +#include #define MLLM_TRACY_ZONE_SCOPED ZoneScoped #define MLLM_TRACY_ZONE_SCOPED_NAMED(name) ZoneScopedN(name) #define MLLM_TRACY_FRAME_MARK FrameMark @@ -11,5 +13,3 @@ #define MLLM_TRACY_ZONE_SCOPED_NAMED(name) #define MLLM_TRACY_FRAME_MARK #endif - -#endif // MLLM_TRACY_HPP \ No newline at end of file diff --git a/tasks/build_osx_apple_silicon_accelerate.yaml b/tasks/build_osx_apple_silicon_accelerate.yaml index d76807053..6ea18b1cb 100644 --- a/tasks/build_osx_apple_silicon_accelerate.yaml +++ b/tasks/build_osx_apple_silicon_accelerate.yaml @@ -10,6 +10,7 @@ Tasks: - "-DMLLM_KERNEL_USE_THREADS=ON" - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=OFF" - "-DMLLM_KERNEL_THREADS_VENDOR_APPLE_GCD=ON" + - "-DMLLM_TRACY_ENABLE=OFF" - CMakeBuildTask: cmake_cfg_path: "build-osx-accelerate" From 5b55ec259116ae3ebd0b46ee831cd7da0d515a8c Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 31 Oct 2025 17:55:35 +0800 Subject: [PATCH 2/2] fix(cpu): correct TracyFree placement in align_free function The TracyFree call was placed after the free() call, so its argument, ((void**)ptr)[-1], was read from the block that had just been freed, which is undefined behavior. This change calls TracyFree before the memory is actually freed, allowing proper tracking and profiling of memory deallocation when Tracy is enabled. Additionally, remove the redundant Tracy include in Tracy.cpp to avoid potential conflicts with the local Tracy implementation. 
--- mllm/backends/cpu/CPUAllocator.cpp | 2 +- mllm/tracy_perf/Tracy.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mllm/backends/cpu/CPUAllocator.cpp b/mllm/backends/cpu/CPUAllocator.cpp index 2b12c7cdd..24d8b6398 100644 --- a/mllm/backends/cpu/CPUAllocator.cpp +++ b/mllm/backends/cpu/CPUAllocator.cpp @@ -28,10 +28,10 @@ void align_alloc(void** ptr, size_t required_bytes, size_t align) { } void align_free(void* ptr) { - free(((void**)ptr)[-1]); #if defined(MLLM_TRACY_ENABLE) && MLLM_TRACY_ENABLE == 1 TracyFree(((void**)ptr)[-1]); #endif + free(((void**)ptr)[-1]); } bool CPUAllocator::alloc(Storage* storage) { diff --git a/mllm/tracy_perf/Tracy.cpp b/mllm/tracy_perf/Tracy.cpp index f37e100b9..0f86ce06d 100644 --- a/mllm/tracy_perf/Tracy.cpp +++ b/mllm/tracy_perf/Tracy.cpp @@ -2,6 +2,5 @@ // Licensed under the MIT License. #ifdef MLLM_TRACY_ENABLE -#include #include "mllm/tracy_perf/Tracy.hpp" #endif