diff --git a/ddprof-lib/src/main/cpp/os_dd.h b/ddprof-lib/src/main/cpp/os_dd.h index 79c2c0009..3abfd7604 100644 --- a/ddprof-lib/src/main/cpp/os_dd.h +++ b/ddprof-lib/src/main/cpp/os_dd.h @@ -24,9 +24,7 @@ class OS : public ::OS { static void uninstallTlsPrimeSignalHandler(int signal_num); static void enumerateThreadIds(const std::function& callback); static void signalThread(int tid, int signum); - static bool startThreadDirectoryWatcher(const std::function& on_new_thread, const std::function& on_dead_thread); static int getThreadCount(); - static void stopThreadDirectoryWatcher(); }; } #endif // _OS_DD_H diff --git a/ddprof-lib/src/main/cpp/os_linux_dd.cpp b/ddprof-lib/src/main/cpp/os_linux_dd.cpp index b3887001c..9d7906c3b 100644 --- a/ddprof-lib/src/main/cpp/os_linux_dd.cpp +++ b/ddprof-lib/src/main/cpp/os_linux_dd.cpp @@ -6,14 +6,11 @@ #include #include #include -#include #include #include -#include -#include #include +#include #include -#include #ifndef __musl__ #include @@ -25,43 +22,6 @@ #define MMAP_SYSCALL __NR_mmap2 #endif -// Thread directory watcher state -static std::atomic g_watcher_running{false}; -static std::atomic g_watcher_fd{-1}; -static pthread_t g_watcher_thread; -static std::atomic g_watcher_thread_created{false}; -static std::function g_on_new_thread; -static std::function g_on_dead_thread; - -static void* threadDirectoryWatcherLoop(void* arg); - -// Fork handler to reset watcher state in child process -static void resetWatcherStateInChild() { - // After fork(), child process doesn't have the watcher thread - // Reset all state to prevent deadlock when child tries to cleanup - g_watcher_running.store(false); - g_watcher_thread_created.store(false); - - // Close the inherited fd in child to prevent issues - int fd = g_watcher_fd.exchange(-1); - if (fd >= 0) { - close(fd); - } - - // Clear callback functions to prevent accidental invocation - g_on_new_thread = nullptr; - g_on_dead_thread = nullptr; -} - -// Register fork handler on first use 
-static void ensureForkHandlerRegistered() { - static bool registered = false; - if (!registered) { - pthread_atfork(nullptr, nullptr, resetWatcherStateInChild); - registered = true; - } -} - int ddprof::OS::truncateFile(int fd) { int rslt = ftruncate(fd, 0); if (rslt == 0) { @@ -176,134 +136,4 @@ int ddprof::OS::getThreadCount() { return thread_count; } -bool ddprof::OS::startThreadDirectoryWatcher(const std::function& on_new_thread, const std::function& on_dead_thread) { - // Ensure fork handler is registered to prevent deadlock in child processes - ensureForkHandlerRegistered(); - - if (g_watcher_running.load()) { - return true; // Already running - } - - int inotify_fd = inotify_init1(IN_CLOEXEC | IN_NONBLOCK); - if (inotify_fd == -1) { - TEST_LOG("Failed to initialize inotify: %s", strerror(errno)); - return false; - } - - int watch_fd = inotify_add_watch(inotify_fd, "/proc/self/task", IN_CREATE | IN_DELETE | IN_MOVED_FROM | IN_MOVED_TO); - if (watch_fd == -1) { - TEST_LOG("Failed to add inotify watch on /proc/self/task: %s", strerror(errno)); - close(inotify_fd); - return false; - } - - g_on_new_thread = on_new_thread; - g_on_dead_thread = on_dead_thread; - g_watcher_fd.store(inotify_fd); - g_watcher_running.store(true); - - if (pthread_create(&g_watcher_thread, nullptr, threadDirectoryWatcherLoop, nullptr) != 0) { - TEST_LOG("Failed to create thread directory watcher thread: %s", strerror(errno)); - g_watcher_running.store(false); - g_watcher_fd.store(-1); - close(inotify_fd); - return false; - } - - g_watcher_thread_created.store(true); - TEST_LOG("Started thread directory watcher (thread will be joined on cleanup)"); - return true; -} - -void ddprof::OS::stopThreadDirectoryWatcher() { - if (!g_watcher_running.load()) { - return; - } - - TEST_LOG("Stopping thread directory watcher..."); - - // Signal the watcher thread to stop - g_watcher_running.store(false); - - // Close the inotify fd to wake up select() - int fd = g_watcher_fd.exchange(-1); - if (fd >= 
0) { - close(fd); - } - - // Wait for the watcher thread to actually terminate - if (g_watcher_thread_created.load()) { - TEST_LOG("Waiting for watcher thread to terminate..."); - void* retval; - int join_result = pthread_join(g_watcher_thread, &retval); - if (join_result != 0) { - TEST_LOG("Failed to join watcher thread: %s", strerror(join_result)); - } else { - TEST_LOG("Watcher thread terminated successfully"); - } - g_watcher_thread_created.store(false); - } - - TEST_LOG("Thread directory watcher stopped"); -} - -static void* threadDirectoryWatcherLoop(void* arg) { - const int fd = g_watcher_fd.load(); - if (fd < 0) return nullptr; - - char buffer[4096]; - fd_set readfds; - struct timeval timeout; - - while (g_watcher_running.load()) { - FD_ZERO(&readfds); - FD_SET(fd, &readfds); - timeout.tv_sec = 1; - timeout.tv_usec = 0; - - int ret = select(fd + 1, &readfds, nullptr, nullptr, &timeout); - if (ret < 0) { - if (errno != EINTR) { - TEST_LOG("Thread directory watcher select failed: %s", strerror(errno)); - break; - } - continue; - } - - if (ret == 0) continue; // Timeout, check running flag - - ssize_t len = read(fd, buffer, sizeof(buffer)); - if (len <= 0) { - if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK) { - TEST_LOG("Thread directory watcher read failed: %s", strerror(errno)); - break; - } - continue; - } - - // Parse inotify events - for (ssize_t i = 0; i < len;) { - struct inotify_event *event = (struct inotify_event *)(buffer + i); - - if (event->mask & IN_Q_OVERFLOW) { - TEST_LOG("Thread directory watcher queue overflow, triggering full rescan"); - // TODO: Trigger full rescan callback - } else if (event->len > 0 && event->name[0] >= '1' && event->name[0] <= '9') { - int tid = atoi(event->name); - if (tid > 0) { - if (event->mask & (IN_CREATE | IN_MOVED_TO)) { - if (g_on_new_thread) g_on_new_thread(tid); - } else if (event->mask & (IN_DELETE | IN_MOVED_FROM)) { - if (g_on_dead_thread) g_on_dead_thread(tid); - } - } - } - - i += sizeof(struct 
inotify_event) + event->len; - } - } - - return nullptr; -} - #endif // __linux__ diff --git a/ddprof-lib/src/main/cpp/os_macos_dd.cpp b/ddprof-lib/src/main/cpp/os_macos_dd.cpp index e01918c19..2a2905a60 100644 --- a/ddprof-lib/src/main/cpp/os_macos_dd.cpp +++ b/ddprof-lib/src/main/cpp/os_macos_dd.cpp @@ -75,36 +75,28 @@ void ddprof::OS::enumerateThreadIds(const std::function& callback) { void ddprof::OS::signalThread(int tid, int signum) { // On macOS, tid is actually a mach thread port thread_t thread = static_cast(tid); - + // Convert mach thread to pthread for signaling // This is a limitation - we can't easily signal arbitrary mach threads // In practice, this is mainly used for TLS priming which is disabled on macOS TEST_LOG("Thread signaling not fully supported on macOS (thread=%d, signal=%d)", tid, signum); } -bool ddprof::OS::startThreadDirectoryWatcher(const std::function& on_new_thread, const std::function& on_dead_thread) { - return false; // Thread directory watching not supported on macOS -} - int ddprof::OS::getThreadCount() { task_t task = mach_task_self(); thread_act_array_t thread_list; mach_msg_type_number_t thread_count; - + kern_return_t kr = task_threads(task, &thread_list, &thread_count); if (kr != KERN_SUCCESS) { TEST_LOG("Failed to get thread count: %d", kr); return 0; } - + // Clean up vm_deallocate(task, (vm_address_t)thread_list, thread_count * sizeof(thread_t)); - - return static_cast(thread_count); -} -void ddprof::OS::stopThreadDirectoryWatcher() { - // No-op on macOS + return static_cast(thread_count); } #endif // __APPLE__ diff --git a/ddprof-lib/src/main/cpp/thread.cpp b/ddprof-lib/src/main/cpp/thread.cpp index 379f64d11..d93463792 100644 --- a/ddprof-lib/src/main/cpp/thread.cpp +++ b/ddprof-lib/src/main/cpp/thread.cpp @@ -108,9 +108,6 @@ ProfiledThread::initCurrentThreadWithBuffer() { static void resetTlsPrimingStateInChild() { // After fork(), reset signal number to prevent cleanup attempts g_tls_prime_signal = -1; - - // Note: 
The watcher state is reset by os_linux_dd.cpp fork handler - // This just ensures we don't try to uninstall signals or cleanup resources } // Register fork handler on first initialization @@ -140,27 +137,10 @@ void ProfiledThread::doInitExistingThreads() { TEST_LOG("Successfully installed TLS priming handler on RT signal %d", g_tls_prime_signal); - // Use a modest buffer size since we're only handling new threads via watcher - // 256 should be more than enough for concurrent new thread creation + // Use a modest buffer size for concurrent thread TLS initialization + // 256 should be more than enough for typical workloads prepareBuffer(256); - // Start thread directory watcher to prime new threads (no mass-priming of existing threads) - bool watcher_started = ddprof::OS::startThreadDirectoryWatcher( - [](int tid) { - // Prime new thread with TLS signal - ddprof::OS::signalThread(tid, g_tls_prime_signal); - }, - [](int tid) { - // No-op for dead threads - cleanup handled elsewhere - } - ); - - if (!watcher_started) { - TEST_LOG("Failed to start thread directory watcher for TLS priming"); - } else { - TEST_LOG("Started thread directory watcher for TLS priming"); - } - initialized = true; } @@ -169,10 +149,6 @@ void ProfiledThread::cleanupTlsPriming() { return; } - // Stop the thread directory watcher - ddprof::OS::stopThreadDirectoryWatcher(); - TEST_LOG("Stopped thread directory watcher"); - // Uninstall the TLS priming signal handler if (g_tls_prime_signal > 0) { ddprof::OS::uninstallTlsPrimeSignalHandler(g_tls_prime_signal); diff --git a/ddprof-lib/src/test/cpp/test_tlsPriming.cpp b/ddprof-lib/src/test/cpp/test_tlsPriming.cpp index 383c4c1ad..c8bbfa848 100644 --- a/ddprof-lib/src/test/cpp/test_tlsPriming.cpp +++ b/ddprof-lib/src/test/cpp/test_tlsPriming.cpp @@ -85,11 +85,11 @@ TEST_F(TlsPrimingTest, GetThreadCount) { TEST_F(TlsPrimingTest, SignalCurrentThread) { int signal_num = ddprof::OS::installTlsPrimeSignalHandler(testTlsSignalHandler, 6); - + #ifdef __linux__ 
if (signal_num > 0) { TEST_LOG("Signaling current thread with signal %d", signal_num); - + // Get the first thread ID from enumeration std::atomic first_tid{-1}; ddprof::OS::enumerateThreadIds([&](int tid) { @@ -97,18 +97,18 @@ TEST_F(TlsPrimingTest, SignalCurrentThread) { first_tid.store(tid); } }); - + int tid = first_tid.load(); if (tid >= 0) { ddprof::OS::signalThread(tid, signal_num); - + // Wait a bit for signal to be delivered std::this_thread::sleep_for(std::chrono::milliseconds(100)); - + EXPECT_GT(g_signal_received.load(), 0); EXPECT_GT(g_threads_primed.load(), 0); EXPECT_EQ(g_test_tls, 0x1234ABCD); - + TEST_LOG("Signal delivered successfully, TLS primed"); } else { TEST_LOG("No threads found for signaling"); @@ -124,45 +124,6 @@ TEST_F(TlsPrimingTest, SignalCurrentThread) { #endif } -TEST_F(TlsPrimingTest, ThreadDirectoryWatcher) { - std::atomic new_threads{0}; - std::atomic dead_threads{0}; - - bool started = ddprof::OS::startThreadDirectoryWatcher( - [&](int tid) { - TEST_LOG("New thread detected: %d", tid); - new_threads++; - }, - [&](int tid) { - TEST_LOG("Thread died: %d", tid); - dead_threads++; - } - ); - - if (started) { - TEST_LOG("Thread directory watcher started successfully"); - - // Create a short-lived thread to trigger the watcher - std::thread test_thread([]() { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - }); - - test_thread.join(); - - // Wait for watcher to detect changes - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - - ddprof::OS::stopThreadDirectoryWatcher(); - TEST_LOG("Thread directory watcher stopped"); - - // We might see events, but it's not guaranteed due to timing - TEST_LOG("Detected %d new threads, %d dead threads", - new_threads.load(), dead_threads.load()); - } else { - TEST_LOG("Thread directory watcher not supported on this platform"); - } -} - // Test TLS cleanup for JVMTI-allocated threads (non-buffer) TEST_F(TlsPrimingTest, JvmtiThreadCleanup) { TEST_LOG("Testing JVMTI-allocated 
thread cleanup"); diff --git a/docs/architecture/TlsPriming.md b/docs/architecture/TlsPriming.md index d69a32d3c..2a2e500db 100644 --- a/docs/architecture/TlsPriming.md +++ b/docs/architecture/TlsPriming.md @@ -4,16 +4,15 @@ The TLS (Thread-Local Storage) Priming system ensures that thread-local profiling data structures are initialized before signal handlers access them. This prevents allocation and initialization from occurring within async-signal-unsafe contexts (signal handlers), eliminating potential deadlocks and crashes. -The system uses a dual-path initialization strategy combining JVMTI callbacks for Java threads and filesystem-based monitoring for native threads, with careful deduplication to prevent double-initialization overhead. +The system uses JVMTI callbacks for Java threads to initialize thread-local storage. Native threads will be initialized through future lib patching mechanisms (the previous filesystem-based monitoring approach has been removed due to performance concerns). ## Core Design Principles 1. **Signal Handler Safety**: Never allocate or initialize TLS within signal handlers -2. **Dual-Path Coverage**: JVMTI for Java threads, filesystem watching for native threads -3. **Deduplication**: Prevent wasteful double-initialization -4. **Lock-Free Buffer Management**: Use GCC atomic builtins instead of `std::atomic` -5. **Graceful Degradation**: Handle slot exhaustion without crashing -6. **Platform Specificity**: Linux gets full priming, macOS gets simplified approach +2. **JVMTI-Based Initialization**: Java threads initialized via JVMTI callbacks +3. **Lock-Free Buffer Management**: Use GCC atomic builtins instead of `std::atomic` +4. **Graceful Degradation**: Handle slot exhaustion without crashing +5. **Platform Specificity**: TLS priming supported on Linux, simplified approach on macOS ## Problem Statement @@ -129,44 +128,31 @@ void pushFreeSlot(int slot_index) { 2. 
GCC `__atomic_*` builtins are **guaranteed lock-free** for aligned types 3. Signal handlers require strict async-signal-safety guarantees -### 3. Dual-Path Initialization +### 3. JVMTI-Based Initialization -The system uses two complementary initialization paths: +The system initializes Java threads via JVMTI callbacks: ``` ┌──────────────────────────────────────────────────────────────┐ │ Thread Lifecycle │ ├──────────────────────────────────────────────────────────────┤ │ │ -│ Java Thread Created Native Thread Created │ -│ │ │ │ -│ ├─ JVMTI ThreadStart │ │ -│ │ │ │ │ -│ │ └─ initCurrentThread() │ │ -│ │ │ │ │ -│ │ │ ┌──────┘ │ -│ │ │ │ │ -│ │ │ /proc/self/task watcher │ -│ │ │ │ │ -│ │ │ detects new thread │ -│ │ │ │ │ -│ │ │ sends RT signal │ -│ │ │ │ │ -│ │ │ simpleTlsSignalHandler() │ -│ │ │ │ │ -│ │ │ checks: VMThread::current() │ -│ │ │ │ │ -│ │ │ NULL? (native thread) │ -│ │ │ │ │ -│ │ │ initCurrentThreadWithBuffer() │ -│ │ │ │ │ -│ └──────────────┴───────────────┘ │ -│ │ │ -│ TLS Initialized │ -│ │ │ -│ ProfiledThread* set via pthread_setspecific() │ -│ │ │ -│ Signal handlers safe │ +│ Java Thread Created │ +│ │ │ +│ ├─ JVMTI ThreadStart │ +│ │ │ │ +│ │ └─ initCurrentThread() │ +│ │ │ │ +│ │ └─ TLS Initialized │ +│ │ │ +│ │ ProfiledThread* set via │ +│ │ pthread_setspecific() │ +│ │ │ +│ └─ Signal handlers safe │ +│ │ +│ Note: Native threads currently use lazy initialization │ +│ Future lib patching will address native thread │ +│ initialization before signal handlers access TLS │ │ │ └──────────────────────────────────────────────────────────────┘ ``` @@ -203,99 +189,11 @@ int installTlsPrimeSignalHandler(SigHandler handler, int signal_offset) { - Multiple available (SIGRTMIN to SIGRTMAX) - Separate from profiling signals (SIGPROF, SIGALRM) -#### Filesystem Watching with inotify - -Monitors `/proc/self/task` for new threads: - -```cpp -bool startThreadDirectoryWatcher( - const std::function& on_new_thread, - const std::function& on_dead_thread) -{ - int 
inotify_fd = inotify_init1(IN_CLOEXEC | IN_NONBLOCK); - if (inotify_fd == -1) return false; - - int watch_fd = inotify_add_watch(inotify_fd, "/proc/self/task", - IN_CREATE | IN_DELETE | - IN_MOVED_FROM | IN_MOVED_TO); - if (watch_fd == -1) { - close(inotify_fd); - return false; - } - - // Create watcher thread - pthread_create(&g_watcher_thread, nullptr, threadDirectoryWatcherLoop, nullptr); +**Note:** The filesystem-based thread monitoring with inotify has been removed due to performance concerns. Future implementations will use lib patching for native thread TLS initialization. - return true; -} -``` +### 5. Signal Handler Implementation -**Watcher Thread Loop:** - -```cpp -void* threadDirectoryWatcherLoop(void* arg) { - char buffer[4096]; - fd_set readfds; - struct timeval timeout; - - while (g_watcher_running.load()) { - FD_ZERO(&readfds); - FD_SET(fd, &readfds); - timeout.tv_sec = 1; - timeout.tv_usec = 0; - - int ret = select(fd + 1, &readfds, nullptr, nullptr, &timeout); - if (ret <= 0) continue; - - ssize_t len = read(fd, buffer, sizeof(buffer)); - - // Parse inotify events - for (ssize_t i = 0; i < len;) { - struct inotify_event *event = (struct inotify_event *)(buffer + i); - - if (event->len > 0 && event->name[0] >= '1' && event->name[0] <= '9') { - int tid = atoi(event->name); - - if (event->mask & (IN_CREATE | IN_MOVED_TO)) { - if (g_on_new_thread) g_on_new_thread(tid); - } else if (event->mask & (IN_DELETE | IN_MOVED_FROM)) { - if (g_on_dead_thread) g_on_dead_thread(tid); - } - } - - i += sizeof(struct inotify_event) + event->len; - } - } - - return nullptr; -} -``` - -**New Thread Detection Flow:** - -``` -New Native Thread Started - │ - ├─ /proc/self/task/{tid} directory created - │ - ├─ inotify fires IN_CREATE event - │ - ├─ Watcher thread parses event - │ - ├─ Extracts TID from directory name - │ - ├─ Sends RT signal to TID - │ - ├─ simpleTlsSignalHandler() executes - │ - ├─ Checks: VMThread::current() == nullptr? 
- │ - └─ Yes → initCurrentThreadWithBuffer() -``` - -### 5. Signal Handler with Deduplication - -The signal handler prevents double-initialization for Java threads: +The signal handler infrastructure remains in place for potential future use: ```cpp void simpleTlsSignalHandler(int signo) { @@ -307,34 +205,7 @@ void simpleTlsSignalHandler(int signo) { } ``` -**Deduplication Logic:** - -``` -Signal arrives on Java thread: - │ - ├─ VMThread::current() → returns JavaThread* - │ - └─ Handler does nothing (already initialized by JVMTI) - -Signal arrives on native thread: - │ - ├─ VMThread::current() → returns nullptr - │ - └─ Handler calls initCurrentThreadWithBuffer() -``` - -**Additional Safety Check:** - -```cpp -void initCurrentThreadWithBuffer() { - // Early check - if already initialized, return immediately - if (pthread_getspecific(_tls_key) != NULL) { - return; - } - - // ... claim slot and initialize ... -} -``` +**Note:** With the removal of filesystem-based thread monitoring, this signal handler is currently not actively used. It remains available for future lib patching implementations that may signal threads explicitly. ### 6. JVMTI Integration @@ -368,18 +239,10 @@ JVMTI ThreadStart fires │ │ │ └─ TLS now initialized with dedicated allocation │ - ├─ Later: filesystem watcher detects thread (Linux only) - │ - ├─ Sends RT signal to thread - │ - ├─ simpleTlsSignalHandler() fires - │ - ├─ VMThread::current() != nullptr (Java thread) - │ - └─ Handler exits without action (already initialized) + └─ Thread ready for profiling ``` -**Key Distinction: Two Separate Initialization Strategies** +**Key Characteristics:** 1. **JVMTI Path** (`initCurrentThread()`): - Used for: New Java threads created after profiler starts @@ -387,61 +250,51 @@ JVMTI ThreadStart fires - Not from buffer: Java threads get dedicated allocations - Safe context: Called from JVMTI callback (not signal handler) -2. 
**Signal Priming Path** (`initCurrentThreadWithBuffer()`): - - Used for: Native threads and existing Java threads at startup - - Allocation: Claims pre-allocated buffer slot - - Signal-safe: No malloc, just atomic slot claim - - Buffer reuse: Slots recycled when threads die - -**Why Two Strategies?** - -Java threads are managed via JVMTI callbacks (safe context), so they can use `new` operator. Native threads have no interception point, so they must use pre-allocated buffer slots claimed via async-signal-safe operations. +2. **Native Threads:** + - Currently use lazy initialization (may allocate in signal handler) + - Future lib patching will enable pre-initialization of native threads + - Buffer-based priming infrastructure remains available for future use ## Platform-Specific Behavior -### Linux (Full TLS Priming) +### Linux (TLS Priming) **Capabilities:** - ✅ RT signal handler installation - ✅ Thread enumeration via `/proc/self/task` - ✅ Per-thread signaling via `tgkill()` -- ✅ Filesystem watching with inotify - ✅ Thread count via `/proc/self/status` +- ✅ JVMTI ThreadStart for Java threads **Implementation:** ```cpp bool OS::isTlsPrimingAvailable() { - return true; // Full support on Linux + return true; // TLS priming supported on Linux } ``` +**Current Behavior:** +- Java threads: Fully initialized via JVMTI callbacks +- Native threads: Use lazy initialization (awaiting lib patching implementation) + ### macOS (Limited TLS Priming) **Limitations:** - ❌ No RT signals (SIGRTMIN/SIGRTMAX unavailable) - ❌ No `/proc` filesystem -- ❌ No inotify equivalent - ✅ JVMTI ThreadStart still works for Java threads **Implementation:** ```cpp bool OS::isTlsPrimingAvailable() { - return false; // Filesystem watching unavailable -} - -// JVMTI still initializes Java threads -void initCurrentThread() { - if (OS::isTlsPrimingAvailable()) { - initCurrentThreadWithBuffer(); // Not called on macOS - } - // Java threads still work via JVMTI + return false; // TLS priming not available 
on macOS } ``` **macOS Behavior:** - Java threads: Initialized via JVMTI (works normally) - Native threads: Lazy initialization on first signal (may allocate in handler) -- Acceptable tradeoff: macOS profiling is less critical for production +- Acceptable tradeoff: macOS profiling is primarily for development ## Performance Characteristics @@ -464,23 +317,9 @@ Total Memory: ~33 KB (negligible) - 1 pthread_setspecific call - **Total: ~100-200 CPU cycles** -**Native Thread (Signal Path):** -- Signal delivery latency: ~1-10 μs -- Handler execution: ~100-200 cycles -- **Total: ~1-10 μs per thread** - -### Watcher Thread Overhead - -``` -Idle State: -- select() with 1-second timeout -- ~0% CPU usage - -Active State (new threads): -- inotify read + parse: ~1-5 μs per event -- Signal send: ~1-5 μs per thread -- **Total: ~2-10 μs per new thread** -``` +**Native Thread:** +- Currently uses lazy initialization +- Future lib patching will provide pre-initialization with minimal overhead ## Signal Safety Guarantees @@ -548,24 +387,18 @@ Thread A (Java, JVMTI) Thread B (Native, Signal) ```cpp // 1. Profiler initialization Profiler::start() { - // 2. Initialize existing threads (if priming available) + // 2. Initialize TLS infrastructure (if priming available) ProfiledThread::initExistingThreads(); } -// 3. Install signal handler and start watcher +// 3. Install signal handler and prepare buffer void initExistingThreads() { // Install RT signal handler (Linux only) g_tls_prime_signal = OS::installTlsPrimeSignalHandler( simpleTlsSignalHandler, 4); - // Prepare buffer + // Prepare buffer for future use prepareBuffer(256); - - // Start filesystem watcher (Linux only) - OS::startThreadDirectoryWatcher( - [](int tid) { OS::signalThread(tid, g_tls_prime_signal); }, - [](int tid) { /* thread death - no-op */ } - ); } ``` @@ -573,13 +406,10 @@ void initExistingThreads() { ```cpp void cleanupTlsPriming() { - // 1. 
Stop watcher thread - OS::stopThreadDirectoryWatcher(); // Joins watcher thread - - // 2. Uninstall signal handler + // 1. Uninstall signal handler OS::uninstallTlsPrimeSignalHandler(g_tls_prime_signal); - // 3. Note: Don't clean buffer (threads may still be using it) + // 2. Note: Don't clean buffer (threads may still be using it) // Buffer cleaned up on process exit } ``` @@ -588,28 +418,22 @@ void cleanupTlsPriming() { ### Unit Tests -**Signal Handler Installation** (`test_tlsPriming.cpp:38-57`): +**Signal Handler Installation** (`test_tlsPriming.cpp`): - Verifies RT signal allocation - Checks signal number range (SIGRTMIN to SIGRTMAX) - Platform-specific expectations -**Thread Enumeration** (`test_tlsPriming.cpp:59-74`): +**Thread Enumeration** (`test_tlsPriming.cpp`): - Enumerates current threads - Validates TID values - Ensures at least 1 thread found -**Signal Delivery** (`test_tlsPriming.cpp:84-123`): +**Signal Delivery** (`test_tlsPriming.cpp`): - Installs handler - Signals thread - Verifies TLS modification - Confirms signal delivery -**Filesystem Watcher** (`test_tlsPriming.cpp:125-162`): -- Starts watcher -- Creates short-lived thread -- Detects thread creation/destruction -- Validates cleanup - ### Integration Tests **ProfiledThread Tests:** @@ -617,31 +441,38 @@ void cleanupTlsPriming() { - Slot reuse - Concurrent initialization - Thread isolation +- JVMTI vs buffer-based initialization ## Key Architectural Benefits -1. **Crash Prevention**: Eliminates malloc() in signal handlers +1. **Crash Prevention**: Eliminates malloc() in signal handlers for Java threads 2. **Deadlock Avoidance**: No locks in signal handler paths -3. **Platform Optimization**: Full support on Linux, graceful degradation on macOS +3. **Platform Optimization**: JVMTI-based initialization on both Linux and macOS 4. **Efficient Memory**: Small fixed overhead (33 KB) 5. **Scalability**: Lock-free operations scale with thread count 6. 
**Reliability**: Handles race conditions without corruption ## Future Enhancements -### Potential Improvements +### Planned Improvements -1. **Dynamic Buffer Sizing**: Grow buffer if 256 slots exhausted -2. **macOS Native Support**: Explore kqueue for thread monitoring +1. **Lib Patching for Native Threads**: Replace filesystem monitoring with library patching to pre-initialize native threads +2. **Dynamic Buffer Sizing**: Grow buffer if 256 slots exhausted 3. **Metrics**: Track slot utilization and initialization latency -4. **Proactive Priming**: Prime threads during profiler start -5. **Buffer Compaction**: Defragment free slots periodically +4. **Buffer Compaction**: Defragment free slots periodically ### Known Limitations -1. **Fixed Buffer Size**: 256 slots may be insufficient for extreme workloads -2. **macOS Gap**: Native threads not pre-initialized -3. **Watcher Latency**: ~1-10 μs delay between thread start and priming -4. **Signal Exhaustion**: RT signals limited (typically 32 available) +1. **Native Thread Gap**: Native threads currently use lazy initialization (awaiting lib patching) +2. **Fixed Buffer Size**: 256 slots may be insufficient for extreme workloads (unlikely for native threads) +3. **macOS Gap**: Native threads not pre-initialized +4. **Signal Exhaustion**: RT signals limited (typically 32 available, unlikely to happen) + +### Recent Changes + +**Removed Filesystem Monitoring (2025):** +- Filesystem-based thread monitoring via inotify has been removed due to performance concerns +- The thread directory watcher caused overhead in production environments +- Future implementations will use lib patching instead for native thread initialization -This architecture provides a robust, platform-aware solution to the TLS initialization problem, ensuring signal handlers can safely access thread-local data without risk of deadlock or crash. 
+This architecture provides a robust, platform-aware solution to the TLS initialization problem for Java threads, ensuring that signal handlers running on those threads can safely access thread-local data without risk of deadlock or crash. Native threads still rely on lazy initialization, which may allocate inside a signal handler; pre-initializing them is planned through future lib patching implementations.