From 01ab39c3e0a829e1badc485b162ae4004db7c7d1 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Tue, 3 Mar 2026 13:48:20 -0500 Subject: [PATCH 1/3] emit best effort mac stack and dont skip tests --- bin_tests/src/artifacts.rs | 5 - bin_tests/tests/crashtracker_bin_test.rs | 15 +-- libdd-crashtracker/src/collector/emitters.rs | 115 +++++++++++++++++- .../src/receiver/entry_points.rs | 11 +- 4 files changed, 125 insertions(+), 21 deletions(-) diff --git a/bin_tests/src/artifacts.rs b/bin_tests/src/artifacts.rs index 3f5341e4d9..36fc5a3bb8 100644 --- a/bin_tests/src/artifacts.rs +++ b/bin_tests/src/artifacts.rs @@ -29,7 +29,6 @@ pub fn crashtracker_bin_test(profile: BuildProfile, panic_abort: bool) -> Artifa } /// Creates an ArtifactsBuild for the crashing_test_app binary. -#[cfg(not(target_os = "macos"))] pub fn crashing_app(profile: BuildProfile, panic_abort: bool) -> ArtifactsBuild { ArtifactsBuild { name: "crashing_test_app".to_owned(), @@ -90,15 +89,11 @@ pub fn all_prebuild_artifacts() -> Vec { artifacts.push(crashtracker_receiver(profile)); artifacts.push(test_the_tests(profile)); artifacts.push(profiling_ffi(profile)); - - #[cfg(not(target_os = "macos"))] artifacts.push(crashing_app(profile, false)); } // Panic abort variants (used by panic hook tests) artifacts.push(crashtracker_bin_test(BuildProfile::Debug, true)); - - #[cfg(not(target_os = "macos"))] artifacts.push(crashing_app(BuildProfile::Debug, true)); artifacts diff --git a/bin_tests/tests/crashtracker_bin_test.rs b/bin_tests/tests/crashtracker_bin_test.rs index 20c36e2c2c..2e8f2d2e98 100644 --- a/bin_tests/tests/crashtracker_bin_test.rs +++ b/bin_tests/tests/crashtracker_bin_test.rs @@ -423,19 +423,16 @@ fn test_crash_tracking_errors_intake_uds_socket() { /// DD_CRASHTRACK_END_STACKTRACE. Error: Can't set non-existant stack complete\n") #[test] #[cfg_attr(miri, ignore)] -#[cfg(not(target_os = "macos"))] fn test_crash_tracking_bin_panic() { test_crash_tracking_app("panic"); } #[test] -#[cfg(not(target_os = "macos"))] #[cfg_attr(miri, ignore)] fn test_crash_tracking_bin_segfault() { test_crash_tracking_app("segfault"); } -#[cfg(not(target_os = "macos"))] fn test_crash_tracking_app(crash_type: &str) { use bin_tests::test_runner::run_custom_crash_test; @@ -484,7 +481,6 @@ fn test_crash_tracking_app(crash_type: &str) { #[test] #[cfg_attr(miri, ignore)] -#[cfg(not(target_os = "macos"))] // Same restriction as other panic tests fn test_crash_tracking_bin_panic_hook_after_fork() { test_panic_hook_mode( "panic_hook_after_fork", @@ -495,14 +491,12 @@ fn test_crash_tracking_bin_panic_hook_after_fork() { #[test] #[cfg_attr(miri, ignore)] -#[cfg(not(target_os = "macos"))] // Same restriction as other panic tests fn test_crash_tracking_bin_panic_hook_string() { test_panic_hook_mode("panic_hook_string", "message", Some("Panic with value: 42")); } #[test] #[cfg_attr(miri, ignore)] -#[cfg(not(target_os = "macos"))] // Same restriction as other panic tests fn test_crash_tracking_bin_panic_hook_unknown_type() { test_panic_hook_mode( "panic_hook_unknown_type", @@ -513,7 +507,6 @@ fn test_crash_tracking_bin_panic_hook_unknown_type() { /// Helper function to run panic hook tests with different payload types. /// Note: Since tests are built with Debug profile, location is always expected. -#[cfg(not(target_os = "macos"))] fn test_panic_hook_mode(mode: &str, expected_category: &str, expected_panic_message: Option<&str>) { use bin_tests::test_runner::run_custom_crash_test; @@ -580,20 +573,20 @@ fn test_panic_hook_mode(mode: &str, expected_category: &str, expected_panic_mess // ==================================================================================== // These tests use `run_custom_crash_test` with the crashing_test_app artifact. -// This test is disabled for now on x86_64 musl and macos +// This test is disabled for now on x86_64 musl // It seems that on aarch64 musl, libc has CFI which allows // unwinding passed the signal frame. // Don't forget to update the ignore condition for this and also // `test_crash_tracking_callstack` when this is revisited. #[test] -#[cfg(not(any(all(target_arch = "x86_64", target_env = "musl"), target_os = "macos")))] +#[cfg(not(any(all(target_arch = "x86_64", target_env = "musl"))))] #[cfg_attr(miri, ignore)] fn test_crasht_tracking_validate_callstack() { test_crash_tracking_callstack() } -// This test is disabled for now on x86_64 musl and macos for the reason mentioned above. -#[cfg(not(any(all(target_arch = "x86_64", target_env = "musl"), target_os = "macos")))] +// This test is disabled for now on x86_64 musl for the reason mentioned above. +#[cfg(not(any(all(target_arch = "x86_64", target_env = "musl"))))] fn test_crash_tracking_callstack() { use bin_tests::test_runner::run_custom_crash_test; diff --git a/libdd-crashtracker/src/collector/emitters.rs b/libdd-crashtracker/src/collector/emitters.rs index 9b4b09b85e..25c32f6ace 100644 --- a/libdd-crashtracker/src/collector/emitters.rs +++ b/libdd-crashtracker/src/collector/emitters.rs @@ -55,11 +55,38 @@ unsafe fn emit_backtrace_by_frames( w: &mut impl Write, resolve_frames: StacktraceCollection, fault_ip: usize, + ucontext: *const ucontext_t, ) -> Result<(), EmitterError> { // https://docs.rs/backtrace/latest/backtrace/index.html writeln!(w, "{DD_CRASHTRACK_BEGIN_STACKTRACE}")?; - // Absolute addresses appear to be safer to collect during a crash than debug info. + // On macOS, backtrace::trace_unsynchronized fails in forked children because + // macOS restricts many APIs after fork-without-exec. Walk the frame pointer + // chain directly from the saved ucontext registers instead. The parent's + // stack memory is still readable in the forked child + #[cfg(target_os = "macos")] + { + let _ = (resolve_frames, fault_ip); + emit_backtrace_from_ucontext(w, ucontext)?; + } + + #[cfg(not(target_os = "macos"))] + { + let _ = ucontext; + emit_backtrace_via_library(w, resolve_frames, fault_ip)?; + } + + writeln!(w, "{DD_CRASHTRACK_END_STACKTRACE}")?; + w.flush()?; + Ok(()) +} + +#[allow(dead_code)] // used from tests on macOS, from emit_backtrace_by_frames on other platforms +unsafe fn emit_backtrace_via_library( + w: &mut impl Write, + resolve_frames: StacktraceCollection, + fault_ip: usize, +) -> Result<(), EmitterError> { fn emit_absolute_addresses(w: &mut impl Write, frame: &Frame) -> Result<(), EmitterError> { write!(w, "\"ip\": \"{:?}\"", frame.ip())?; if let Some(module_base_address) = frame.module_base_address() { @@ -132,7 +159,83 @@ unsafe fn emit_backtrace_by_frames( // emit anything at all, if the crashing frame is not found for some reason ip_found = true; } - writeln!(w, "{DD_CRASHTRACK_END_STACKTRACE}")?; + Ok(()) +} + +/// Walk the frame pointer chain from the ucontext's saved registers. +/// +/// After fork(), the child process has a copy-on-write view of the parent's +/// stack memory, so the frame pointer chain from the crashed thread is still +/// readable. This avoids depending on `backtrace::trace_unsynchronized` which +/// uses macOS APIs that don't work in forked-but-not-exec'd children. +/// +/// For each IP we call `dladdr` to resolve the symbol name, symbol address, +/// and containing shared-object path. `dladdr` is safe here because it only +/// reads dyld's internal data structures (no allocation, no Mach IPC). +#[cfg(target_os = "macos")] +unsafe fn emit_backtrace_from_ucontext( + w: &mut impl Write, + ucontext: *const ucontext_t, +) -> Result<(), EmitterError> { + if ucontext.is_null() { + return Ok(()); + } + let mcontext = (*ucontext).uc_mcontext; + if mcontext.is_null() { + return Ok(()); + } + + let ss = &(*mcontext).__ss; + #[cfg(target_arch = "aarch64")] + let (pc, mut fp) = (ss.__pc as usize, ss.__fp as usize); + #[cfg(target_arch = "x86_64")] + let (pc, mut fp) = (ss.__rip as usize, ss.__rbp as usize); + + emit_frame_with_dladdr(w, pc)?; + + loop { + if fp == 0 || fp % std::mem::align_of::() != 0 { + break; + } + let next_fp = *(fp as *const usize); + let return_addr = *((fp + std::mem::size_of::()) as *const usize); + if return_addr == 0 { + break; + } + emit_frame_with_dladdr(w, return_addr)?; + if next_fp <= fp { + break; + } + fp = next_fp; + } + + Ok(()) +} + +/// Emit a single stack frame, enriched with `dladdr` symbol information. +#[cfg(target_os = "macos")] +unsafe fn emit_frame_with_dladdr(w: &mut impl Write, ip: usize) -> Result<(), EmitterError> { + let mut info: libc::Dl_info = std::mem::zeroed(); + let resolved = libc::dladdr(ip as *const libc::c_void, &mut info) != 0; + + write!(w, "{{\"ip\": \"0x{ip:x}\"")?; + + if resolved { + if !info.dli_fbase.is_null() { + write!(w, ", \"module_base_address\": \"{:?}\"", info.dli_fbase)?; + } + if !info.dli_saddr.is_null() { + write!(w, ", \"symbol_address\": \"{:?}\"", info.dli_saddr)?; + } + if !info.dli_sname.is_null() { + let name = std::ffi::CStr::from_ptr(info.dli_sname); + if let Ok(s) = name.to_str() { + write!(w, ", \"function\": \"{s}\"")?; + } + } + } + + writeln!(w, "}}")?; w.flush()?; Ok(()) } @@ -213,7 +316,9 @@ pub(crate) fn emit_crashreport( // https://doc.rust-lang.org/src/std/backtrace.rs.html#332 // We do this last, so even if it crashes, we still get the other info. let fault_ip = extract_ip(ucontext); - unsafe { emit_backtrace_by_frames(pipe, config.resolve_frames(), fault_ip)? }; + unsafe { + emit_backtrace_by_frames(pipe, config.resolve_frames(), fault_ip, ucontext)? + }; } if is_runtime_callback_registered() { emit_runtime_stack(pipe)?; @@ -501,9 +606,11 @@ mod tests { }) }; let mut buf = Vec::new(); + writeln!(buf, "{DD_CRASHTRACK_BEGIN_STACKTRACE}").unwrap(); unsafe { - emit_backtrace_by_frames(&mut buf, collection, ip_of_test_fn).expect("to work ;-)"); + emit_backtrace_via_library(&mut buf, collection, ip_of_test_fn).expect("to work ;-)"); } + writeln!(buf, "{DD_CRASHTRACK_END_STACKTRACE}").unwrap(); buf } diff --git a/libdd-crashtracker/src/receiver/entry_points.rs b/libdd-crashtracker/src/receiver/entry_points.rs index 5ed5f27e44..532c580cb4 100644 --- a/libdd-crashtracker/src/receiver/entry_points.rs +++ b/libdd-crashtracker/src/receiver/entry_points.rs @@ -2,7 +2,10 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ use super::receive_report::receive_report_from_stream; -use crate::{crash_info::CrashInfo, CrashtrackerConfiguration, StacktraceCollection}; +use crate::crash_info::CrashInfo; +use crate::CrashtrackerConfiguration; +#[cfg(target_os = "linux")] +use crate::StacktraceCollection; use anyhow::Context; use std::time::Duration; use tokio::{ @@ -133,6 +136,10 @@ fn resolve_frames( config: &CrashtrackerConfiguration, crash_info: &mut CrashInfo, ) -> anyhow::Result<()> { + // enrich_callstacks uses blazesym's normalize_user_addrs (reads /proc//maps) + // and assumes ELF binaries. Both are Linux-specific; macOS has no procfs and + // uses Mach-O binaries. + #[cfg(target_os = "linux")] if config.resolve_frames() == StacktraceCollection::EnabledWithSymbolsInReceiver { let pid = crash_info .proc_info @@ -141,5 +148,7 @@ fn resolve_frames( .pid; crash_info.enrich_callstacks(pid)?; } + #[cfg(not(target_os = "linux"))] + let _ = (config, crash_info); Ok(()) } From 4dc6057b788eec7cc930da4eed245873021faefe Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Wed, 4 Mar 2026 10:01:22 -0500 Subject: [PATCH 2/3] Add frame bound and stack checking when traversing --- libdd-crashtracker/src/collector/emitters.rs | 41 +++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/libdd-crashtracker/src/collector/emitters.rs b/libdd-crashtracker/src/collector/emitters.rs index 25c32f6ace..aaee832f8a 100644 --- a/libdd-crashtracker/src/collector/emitters.rs +++ b/libdd-crashtracker/src/collector/emitters.rs @@ -185,6 +185,39 @@ unsafe fn emit_backtrace_from_ucontext( return Ok(()); } + // Get the thread's stack bounds so we only deref frame pointers + // that lie within known stack memory. Both pthread_get_stackaddr_np and + // pthread_get_stacksize_np are async-signal-safe on macOS + let thread = libc::pthread_self(); + let stack_top = libc::pthread_get_stackaddr_np(thread) as usize; + let stack_size = libc::pthread_get_stacksize_np(thread); + let stack_bottom = stack_top.saturating_sub(stack_size); + + // Also account for an alt stack, since the + // signal handler may have been invoked on one + let mut alt_stack: libc::stack_t = std::mem::zeroed(); + let has_alt_stack = libc::sigaltstack(std::ptr::null(), &mut alt_stack) == 0 + && (alt_stack.ss_flags & libc::SS_DISABLE) == 0 + && alt_stack.ss_size > 0; + let alt_bottom = if has_alt_stack { + alt_stack.ss_sp as usize + } else { + 0 + }; + let alt_top = if has_alt_stack { + alt_bottom + alt_stack.ss_size + } else { + 0 + }; + + // Returns true when the range [addr, addr+len) falls within + // either the thread stack or the alt stack + let in_stack_bounds = |addr: usize, len: usize| -> bool { + let end = addr.saturating_add(len); + (addr >= stack_bottom && end <= stack_top) + || (has_alt_stack && addr >= alt_bottom && end <= alt_top) + }; + let ss = &(*mcontext).__ss; #[cfg(target_arch = "aarch64")] let (pc, mut fp) = (ss.__pc as usize, ss.__fp as usize); @@ -193,10 +226,16 @@ unsafe fn emit_backtrace_from_ucontext( emit_frame_with_dladdr(w, pc)?; - loop { + const MAX_FRAMES: usize = 512; + for _ in 0..MAX_FRAMES { if fp == 0 || fp % std::mem::align_of::() != 0 { break; } + // Each frame record is two pointer-sized words: [saved_fp, return_addr] + // Bail out if the record falls outside known stack regions + if !in_stack_bounds(fp, 2 * std::mem::size_of::()) { + break; + } let next_fp = *(fp as *const usize); let return_addr = *((fp + std::mem::size_of::()) as *const usize); if return_addr == 0 { From c78d98d527a732bc4e6637965805fdc2d7f8f210 Mon Sep 17 00:00:00 2001 From: Gyuheon Oh Date: Wed, 4 Mar 2026 10:22:26 -0500 Subject: [PATCH 3/3] Don't check alt stack since unwind from ucontext --- libdd-crashtracker/src/collector/emitters.rs | 25 +++----------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/libdd-crashtracker/src/collector/emitters.rs b/libdd-crashtracker/src/collector/emitters.rs index aaee832f8a..379edfeb74 100644 --- a/libdd-crashtracker/src/collector/emitters.rs +++ b/libdd-crashtracker/src/collector/emitters.rs @@ -193,29 +193,10 @@ unsafe fn emit_backtrace_from_ucontext( let stack_size = libc::pthread_get_stacksize_np(thread); let stack_bottom = stack_top.saturating_sub(stack_size); - // Also account for an alt stack, since the - // signal handler may have been invoked on one - let mut alt_stack: libc::stack_t = std::mem::zeroed(); - let has_alt_stack = libc::sigaltstack(std::ptr::null(), &mut alt_stack) == 0 - && (alt_stack.ss_flags & libc::SS_DISABLE) == 0 - && alt_stack.ss_size > 0; - let alt_bottom = if has_alt_stack { - alt_stack.ss_sp as usize - } else { - 0 - }; - let alt_top = if has_alt_stack { - alt_bottom + alt_stack.ss_size - } else { - 0 - }; - - // Returns true when the range [addr, addr+len) falls within - // either the thread stack or the alt stack + // Returns true when the range [addr, addr+len) falls within the thread stack let in_stack_bounds = |addr: usize, len: usize| -> bool { let end = addr.saturating_add(len); - (addr >= stack_bottom && end <= stack_top) - || (has_alt_stack && addr >= alt_bottom && end <= alt_top) + addr >= stack_bottom && end <= stack_top }; let ss = &(*mcontext).__ss; @@ -232,7 +213,7 @@ unsafe fn emit_backtrace_from_ucontext( break; } // Each frame record is two pointer-sized words: [saved_fp, return_addr] - // Bail out if the record falls outside known stack regions + // Bail out if the record falls outside the thread stack if !in_stack_bounds(fp, 2 * std::mem::size_of::()) { break; }