From d8904a71f41ab9277f9058ab2669a7fe2968ec0b Mon Sep 17 00:00:00 2001 From: Attila Szegedi Date: Wed, 19 Mar 2025 16:27:45 +0100 Subject: [PATCH 1/2] Only collect async ID when it's safe to do so (#197) * Guard against collecting async ID while performing GC * Additional sanity checks: - don't sample a dead isolate - only try to retrieve an async ID when there's a context * Move GetAsyncId and GC callbacks out of the header file. Use relaxed load/store with appropriate signal fences for cheaper reading/writing of gcCount. * Have a config option for collecting async IDs --- bindings/profilers/wall.cc | 151 ++++++++++++++++++++++------------ bindings/profilers/wall.hh | 9 ++ ts/src/time-profiler.ts | 2 + ts/test/test-time-profiler.ts | 55 +++++++++++-- 4 files changed, 159 insertions(+), 58 deletions(-) diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index b5e90b09..fe805bfb 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -291,6 +291,9 @@ void SignalHandler::HandleProfilerSignal(int sig, return; } auto isolate = Isolate::GetCurrent(); + if (!isolate || isolate->IsDead()) { + return; + } WallProfiler* prof = g_profilers.GetProfiler(isolate); if (!prof) { @@ -315,9 +318,7 @@ void SignalHandler::HandleProfilerSignal(int sig, auto time_from = Now(); old_handler(sig, info, context); auto time_to = Now(); - int64_t async_id = -1; - // don't capture for now until we work out the issues with GC and thread start - // static_cast(node::AsyncHooksGetExecutionAsyncId(isolate)); + auto async_id = prof->GetAsyncId(isolate); prof->PushContext(time_from, time_to, cpu_time, async_id); } #else @@ -473,11 +474,13 @@ std::shared_ptr WallProfiler::GetContextsByNode( sampleContext.context.get()->Get(isolate)) .Check(); } - timedContext - ->Set(v8Context, - asyncIdKey, - NewNumberFromInt64(isolate, sampleContext.async_id)) - .Check(); + if (collectAsyncId_) { + timedContext + ->Set(v8Context, + asyncIdKey, + NewNumberFromInt64(isolate, sampleContext.async_id)) + .Check(); + } } } array->Set(v8Context, array->Length(), timedContext).Check(); @@ -492,12 +495,27 @@ std::shared_ptr WallProfiler::GetContextsByNode( return contextsByNode; } +void GCPrologueCallback(Isolate* isolate, + GCType type, + GCCallbackFlags flags, + void* data) { + static_cast(data)->OnGCStart(isolate); +} + +void GCEpilogueCallback(Isolate* isolate, + GCType type, + GCCallbackFlags flags, + void* data) { + static_cast(data)->OnGCEnd(); +} + WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, std::chrono::microseconds duration, bool includeLines, bool withContexts, bool workaroundV8Bug, bool collectCpuTime, + bool collectAsyncId, bool isMainThread) : samplingPeriod_(samplingPeriod), includeLines_(includeLines), @@ -509,6 +527,7 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, // event just after triggers the issue. workaroundV8Bug_ = workaroundV8Bug && DD_WALL_USE_SIGPROF && detectV8Bug_; collectCpuTime_ = collectCpuTime && withContexts; + collectAsyncId_ = collectAsyncId && withContexts; if (withContexts_) { contexts_.reserve(duration * 2 / samplingPeriod); @@ -516,7 +535,9 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, curContext_.store(&context1_, std::memory_order_relaxed); collectionMode_.store(CollectionMode::kNoCollect, std::memory_order_relaxed); + gcCount.store(0, std::memory_order_relaxed); + // TODO: bind to this isolate? Would fix the Dispose(nullptr) issue. auto isolate = v8::Isolate::GetCurrent(); v8::Local buffer = v8::ArrayBuffer::New(isolate, sizeof(uint32_t) * kFieldCount); @@ -526,6 +547,11 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, fields_ = static_cast(buffer->GetBackingStore()->Data()); jsArray_ = v8::Global(isolate, jsArray); std::fill(fields_, fields_ + kFieldCount, 0); + + if (collectAsyncId_) { + isolate->AddGCPrologueCallback(&GCPrologueCallback, this); + isolate->AddGCEpilogueCallback(&GCEpilogueCallback, this); + } } WallProfiler::~WallProfiler() { @@ -538,9 +564,22 @@ void WallProfiler::Dispose(Isolate* isolate) { cpuProfiler_ = nullptr; g_profilers.RemoveProfiler(isolate, this); + + if (isolate != nullptr && collectAsyncId_) { + isolate->RemoveGCPrologueCallback(&GCPrologueCallback, this); + isolate->RemoveGCEpilogueCallback(&GCEpilogueCallback, this); + } } } +#define DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(name) \ + auto name##Value = \ + Nan::Get(arg, Nan::New(#name).ToLocalChecked()); \ + if (name##Value.IsEmpty() || !name##Value.ToLocalChecked()->IsBoolean()) { \ + return Nan::ThrowTypeError(#name " must be a boolean."); \ + } \ + bool name = name##Value.ToLocalChecked().As()->Value(); + NAN_METHOD(WallProfiler::New) { if (info.Length() != 1 || !info[0]->IsObject()) { return Nan::ThrowTypeError("WallProfiler must have one object argument."); @@ -579,50 +618,12 @@ NAN_METHOD(WallProfiler::New) { return Nan::ThrowTypeError("Duration must not be less than sample rate."); } - auto lineNumbersValue = - Nan::Get(arg, Nan::New("lineNumbers").ToLocalChecked()); - if (lineNumbersValue.IsEmpty() || - !lineNumbersValue.ToLocalChecked()->IsBoolean()) { - return Nan::ThrowTypeError("lineNumbers must be a boolean."); - } - bool lineNumbers = - lineNumbersValue.ToLocalChecked().As()->Value(); - - auto withContextsValue = - Nan::Get(arg, Nan::New("withContexts").ToLocalChecked()); - if (withContextsValue.IsEmpty() || - !withContextsValue.ToLocalChecked()->IsBoolean()) { - return Nan::ThrowTypeError("withContext must be a boolean."); - } - bool withContexts = - withContextsValue.ToLocalChecked().As()->Value(); - - auto workaroundV8BugValue = - Nan::Get(arg, Nan::New("workaroundV8Bug").ToLocalChecked()); - if (workaroundV8BugValue.IsEmpty() || - !workaroundV8BugValue.ToLocalChecked()->IsBoolean()) { - return Nan::ThrowTypeError("workaroundV8Bug must be a boolean."); - } - bool workaroundV8Bug = - workaroundV8BugValue.ToLocalChecked().As()->Value(); - - auto collectCpuTimeValue = - Nan::Get(arg, Nan::New("collectCpuTime").ToLocalChecked()); - if (collectCpuTimeValue.IsEmpty() || - !collectCpuTimeValue.ToLocalChecked()->IsBoolean()) { - return Nan::ThrowTypeError("collectCpuTime must be a boolean."); - } - bool collectCpuTime = - collectCpuTimeValue.ToLocalChecked().As()->Value(); - - auto isMainThreadValue = - Nan::Get(arg, Nan::New("isMainThread").ToLocalChecked()); - if (isMainThreadValue.IsEmpty() || - !isMainThreadValue.ToLocalChecked()->IsBoolean()) { - return Nan::ThrowTypeError("isMainThread must be a boolean."); - } - bool isMainThread = - isMainThreadValue.ToLocalChecked().As()->Value(); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(lineNumbers); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(withContexts); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(workaroundV8Bug); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(collectCpuTime); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(collectAsyncId); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(isMainThread); if (withContexts && !DD_WALL_USE_SIGPROF) { return Nan::ThrowTypeError("Contexts are not supported."); @@ -632,6 +633,10 @@ NAN_METHOD(WallProfiler::New) { return Nan::ThrowTypeError("Cpu time collection requires contexts."); } + if (collectAsyncId && !withContexts) { + return Nan::ThrowTypeError("Async ID collection requires contexts."); + } + if (lineNumbers && withContexts) { // Currently custom contexts are not compatible with caller line // information, because it's not possible to associate context with line @@ -657,6 +662,7 @@ NAN_METHOD(WallProfiler::New) { withContexts, workaroundV8Bug, collectCpuTime, + collectAsyncId, isMainThread); obj->Wrap(info.This()); info.GetReturnValue().Set(info.This()); @@ -668,6 +674,8 @@ NAN_METHOD(WallProfiler::New) { } } +#undef DD_WALL_PROFILER_GET_BOOLEAN_CONFIG + NAN_METHOD(WallProfiler::Start) { WallProfiler* wallProfiler = Nan::ObjectWrap::Unwrap(info.Holder()); @@ -1019,6 +1027,45 @@ NAN_METHOD(WallProfiler::Dispose) { delete profiler; } +int64_t GetAsyncIdNoGC(v8::Isolate* isolate) { + return isolate->InContext() + ? static_cast( + node::AsyncHooksGetExecutionAsyncId(isolate)) + : -1; +} + +int64_t WallProfiler::GetAsyncId(v8::Isolate* isolate) { + if (!collectAsyncId_) { + return -1; + } + auto curGcCount = gcCount.load(std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_acquire); + if (curGcCount > 0) { + return gcAsyncId; + } + return GetAsyncIdNoGC(isolate); +} + +void WallProfiler::OnGCStart(v8::Isolate* isolate) { + auto curCount = gcCount.load(std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_acquire); + if (curCount == 0) { + gcAsyncId = GetAsyncIdNoGC(isolate); + } + gcCount.store(curCount + 1, std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_release); +} + +void WallProfiler::OnGCEnd() { + auto newCount = gcCount.load(std::memory_order_relaxed) - 1; + std::atomic_signal_fence(std::memory_order_acquire); + gcCount.store(newCount, std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_release); + if (newCount == 0) { + gcAsyncId = -1; + } +} + void WallProfiler::PushContext(int64_t time_from, int64_t time_to, int64_t cpu_time, diff --git a/bindings/profilers/wall.hh b/bindings/profilers/wall.hh index 0f1f3322..7b85c80d 100644 --- a/bindings/profilers/wall.hh +++ b/bindings/profilers/wall.hh @@ -58,6 +58,9 @@ class WallProfiler : public Nan::ObjectWrap { ContextPtr context2_; std::atomic curContext_; + std::atomic gcCount = 0; + int64_t gcAsyncId; + std::atomic collectionMode_; std::atomic noCollectCallCount_; std::string profileId_; @@ -68,6 +71,7 @@ class WallProfiler : public Nan::ObjectWrap { bool workaroundV8Bug_; static inline constexpr bool detectV8Bug_ = true; bool collectCpuTime_; + bool collectAsyncId_; bool isMainThread_; int v8ProfilerStuckEventLoopDetected_ = 0; ProcessCpuClock::time_point startProcessCpuTime_{}; @@ -117,6 +121,7 @@ class WallProfiler : public Nan::ObjectWrap { bool withContexts, bool workaroundV8bug, bool collectCpuTime, + bool collectAsyncId, bool isMainThread); v8::Local GetContext(v8::Isolate*); @@ -149,6 +154,10 @@ class WallProfiler : public Nan::ObjectWrap { return threadCpuStopWatch_.GetAndReset(); } + int64_t GetAsyncId(v8::Isolate* isolate); + void OnGCStart(v8::Isolate* isolate); + void OnGCEnd(); + static NAN_METHOD(New) GENERAL_REGS_ONLY; static NAN_METHOD(Start); static NAN_METHOD(Stop); diff --git a/ts/src/time-profiler.ts b/ts/src/time-profiler.ts index 5d1defd1..b9cd299d 100644 --- a/ts/src/time-profiler.ts +++ b/ts/src/time-profiler.ts @@ -66,6 +66,7 @@ export interface TimeProfilerOptions { withContexts?: boolean; workaroundV8Bug?: boolean; collectCpuTime?: boolean; + collectAsyncId?: boolean; } const DEFAULT_OPTIONS: TimeProfilerOptions = { @@ -75,6 +76,7 @@ const DEFAULT_OPTIONS: TimeProfilerOptions = { withContexts: false, workaroundV8Bug: true, collectCpuTime: false, + collectAsyncId: false, }; export async function profile(options: TimeProfilerOptions = {}) { diff --git a/ts/test/test-time-profiler.ts b/ts/test/test-time-profiler.ts index 720f520c..2434c813 100644 --- a/ts/test/test-time-profiler.ts +++ b/ts/test/test-time-profiler.ts @@ -39,7 +39,7 @@ describe('Time Profiler', () => { assert.equal(profile.stringTable.strings!.indexOf('(program)'), -1); }); - it('should update state', function () { + it('should update state', function shouldUpdateState() { if (process.platform !== 'darwin' && process.platform !== 'linux') { this.skip(); } @@ -90,7 +90,7 @@ describe('Time Profiler', () => { assert(checked, 'No context found'); }); - it('should assign labels', function () { + it('should have labels', function shouldHaveLabels() { if (process.platform !== 'darwin' && process.platform !== 'linux') { this.skip(); } @@ -129,10 +129,8 @@ describe('Time Profiler', () => { if (!context) { return {}; } - assert( - typeof context.asyncId === 'number', - 'context.asyncId should be a number' - ); + // Does not collect async IDs by default + assert(typeof context.asyncId === 'undefined'); const labels: LabelSet = {}; for (const [key, value] of Object.entries(context.context ?? {})) { if (typeof value === 'string') { @@ -356,6 +354,51 @@ describe('Time Profiler', () => { }); }); + it('should have async IDs when enabled', async function shouldCollectAsyncIDs() { + if (process.platform !== 'darwin' && process.platform !== 'linux') { + this.skip(); + } + this.timeout(3000); + + time.start({ + intervalMicros: PROFILE_OPTIONS.intervalMicros, + durationMillis: PROFILE_OPTIONS.durationMillis, + withContexts: true, + lineNumbers: false, + collectAsyncId: true, + }); + let setDone: () => void; + const done = new Promise(resolve => { + setDone = resolve; + }); + + const testStart = hrtime.bigint(); + const testDurationNanos = PROFILE_OPTIONS.durationMillis * 1_000_000; + setTimeout(loop, 0); + + function loop() { + const loopDurationNanos = PROFILE_OPTIONS.intervalMicros * 1_000; + const loopStart = hrtime.bigint(); + while (hrtime.bigint() - loopStart < loopDurationNanos); + if (hrtime.bigint() - testStart < testDurationNanos) { + setTimeout(loop, 0); + } else { + setDone(); + } + } + + await done; + + let asyncIdObserved = false; + time.stop(false, ({context}: GenerateTimeLabelsArgs) => { + if (!asyncIdObserved && typeof context?.asyncId === 'number') { + asyncIdObserved = context?.asyncId !== -1; + } + return {}; + }); + assert(asyncIdObserved, 'Async ID was not observed'); + }); + describe('profile (w/ stubs)', () => { // eslint-disable-next-line @typescript-eslint/no-explicit-any const sinonStubs: Array> = []; From 893e9110ab029527c4328dc23f39fbe50b548c60 Mon Sep 17 00:00:00 2001 From: Attila Szegedi Date: Wed, 19 Mar 2025 16:29:51 +0100 Subject: [PATCH 2/2] v5.7.0 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3c08897f..9f11279d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@datadog/pprof", - "version": "5.6.0", + "version": "5.7.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@datadog/pprof", - "version": "5.6.0", + "version": "5.7.0", "hasInstallScript": true, "license": "Apache-2.0", "dependencies": { diff --git a/package.json b/package.json index 5ca67a82..66b772f6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@datadog/pprof", - "version": "5.6.0", + "version": "5.7.0", "description": "pprof support for Node.js", "repository": "datadog/pprof-nodejs", "main": "out/src/index.js",