diff --git a/ddprof-lib/src/main/cpp/arch_dd.h b/ddprof-lib/src/main/cpp/arch_dd.h
index 8378fc6f2..8bddcdd2c 100644
--- a/ddprof-lib/src/main/cpp/arch_dd.h
+++ b/ddprof-lib/src/main/cpp/arch_dd.h
@@ -14,19 +14,32 @@ static inline long long atomicInc(volatile long long &var,
   return __sync_fetch_and_add(&var, increment);
 }
 
-static inline u64 loadAcquire(volatile u64 &var) {
-  return __atomic_load_n(&var, __ATOMIC_ACQUIRE);
+template <typename T>
+static inline long long atomicIncRelaxed(volatile T &var,
+                                         T increment = 1) {
+  return __atomic_fetch_add(&var, increment, __ATOMIC_RELAXED);
 }
 
-static inline size_t loadAcquire(volatile size_t &var) {
-  return __atomic_load_n(&var, __ATOMIC_ACQUIRE);
+// Atomic load/store (unordered)
+template <typename T>
+static inline T load(volatile T& var) {
+  return __atomic_load_n(&var, __ATOMIC_RELAXED);
 }
 
-static inline void storeRelease(volatile long long &var, long long value) {
-  return __atomic_store_n(&var, value, __ATOMIC_RELEASE);
+template <typename T>
+static inline void store(volatile T& var, T value) {
+  return __atomic_store_n(&var, value, __ATOMIC_RELAXED);
+}
+
+
+// Atomic load-acquire/release-store
+template <typename T>
+static inline T loadAcquire(volatile T& var) {
+  return __atomic_load_n(&var, __ATOMIC_ACQUIRE);
 }
 
-static inline void storeRelease(volatile size_t &var, size_t value) {
+template <typename T>
+static inline void storeRelease(volatile T& var, T value) {
   return __atomic_store_n(&var, value, __ATOMIC_RELEASE);
 }
diff --git a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
index 1c32150bc..c10f4fd98 100644
--- a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
+++ b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
@@ -301,7 +301,7 @@ u64 CallTraceHashTable::put(int num_frames, ASGCT_CallFrame *frames,
     if (++step >= capacity) {
       // Very unlikely case of a table overflow
-      atomicInc(_overflow);
+      atomicIncRelaxed(_overflow);
       return OVERFLOW_TRACE_ID;
     }
     // Improved version of linear probing
@@ -359,7 +359,7 @@ void CallTraceHashTable::collectAndCopySelective(std::unordered_set
     traces.insert(&_overflow_trace);
     if (trace_ids_to_preserve.find(OVERFLOW_TRACE_ID) != trace_ids_to_preserve.end()) {
       // Copy overflow trace to target - it's a static trace so just increment overflow counter
-      atomicInc(target->_overflow);
+      atomicIncRelaxed(target->_overflow);
     }
   }
 }
diff --git a/ddprof-lib/src/main/cpp/callTraceHashTable.h b/ddprof-lib/src/main/cpp/callTraceHashTable.h
index 5dceb65b4..2541d12fb 100644
--- a/ddprof-lib/src/main/cpp/callTraceHashTable.h
+++ b/ddprof-lib/src/main/cpp/callTraceHashTable.h
@@ -52,7 +52,7 @@ class CallTraceHashTable {
   LinearAllocator _allocator;
   LongHashTable *_current_table;
-  u64 _overflow;
+  volatile u64 _overflow;
 
   u64 calcHash(int num_frames, ASGCT_CallFrame *frames, bool truncated);
   CallTrace *storeCallTrace(int num_frames, ASGCT_CallFrame *frames,
diff --git a/ddprof-lib/src/main/cpp/counters.h b/ddprof-lib/src/main/cpp/counters.h
index 1df8fe3f5..b276f029d 100644
--- a/ddprof-lib/src/main/cpp/counters.h
+++ b/ddprof-lib/src/main/cpp/counters.h
@@ -124,9 +124,9 @@ class Counters {
   static void increment(CounterId counter, long long delta = 1,
                         int offset = 0) {
 #ifdef COUNTERS
-    atomicInc(Counters::instance()
-                  ._counters[address(static_cast<int>(counter) + offset)],
-              delta);
+    atomicIncRelaxed(Counters::instance()
+                         ._counters[address(static_cast<int>(counter) + offset)],
+                     delta);
 #endif // COUNTERS
   }
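Note (illustration only, not part of the patch): the sketch below shows how the new arch_dd.h helpers are intended to be paired -- relaxed increments and loads for pure statistics, acquire/release for values that publish other writes. It assumes a GCC/Clang toolchain (the __atomic_* builtins) and re-declares the templates locally so it compiles on its own; the names mirror arch_dd.h, but nothing here is part of the diff.

// usage_sketch.cpp -- standalone illustration, not part of the patch.
#include <cstdint>
#include <cstdio>

typedef uint64_t u64;

// Local re-declarations mirroring the new arch_dd.h helpers (assumes
// GCC/Clang __atomic builtins).
template <typename T>
static inline long long atomicIncRelaxed(volatile T &var, T increment = 1) {
  // Statistics counter: atomicity only, no ordering guarantees.
  return __atomic_fetch_add(&var, increment, __ATOMIC_RELAXED);
}

template <typename T>
static inline T loadAcquire(volatile T &var) {
  // Pairs with storeRelease() when a value publishes earlier writes.
  return __atomic_load_n(&var, __ATOMIC_ACQUIRE);
}

template <typename T>
static inline void storeRelease(volatile T &var, T value) {
  return __atomic_store_n(&var, value, __ATOMIC_RELEASE);
}

static volatile u64 total_samples = 0;   // plain statistic  -> relaxed
static volatile u64 published_epoch = 0; // publication flag -> acquire/release

int main() {
  atomicIncRelaxed(total_samples, u64(1)); // counter bump, relaxed
  storeRelease(published_epoch, u64(42));  // publish a new value
  printf("samples=%llu epoch=%llu\n",
         (unsigned long long)loadAcquire(total_samples),
         (unsigned long long)loadAcquire(published_epoch));
  return 0;
}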
diff --git a/ddprof-lib/src/main/cpp/livenessTracker.cpp b/ddprof-lib/src/main/cpp/livenessTracker.cpp
index e085f3cdd..1923c991f 100644
--- a/ddprof-lib/src/main/cpp/livenessTracker.cpp
+++ b/ddprof-lib/src/main/cpp/livenessTracker.cpp
@@ -26,12 +26,12 @@ constexpr int LivenessTracker::MAX_TRACKING_TABLE_SIZE;
 constexpr int LivenessTracker::MIN_SAMPLING_INTERVAL;
 
 void LivenessTracker::cleanup_table(bool forced) {
-  u64 current = loadAcquire(_last_gc_epoch);
-  u64 target_gc_epoch = loadAcquire(_gc_epoch);
+  u64 current = load(_last_gc_epoch);
+  u64 target_gc_epoch = load(_gc_epoch);
   if ((target_gc_epoch == _last_gc_epoch ||
-       !__sync_bool_compare_and_swap(&_last_gc_epoch, current,
-                                     target_gc_epoch)) &&
+       !__atomic_compare_exchange_n(&_last_gc_epoch, &current,
+                                    target_gc_epoch, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) &&
       !forced) {
     // if the last processed GC epoch hasn't changed, or if we failed to update
     // it, there's nothing to do
@@ -383,10 +383,10 @@ void LivenessTracker::onGC() {
   }
 
   // just increment the epoch
-  atomicInc(_gc_epoch, 1);
+  atomicIncRelaxed(_gc_epoch, u64(1));
 
   if (!ddprof::HeapUsage::isLastGCUsageSupported()) {
-    storeRelease(_used_after_last_gc, ddprof::HeapUsage::get(false)._used);
+    store(_used_after_last_gc, ddprof::HeapUsage::get(false)._used);
   }
 }
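Note (illustration only, not part of the patch): a minimal sketch of the CAS change in LivenessTracker::cleanup_table(), assuming GCC/Clang __atomic builtins. Unlike __sync_bool_compare_and_swap, __atomic_compare_exchange_n takes the expected value by pointer and writes back the observed value on failure; the patch requests __ATOMIC_RELAXED for both the success and failure orderings.

// cas_sketch.cpp -- standalone illustration, not part of the patch.
#include <cstdint>
#include <cstdio>

typedef uint64_t u64;

static volatile u64 last_gc_epoch = 5; // stand-in for _last_gc_epoch

int main() {
  u64 current = 5; // value observed earlier, as in cleanup_table()
  u64 target = 7;  // value we want to install

  // Old style: returns true on success and discards the observed value.
  // bool ok = __sync_bool_compare_and_swap(&last_gc_epoch, current, target);

  // New style: 'current' is an in/out argument -- on failure it is updated
  // to the value actually found. Both memory orders are relaxed, matching
  // the patch.
  bool ok = __atomic_compare_exchange_n(&last_gc_epoch, &current, target,
                                        /*weak=*/false, __ATOMIC_RELAXED,
                                        __ATOMIC_RELAXED);

  printf("ok=%d observed=%llu now=%llu\n", ok,
         (unsigned long long)current, (unsigned long long)last_gc_epoch);
  return 0;
}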
diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp
index e51b3dc70..727746367 100644
--- a/ddprof-lib/src/main/cpp/profiler.cpp
+++ b/ddprof-lib/src/main/cpp/profiler.cpp
@@ -559,7 +559,7 @@ int Profiler::getJavaTraceAsync(void *ucontext, ASGCT_CallFrame *frames,
     return 0;
   }
 
-  atomicInc(_failures[-trace.num_frames]);
+  atomicIncRelaxed(_failures[-trace.num_frames]);
   trace.frames->bci = BCI_ERROR;
   trace.frames->method_id = (jmethodID)err_string;
   return trace.frames - frames + 1;
@@ -607,14 +607,14 @@ void Profiler::fillFrameTypes(ASGCT_CallFrame *frames, int num_frames,
 }
 
 u64 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event_type, Event *event, bool deferred) {
-  atomicInc(_total_samples);
+  atomicIncRelaxed(_total_samples);
 
   u32 lock_index = getLockIndex(tid);
   if (!_locks[lock_index].tryLock() &&
       !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() &&
       !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) {
     // Too many concurrent signals already
-    atomicInc(_failures[-ticks_skipped]);
+    atomicIncRelaxed(_failures[-ticks_skipped]);
     return 0;
   }
 
@@ -654,14 +654,14 @@ u64 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event
 }
 
 void Profiler::recordDeferredSample(int tid, u64 call_trace_id, jint event_type, Event *event) {
-  atomicInc(_total_samples);
+  atomicIncRelaxed(_total_samples);
 
   u32 lock_index = getLockIndex(tid);
   if (!_locks[lock_index].tryLock() &&
       !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() &&
       !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) {
     // Too many concurrent signals already
-    atomicInc(_failures[-ticks_skipped]);
+    atomicIncRelaxed(_failures[-ticks_skipped]);
     return;
   }
 
@@ -672,14 +672,14 @@ void Profiler::recordDeferredSample(int tid, u64 call_trace_id, jint event_type,
 
 void Profiler::recordSample(void *ucontext, u64 counter, int tid, jint event_type, u64 call_trace_id, Event *event) {
-  atomicInc(_total_samples);
+  atomicIncRelaxed(_total_samples);
 
   u32 lock_index = getLockIndex(tid);
   if (!_locks[lock_index].tryLock() &&
       !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() &&
       !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) {
     // Too many concurrent signals already
-    atomicInc(_failures[-ticks_skipped]);
+    atomicIncRelaxed(_failures[-ticks_skipped]);
 
     if (event_type == BCI_CPU && _cpu_engine == &perf_events) {
       // Need to reset PerfEvents ring buffer, even though we discard the
@@ -788,7 +788,7 @@ void Profiler::recordQueueTime(int tid, QueueTimeEvent *event) {
 
 void Profiler::recordExternalSample(u64 weight, int tid, int num_frames,
                                     ASGCT_CallFrame *frames, bool truncated,
                                     jint event_type, Event *event) {
-  atomicInc(_total_samples);
+  atomicIncRelaxed(_total_samples);
   u64 call_trace_id =
       _call_trace_storage.put(num_frames, frames, truncated, weight);
@@ -798,7 +798,7 @@ void Profiler::recordExternalSample(u64 weight, int tid, int num_frames,
       !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() &&
       !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) {
     // Too many concurrent signals already
-    atomicInc(_failures[-ticks_skipped]);
+    atomicIncRelaxed(_failures[-ticks_skipped]);
     return;
   }
diff --git a/ddprof-lib/src/main/cpp/profiler.h b/ddprof-lib/src/main/cpp/profiler.h
index 1e8e9902f..09229d845 100644
--- a/ddprof-lib/src/main/cpp/profiler.h
+++ b/ddprof-lib/src/main/cpp/profiler.h
@@ -93,7 +93,7 @@ class Profiler {
   WaitableMutex _timer_lock;
   void *_timer_id;
 
-  u64 _total_samples;
+  volatile u64 _total_samples;
   u64 _failures[ASGCT_FAILURE_TYPES];
   SpinLock _class_map_lock;
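Note (illustration only, not part of the patch): the profiler.h and callTraceHashTable.h hunks mark hot counters such as _total_samples and _overflow as volatile u64 and bump them with atomicIncRelaxed(). The hypothetical class below models that pattern -- relaxed read-modify-write gives atomicity for concurrently updated statistics without imposing any ordering -- again assuming GCC/Clang __atomic builtins.

// counters_sketch.cpp -- standalone illustration, not part of the patch.
#include <cstdint>
#include <cstdio>

typedef uint64_t u64;

template <typename T>
static inline long long atomicIncRelaxed(volatile T &var, T increment = 1) {
  return __atomic_fetch_add(&var, increment, __ATOMIC_RELAXED);
}

// Hypothetical stand-in for the counters the patch touches in Profiler and
// CallTraceHashTable; not the library's actual class.
class SampleStats {
 public:
  void onSample() { atomicIncRelaxed(_total_samples, u64(1)); }
  void onOverflow() { atomicIncRelaxed(_overflow, u64(1)); }
  u64 totalSamples() const {
    // Reads are relaxed too: these values are statistics, not synchronization.
    return __atomic_load_n(&_total_samples, __ATOMIC_RELAXED);
  }

 private:
  volatile u64 _total_samples = 0;
  volatile u64 _overflow = 0;
};

int main() {
  SampleStats stats;
  stats.onSample();   // e.g. from a signal handler
  stats.onOverflow(); // e.g. from another thread
  printf("samples=%llu\n", (unsigned long long)stats.totalSamples());
  return 0;
}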