diff --git a/Cargo.lock b/Cargo.lock
index f16b7a3594..32454cdc83 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2867,6 +2867,7 @@ dependencies = [
  "bytes",
  "cc",
  "const_format",
+ "criterion",
  "futures",
  "futures-core",
  "futures-util",
diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv
index 51ccb8e07d..0c1a897df2 100644
--- a/LICENSE-3rdparty.csv
+++ b/LICENSE-3rdparty.csv
@@ -95,6 +95,7 @@ core-foundation-sys,https://github.com/servo/core-foundation-rs,MIT OR Apache-2.
 cpp_demangle,https://github.com/gimli-rs/cpp_demangle,MIT OR Apache-2.0,"Nick Fitzgerald <fitzgen@gmail.com>, Jim Blandy <jimb@red-bean.com>, Kyle Huey <khuey@kylehuey.com>"
 cpufeatures,https://github.com/RustCrypto/utils,MIT OR Apache-2.0,RustCrypto Developers
 crc32fast,https://github.com/srijs/rust-crc32fast,MIT OR Apache-2.0,"Sam Rijs <srijs@airpost.net>, Alex Crichton <alex@alexcrichton.com>"
+criterion,https://github.com/bheisler/criterion.rs,Apache-2.0 OR MIT,"Jorge Aparicio <japaricious@gmail.com>, Brook Heisler <brookheisler@gmail.com>"
 criterion-plot,https://github.com/bheisler/criterion.rs,MIT OR Apache-2.0,"Jorge Aparicio <japaricious@gmail.com>, Brook Heisler <brookheisler@gmail.com>"
 critical-section,https://github.com/rust-embedded/critical-section,MIT OR Apache-2.0,The critical-section Authors
 crossbeam-channel,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crossbeam-channel Authors
diff --git a/datadog-ipc/benches/ipc.rs b/datadog-ipc/benches/ipc.rs
index 929906fd89..bd80577927 100644
--- a/datadog-ipc/benches/ipc.rs
+++ b/datadog-ipc/benches/ipc.rs
@@ -15,7 +15,7 @@ use tokio::runtime;
 fn criterion_benchmark(c: &mut Criterion) {
     let (conn_server, conn_client) = datadog_ipc::SeqpacketConn::socketpair().unwrap();
 
-    let worker = thread::spawn(move || {
+    let _worker = thread::spawn(move || {
         let rt = runtime::Builder::new_current_thread()
             .enable_all()
             .build()
@@ -36,15 +36,6 @@ fn criterion_benchmark(c: &mut Criterion) {
     c.bench_function("two way interface", |b| {
         b.iter(|| channel.call_req_cnt().unwrap())
     });
-
-    #[cfg(not(target_arch = "aarch64"))]
-    println!(
-        "Total requests handled: {}",
-        channel.call_req_cnt().unwrap()
-    );
-
-    drop(channel);
-    worker.join().unwrap();
 }
 
 #[cfg(unix)]
diff --git a/libdd-common/Cargo.toml b/libdd-common/Cargo.toml
index f9d4f1ac19..7419853b0a 100644
--- a/libdd-common/Cargo.toml
+++ b/libdd-common/Cargo.toml
@@ -35,6 +35,7 @@ regex = "1.5"
 # Use rustls-no-provider instead of rustls to avoid reqwest forcing aws-lc-rs as the crypto
 # backend. We install the ring provider explicitly in connector/mod.rs instead.
 reqwest = { version = "0.13.2", features = ["rustls-no-provider", "hickory-dns"], default-features = false, optional = true }
+criterion = { version = "0.5.1", optional = true }
 # Pinned to <0.8.3: version 0.8.3+ pulls in openssl-probe@0.2 which probes multiple
 # certificate directories and parses individual cert files instead of loading a single
 # bundle, adding unnecessary I/O overhead in latency-sensitive environments.
@@ -103,6 +104,8 @@ fips = ["tls-core", "hyper-rustls/fips"]
 reqwest = ["dep:reqwest", "test-utils"]
 # Enable test utilities for use in other crates
 test-utils = ["dep:httparse", "dep:rand", "dep:mime", "dep:multer"]
+# Enable benchmark utilities (ReportingAllocator, Criterion allocation measurement)
+bench-utils = ["dep:criterion"]
 
 [lints.rust]
 # We run coverage checks in our github actions. These checks are run with
diff --git a/libdd-common/src/bench_utils.rs b/libdd-common/src/bench_utils.rs
new file mode 100644
index 0000000000..17337f6cd9
--- /dev/null
+++ b/libdd-common/src/bench_utils.rs
@@ -0,0 +1,422 @@
+// Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/
+// SPDX-License-Identifier: Apache-2.0
+
+//! Scaffolding for memory usage benchmarks.
+//!
+//! See the `ReportingAllocator` type and `memory_allocated_measurement` for usage.
+
+#![allow(missing_docs)]
+
+use std::{
+    alloc::{GlobalAlloc, Layout, System},
+    cell::Cell,
+    sync::atomic::{AtomicUsize, Ordering},
+    time::Duration,
+};
+
+use criterion::{Criterion, Throughput};
+
+/// Associates a static name with a measurement type.
+pub trait MeasurementName {
+    /// Returns the name of the measurement.
+    fn name() -> &'static str;
+}
+
+impl MeasurementName for criterion::measurement::WallTime {
+    fn name() -> &'static str {
+        "wall_time"
+    }
+}
+
+/// Builds a [`Criterion`] instance that measures allocated bytes instead of wall time.
+///
+/// `global_alloc` is the allocator whose byte counter is sampled around each measured
+/// routine. Only allocations that go through `global_alloc` move that counter, so it is
+/// expected to be the benchmark binary's `#[global_allocator]`.
+///
+/// Warm-up and measurement time are both one millisecond, the sample size is 10 and plots
+/// are disabled.
+pub fn memory_allocated_measurement(
+    global_alloc: &'static ReportingAllocator<System>,
+) -> Criterion<AllocatedBytesMeasurement<System>> {
+    Criterion::default()
+        .with_measurement(AllocatedBytesMeasurement(Cell::new(false), global_alloc))
+        .measurement_time(Duration::from_millis(1))
+        .warm_up_time(Duration::from_millis(1))
+        .without_plots()
+        .plotting_backend(criterion::PlottingBackend::None)
+        .sample_size(10)
+}
+
+/// Point-in-time copy of a [`ReportingAllocator`]'s counters.
+#[derive(Debug)]
+struct AllocStats {
+    /// Sum of the sizes of all allocation requests recorded so far.
+    allocated_bytes: usize,
+    /// Number of allocation requests recorded so far.
+    #[allow(dead_code)]
+    allocations: usize,
+}
+
+/// Allocator wrapper that counts the allocation requests it forwards to the inner allocator.
+///
+/// Both counters are cumulative: deallocating never decreases them.
+pub struct ReportingAllocator<T: GlobalAlloc> {
+    alloc: T,
+    allocated_bytes: AtomicUsize,
+    allocations: AtomicUsize,
+}
+
+impl<T: GlobalAlloc> ReportingAllocator<T> {
+    /// Wraps `alloc` with both counters at zero; `const` so it can initialize a `static`.
+    pub const fn new(alloc: T) -> Self {
+        Self {
+            alloc,
+            allocated_bytes: AtomicUsize::new(0),
+            allocations: AtomicUsize::new(0),
+        }
+    }
+
+    /// Adds one allocation request of `size` bytes to the counters.
+    fn record(&self, size: usize) {
+        self.allocated_bytes.fetch_add(size, Ordering::Relaxed);
+        self.allocations.fetch_add(1, Ordering::Relaxed);
+    }
+
+    /// Returns the current value of both counters.
+    fn stats(&self) -> AllocStats {
+        AllocStats {
+            allocated_bytes: self.allocated_bytes.load(Ordering::Relaxed),
+            allocations: self.allocations.load(Ordering::Relaxed),
+        }
+    }
+}
+
+// SAFETY: every request is forwarded unchanged to the wrapped allocator, which is itself a
+// `GlobalAlloc`; the wrapper only updates atomic counters and never allocates on its own.
+unsafe impl<T: GlobalAlloc> GlobalAlloc for ReportingAllocator<T> {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        self.record(layout.size());
+        self.alloc.alloc(layout)
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        self.alloc.dealloc(ptr, layout);
+    }
+
+    // `alloc_zeroed` and `realloc` record the same numbers as the trait's default
+    // implementations (which are built on `alloc`), but forward to the wrapped allocator
+    // so its own zeroing and in-place resizing are used rather than an emulation made of
+    // `alloc`, a copy and `dealloc`. The emulation would distort timings of benchmarks
+    // that run with this type installed as the global allocator.
+    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
+        self.record(layout.size());
+        self.alloc.alloc_zeroed(layout)
+    }
+
+    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        self.record(new_size);
+        self.alloc.realloc(ptr, layout, new_size)
+    }
+}
+
+/// Criterion measurement whose value is the number of bytes requested from a
+/// [`ReportingAllocator`] while the measured routine runs.
+///
+/// The `Cell<bool>` is the sign toggle used by `to_f64`.
+pub struct AllocatedBytesMeasurement<T: GlobalAlloc + 'static>(
+    Cell<bool>,
+    &'static ReportingAllocator<T>,
+);
+
+impl<T: GlobalAlloc> MeasurementName for AllocatedBytesMeasurement<T> {
+    fn name() -> &'static str {
+        "allocated_bytes"
+    }
+}
+
+impl<T: GlobalAlloc> criterion::measurement::Measurement for AllocatedBytesMeasurement<T> {
+    type Intermediate = usize;
+
+    type Value = usize;
+
+    /// Snapshots the cumulative byte counter.
+    fn start(&self) -> Self::Intermediate {
+        self.1.stats().allocated_bytes
+    }
+
+    /// Returns the bytes requested since the snapshot `i` was taken.
+    fn end(&self, i: Self::Intermediate) -> Self::Value {
+        // The counter is bumped with `fetch_add`, which wraps on overflow; subtract modulo
+        // `usize` as well so a wrapped counter yields the right delta instead of a panic.
+        self.1.stats().allocated_bytes.wrapping_sub(i)
+    }
+
+    fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value {
+        *v1 + *v2
+    }
+
+    fn zero(&self) -> Self::Value {
+        0
+    }
+
+    /// Converts `value` to `f64`, alternately subtracting and adding 0.01 on each call.
+    fn to_f64(&self, value: &Self::Value) -> f64 {
+        let b = self.0.get();
+        self.0.set(!b);
+        // Criterion does not handle all-identical measurement values well, and since
+        // allocation is deterministic that tends to happen a lot. Add a small +/- epsilon
+        // so each pair of measurements differs slightly without skewing the distribution.
+        *value as f64 + if b { 0.01 } else { -0.01 }
+    }
+
+    fn formatter(&self) -> &dyn criterion::measurement::ValueFormatter {
+        &AllocationFormatter
+    }
+}
+
+/// Formats allocated-byte values using decimal (powers of 1000) unit prefixes.
+struct AllocationFormatter;
+
+impl criterion::measurement::ValueFormatter for AllocationFormatter {
+    /// Divides `values` by the power of 1000 picked from `typical_value` and returns the
+    /// matching unit label. `values` is left unscaled when the rounded base-10 logarithm of
+    /// `typical_value` is negative, infinite or NaN (for instance when it is zero).
+    fn scale_values(&self, typical_value: f64, values: &mut [f64]) -> &'static str {
+        let log_scale: f64 = typical_value.log10().round();
+        if log_scale.is_infinite() || log_scale.is_nan() || log_scale < 0.0 {
+            return "b";
+        }
+        let scale = (log_scale as i32 / 3).min(4);
+        values.iter_mut().for_each(|v| *v /= 10_f64.powi(scale * 3));
+        match scale {
+            0 => "b",
+            1 => "Kb",
+            2 => "Mb",
+            3 => "Gb",
+            _ => "Tb",
+        }
+    }
+
+    /// Returns the unit label for `throughput`; `_values` is not modified.
+    fn scale_throughputs(
+        &self,
+        _typical_value: f64,
+        throughput: &criterion::Throughput,
+        _values: &mut [f64],
+    ) -> &'static str {
+        match throughput {
+            Throughput::Bytes(_) => "B/s",
+            Throughput::BytesDecimal(_) => "B/s",
+            Throughput::Elements(_) => "elements/s",
+        }
+    }
+
+    /// Keeps `_values` in bytes and returns the byte unit label.
+    fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str {
+        "b"
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use criterion::measurement::{Measurement, ValueFormatter};
+    use std::alloc::{GlobalAlloc, Layout, System};
+
+    static SHARED: ReportingAllocator<System> = ReportingAllocator::new(System);
+
+    // --- ReportingAllocator ---
+
+    #[test]
+    fn new_starts_at_zero() {
+        let a = ReportingAllocator::new(System);
+        let s = a.stats();
+        assert_eq!(s.allocated_bytes, 0);
+        assert_eq!(s.allocations, 0);
+    }
+
+    #[test]
+    fn alloc_increments_both_counters() {
+        let a = ReportingAllocator::new(System);
+        let layout = Layout::from_size_align(64, 8).unwrap();
+        let ptr = unsafe { a.alloc(layout) };
+        assert!(!ptr.is_null());
+        assert_eq!(a.stats().allocated_bytes, 64);
+        assert_eq!(a.stats().allocations, 1);
+        unsafe { a.dealloc(ptr, layout) };
+    }
+
+    #[test]
+    fn dealloc_does_not_change_counters() {
+        let a = ReportingAllocator::new(System);
+        let layout = Layout::from_size_align(32, 8).unwrap();
+        let ptr = unsafe { a.alloc(layout) };
+        let bytes_after_alloc = a.stats().allocated_bytes;
+        unsafe { a.dealloc(ptr, layout) };
+        assert_eq!(a.stats().allocated_bytes, bytes_after_alloc);
+        assert_eq!(a.stats().allocations, 1);
+    }
+
+    #[test]
+    fn multiple_allocs_accumulate() {
+        let a = ReportingAllocator::new(System);
+        let l1 = Layout::from_size_align(16, 8).unwrap();
+        let l2 = Layout::from_size_align(32, 8).unwrap();
+        let p1 = unsafe { a.alloc(l1) };
+        let p2 = unsafe { a.alloc(l2) };
+        assert_eq!(a.stats().allocated_bytes, 48);
+        assert_eq!(a.stats().allocations, 2);
+        unsafe {
+            a.dealloc(p1, l1);
+            a.dealloc(p2, l2);
+        }
+    }
+
+    #[test]
+    fn alloc_zeroed_is_counted_and_returns_zeroed_memory() {
+        let a = ReportingAllocator::new(System);
+        let layout = Layout::from_size_align(64, 8).unwrap();
+        let ptr = unsafe { a.alloc_zeroed(layout) };
+        assert!(!ptr.is_null());
+        let bytes = unsafe { std::slice::from_raw_parts(ptr, layout.size()) };
+        assert!(bytes.iter().all(|&b| b == 0));
+        assert_eq!(a.stats().allocated_bytes, 64);
+        assert_eq!(a.stats().allocations, 1);
+        unsafe { a.dealloc(ptr, layout) };
+    }
+
+    #[test]
+    fn realloc_counts_new_size_and_keeps_contents() {
+        let a = ReportingAllocator::new(System);
+        let layout = Layout::from_size_align(16, 8).unwrap();
+        let ptr = unsafe { a.alloc(layout) };
+        assert!(!ptr.is_null());
+        unsafe { ptr.write_bytes(0xAB, layout.size()) };
+        let grown = unsafe { a.realloc(ptr, layout, 64) };
+        assert!(!grown.is_null());
+        let bytes = unsafe { std::slice::from_raw_parts(grown, layout.size()) };
+        assert!(bytes.iter().all(|&b| b == 0xAB));
+        // Same accounting as the default `realloc`: the full new size, as one more allocation.
+        assert_eq!(a.stats().allocated_bytes, 16 + 64);
+        assert_eq!(a.stats().allocations, 2);
+        unsafe { a.dealloc(grown, Layout::from_size_align(64, 8).unwrap()) };
+    }
+
+    // --- AllocatedBytesMeasurement ---
+
+    #[test]
+    fn measurement_zero_and_add() {
+        let m = AllocatedBytesMeasurement(Cell::new(false), &SHARED);
+        assert_eq!(m.zero(), 0);
+        assert_eq!(m.add(&100, &200), 300);
+    }
+
+    #[test]
+    fn measurement_start_end_tracks_delta() {
+        let m = AllocatedBytesMeasurement(Cell::new(false), &SHARED);
+        let start = m.start();
+        let layout = Layout::from_size_align(256, 8).unwrap();
+        let ptr = unsafe { SHARED.alloc(layout) };
+        // Other tests may also allocate via SHARED concurrently, so allow >= 256.
+        assert!(m.end(start) >= 256);
+        unsafe { SHARED.dealloc(ptr, layout) };
+    }
+
+    #[test]
+    fn measurement_end_handles_wrapped_counter() {
+        let m = AllocatedBytesMeasurement(Cell::new(false), &SHARED);
+        // A snapshot of `usize::MAX` stands for a counter that wrapped right after `start`.
+        assert!(m.end(usize::MAX) >= 1);
+    }
+
+    #[test]
+    fn measurement_to_f64_alternates_epsilon() {
+        let m = AllocatedBytesMeasurement(Cell::new(false), &SHARED);
+        // Initial state: Cell = false → first result is value - 0.01
+        assert!((m.to_f64(&1000) - 999.99).abs() < 1e-9);
+        // After first call: Cell = true → result is value + 0.01
+        assert!((m.to_f64(&1000) - 1000.01).abs() < 1e-9);
+        // Alternates back
+        assert!((m.to_f64(&1000) - 999.99).abs() < 1e-9);
+    }
+
+    #[test]
+    fn measurement_name() {
+        assert_eq!(
+            AllocatedBytesMeasurement::<System>::name(),
+            "allocated_bytes"
+        );
+    }
+
+    // --- AllocationFormatter::scale_values ---
+
+    #[test]
+    fn scale_values_zero_returns_bytes() {
+        let f = AllocationFormatter;
+        let mut v = [42.0_f64];
+        assert_eq!(f.scale_values(0.0, &mut v), "b");
+    }
+
+    #[test]
+    fn scale_values_sub_byte_returns_bytes() {
+        let f = AllocationFormatter;
+        let mut v = [0.5_f64];
+        // log10(0.1) = -1 → negative → "b"
+        assert_eq!(f.scale_values(0.1, &mut v), "b");
+    }
+
+    #[test]
+    fn scale_values_bytes() {
+        let f = AllocationFormatter;
+        let mut v = [1.0_f64];
+        assert_eq!(f.scale_values(1.0, &mut v), "b");
+        assert!((v[0] - 1.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn scale_values_kilobytes() {
+        let f = AllocationFormatter;
+        let mut v = [2000.0_f64];
+        assert_eq!(f.scale_values(1000.0, &mut v), "Kb");
+        assert!((v[0] - 2.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn scale_values_megabytes() {
+        let f = AllocationFormatter;
+        let mut v = [3_000_000.0_f64];
+        assert_eq!(f.scale_values(1_000_000.0, &mut v), "Mb");
+        assert!((v[0] - 3.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn scale_values_gigabytes() {
+        let f = AllocationFormatter;
+        let mut v = [4_000_000_000.0_f64];
+        assert_eq!(f.scale_values(1_000_000_000.0, &mut v), "Gb");
+        assert!((v[0] - 4.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn scale_values_terabytes() {
+        let f = AllocationFormatter;
+        let mut v = [5_000_000_000_000.0_f64];
+        assert_eq!(f.scale_values(1_000_000_000_000.0, &mut v), "Tb");
+        assert!((v[0] - 5.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn scale_values_very_large_clamps_to_terabytes() {
+        let f = AllocationFormatter;
+        let mut v = [1e18_f64];
+        assert_eq!(f.scale_values(1e18, &mut v), "Tb");
+    }
+
+    #[test]
+    fn scale_for_machines_returns_bytes_unit() {
+        let f = AllocationFormatter;
+        let mut v = [1000.0_f64];
+        assert_eq!(f.scale_for_machines(&mut v), "b");
+    }
+}
diff --git a/libdd-common/src/lib.rs b/libdd-common/src/lib.rs
index 1922446c6c..2578459260 100644
--- a/libdd-common/src/lib.rs
+++ b/libdd-common/src/lib.rs
@@ -23,6 +23,8 @@ pub mod dump_server;
 pub mod entity_id;
 #[macro_use]
 pub mod cstr;
+#[cfg(feature = "bench-utils")]
+pub mod bench_utils;
 pub mod config;
 pub mod error;
 pub mod http_common;
diff --git a/libdd-trace-utils/Cargo.toml b/libdd-trace-utils/Cargo.toml
index 9e6b3655fe..bfacc33037 100644
--- a/libdd-trace-utils/Cargo.toml
+++ b/libdd-trace-utils/Cargo.toml
@@ -66,6 +66,7 @@ getrandom = { version = "0.2", features = ["js"] }
 
 [dev-dependencies]
 libdd-capabilities-impl = { version = "0.1.0", path = "../libdd-capabilities-impl" }
+libdd-common = { path = "../libdd-common", default-features = false, features = ["bench-utils"] }
 bolero = "0.13"
 criterion = "0.5.1"
 httpmock = { version = "0.8.0-alpha.1" }
diff --git a/libdd-trace-utils/benches/deserialization.rs b/libdd-trace-utils/benches/deserialization.rs
index c073446f37..767082e5be 100644
--- a/libdd-trace-utils/benches/deserialization.rs
+++ b/libdd-trace-utils/benches/deserialization.rs
@@ -1,7 +1,10 @@
 // Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
 // SPDX-License-Identifier: Apache-2.0
 
+use std::alloc::System;
+
 use criterion::{black_box, criterion_group, Criterion};
+use libdd_common::bench_utils::{memory_allocated_measurement, AllocatedBytesMeasurement};
 use libdd_trace_utils::tracer_payload::{decode_to_trace_chunks, TraceEncoding};
 use serde_json::{json, Value};
 
@@ -81,4 +84,31 @@ pub fn deserialize_msgpack_to_internal(c: &mut Criterion) {
     );
 }
 
+/// Measures bytes allocated while decoding a msgpack (v0.4) payload into trace chunks.
+fn deserialize_msgpack_to_internal_allocs(c: &mut Criterion<AllocatedBytesMeasurement<System>>) {
+    let encoded = rmp_serde::to_vec(&generate_trace_chunks(20, 2_075))
+        .expect("Failed to serialize test spans.");
+    let payload = libdd_tinybytes::Bytes::copy_from_slice(&encoded);
+
+    c.bench_function(
+        "benching deserializing traces from msgpack to their internal representation (allocs)",
+        |bencher| {
+            bencher.iter_batched(
+                || payload.clone(),
+                |input| {
+                    let decoded = black_box(decode_to_trace_chunks(input, TraceEncoding::V04));
+                    assert!(decoded.is_ok());
+                    decoded
+                },
+                criterion::BatchSize::LargeInput,
+            );
+        },
+    );
+}
+
 criterion_group!(deserialize_benches, deserialize_msgpack_to_internal);
+criterion_group!(
+    name = deserialize_alloc_benches;
+    config = memory_allocated_measurement(&super::GLOBAL); // static from benches/main.rs
+    targets = deserialize_msgpack_to_internal_allocs
+);
diff --git a/libdd-trace-utils/benches/main.rs b/libdd-trace-utils/benches/main.rs
index de08a58ae2..0d86f25ee5 100644
--- a/libdd-trace-utils/benches/main.rs
+++ b/libdd-trace-utils/benches/main.rs
@@ -1,12 +1,19 @@
 // Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/
 // SPDX-License-Identifier: Apache-2.0
 
+use std::alloc::System;
+
 use criterion::criterion_main;
+use libdd_common::bench_utils::ReportingAllocator;
+
+#[global_allocator] // routes every heap allocation of this bench binary through GLOBAL
+pub static GLOBAL: ReportingAllocator<System> = ReportingAllocator::new(System);
 
 mod deserialization;
 mod serialization;
 
 criterion_main!(
     serialization::serialize_benches,
-    deserialization::deserialize_benches
+    deserialization::deserialize_benches,
+    deserialization::deserialize_alloc_benches
 );