diff --git a/Cargo.lock b/Cargo.lock index a0d168a407..9c6b210fe7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -394,6 +394,21 @@ dependencies = [ "syn 2.0.110", ] +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "2.10.0" @@ -2753,6 +2768,7 @@ dependencies = [ name = "libdd-profiling" version = "1.0.0" dependencies = [ + "allocator-api2", "anyhow", "bitmaps", "bolero", @@ -2761,6 +2777,7 @@ dependencies = [ "chrono", "criterion", "futures", + "hashbrown 0.16.0", "http", "http-body-util", "hyper", @@ -2771,11 +2788,13 @@ dependencies = [ "libdd-profiling-protobuf", "lz4_flex", "mime", + "proptest", "prost", "rustc-hash 1.1.0", "serde", "serde_json", "target-triple 0.1.4", + "thiserror 2.0.17", "tokio", "tokio-util", "zstd", @@ -3850,6 +3869,8 @@ name = "proptest" version = "1.5.0" source = "git+https://github.com/bantonsson/proptest.git?branch=ban/avoid-libm-in-std#9f623fbab7a1a4da487551128c2bffeee2ed6b87" dependencies = [ + "bit-set", + "bit-vec", "bitflags", "lazy_static", "num-traits", @@ -3857,6 +3878,8 @@ dependencies = [ "rand_chacha 0.3.1", "rand_xorshift", "regex-syntax", + "rusty-fork", + "tempfile", "unarray", ] @@ -4038,6 +4061,12 @@ dependencies = [ "syn 2.0.110", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.42" @@ -4348,6 +4377,18 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ruzstd" version = "0.3.1" @@ -5823,6 +5864,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/libdd-profiling/Cargo.toml b/libdd-profiling/Cargo.toml index fb5a9181f6..c9d1938ad4 100644 --- a/libdd-profiling/Cargo.toml +++ b/libdd-profiling/Cargo.toml @@ -21,6 +21,7 @@ name = "main" harness = false [dependencies] +allocator-api2 = { version = "0.2", default-features = false, features = ["alloc"] } anyhow = "1.0" bitmaps = "3.2.0" byteorder = { version = "1.5", features = ["std"] } @@ -30,6 +31,7 @@ libdd-alloc = { version = "1.0.0", path = "../libdd-alloc" } libdd-profiling-protobuf = { version = "1.0.0", path = "../libdd-profiling-protobuf", features = ["prost_impls"] } libdd-common = { version = "1.0.0", path = "../libdd-common" } futures = { version = "0.3", default-features = false } +hashbrown = { version = "0.16", default-features = false } http = "1.0" hyper = { workspace = true} http-body-util = "0.1" @@ -42,6 +44,7 @@ rustc-hash = { version = "1.1", default-features = false } serde = {version = "1.0", features = ["derive"]} serde_json = {version = "1.0"} target-triple = "0.1.4" +thiserror = "2" tokio = {version = "1.23", features = ["rt", "macros"]} tokio-util = "0.7.1" zstd = { version = "0.13", 
default-features = false } @@ -50,3 +53,4 @@ zstd = { version = "0.13", default-features = false } bolero = "0.13" criterion = "0.5.1" lz4_flex = { version = "0.9", default-features = false, features = ["std", "frame"] } +proptest = "1" diff --git a/libdd-profiling/src/lib.rs b/libdd-profiling/src/lib.rs index 7ba095f4af..486104e792 100644 --- a/libdd-profiling/src/lib.rs +++ b/libdd-profiling/src/lib.rs @@ -12,4 +12,4 @@ pub mod exporter; pub mod internal; pub mod iter; pub mod pprof; -mod profiles; +pub mod profiles; diff --git a/libdd-profiling/src/pprof/test_utils.rs b/libdd-profiling/src/pprof/test_utils.rs index 10e27c458f..7b257ea02f 100644 --- a/libdd-profiling/src/pprof/test_utils.rs +++ b/libdd-profiling/src/pprof/test_utils.rs @@ -1,13 +1,10 @@ // Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 -use anyhow::Context; use libdd_profiling_protobuf::prost_impls::{Profile, Sample}; -use std::io::Cursor; fn deserialize_compressed_pprof(encoded: &[u8]) -> anyhow::Result { use prost::Message; - use std::io::Read; // The zstd bindings use FFI so they don't work under miri. This means the // buffer isn't compressed, so simply convert to a vec. @@ -15,6 +12,8 @@ fn deserialize_compressed_pprof(encoded: &[u8]) -> anyhow::Result { let buf = encoded.to_vec(); #[cfg(not(miri))] let buf = { + use anyhow::Context; + use std::io::{Cursor, Read}; let mut decoder = zstd::Decoder::new(Cursor::new(encoded)).context("failed to create zstd decoder")?; let mut out = Vec::new(); diff --git a/libdd-profiling/src/profiles/collections/error.rs b/libdd-profiling/src/profiles/collections/error.rs new file mode 100644 index 0000000000..c35b772a26 --- /dev/null +++ b/libdd-profiling/src/profiles/collections/error.rs @@ -0,0 +1,29 @@ +// Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +#[repr(C)] +#[derive(Debug, thiserror::Error)] +pub enum SetError { + #[error("set error: invalid argument")] + InvalidArgument, + #[error("set error: out of memory")] + OutOfMemory, +} + +impl From for SetError { + fn from(_: libdd_alloc::AllocError) -> Self { + SetError::OutOfMemory + } +} + +impl From for SetError { + fn from(_: std::collections::TryReserveError) -> Self { + SetError::OutOfMemory + } +} + +impl From for SetError { + fn from(_: hashbrown::TryReserveError) -> Self { + SetError::OutOfMemory + } +} diff --git a/libdd-profiling/src/profiles/collections/mod.rs b/libdd-profiling/src/profiles/collections/mod.rs new file mode 100644 index 0000000000..514402f7c4 --- /dev/null +++ b/libdd-profiling/src/profiles/collections/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +mod error; +mod set; +mod slice_set; +mod string_set; +mod thin_str; + +pub type SetHasher = core::hash::BuildHasherDefault; + +pub use error::*; +pub use set::*; +pub use slice_set::*; +pub use string_set::*; +pub use thin_str::*; diff --git a/libdd-profiling/src/profiles/collections/set.rs b/libdd-profiling/src/profiles/collections/set.rs new file mode 100644 index 0000000000..ef378e9da0 --- /dev/null +++ b/libdd-profiling/src/profiles/collections/set.rs @@ -0,0 +1,236 @@ +// Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use super::SetError; +use super::SetHasher as Hasher; +use core::hint::unreachable_unchecked; +use core::{fmt, mem, ptr}; +use hashbrown::HashTable; +use libdd_alloc::{Allocator, ChainAllocator, VirtualAllocator}; +use std::ffi::c_void; +use std::hash::{BuildHasher, Hash}; + +pub const SET_MIN_CAPACITY: usize = 14; + +#[repr(transparent)] +#[derive(Debug, Eq, Hash, PartialEq)] +pub struct SetId(pub(crate) ptr::NonNull); + +impl SetId { + /// Cast to another type. Although this is safe, using the result is not + /// necessarily safe. + #[inline] + #[must_use] + pub fn cast(self) -> SetId { + SetId(self.0.cast()) + } + + pub fn into_raw(self) -> ptr::NonNull { + self.0.cast() + } + + /// Re-creates a [`SetId`] from calling [`SetId::into_raw`]. + /// + /// # Safety + /// + /// The set it belongs to must still be alive, and the repr should be + /// unchanged since it was created by [`SetId::into_raw`]. + pub unsafe fn from_raw(raw: ptr::NonNull) -> Self { + Self(raw.cast::()) + } +} + +// This is different from derive(Clone), because derive(Clone) will be Clone +// only if T is Clone, and that's not true here--the type is always Clone. 
+impl Clone for SetId { + fn clone(&self) -> Self { + *self + } +} +impl Copy for SetId {} + +impl fmt::Debug for Set { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Set").field("table", &self.table).finish() + } +} + +pub struct Set { + pub(crate) arena: ChainAllocator, + pub(crate) table: HashTable>, +} + +impl Set { + const SIZE_HINT: usize = 1024 * 1024; + + pub fn try_new() -> Result { + Self::try_with_capacity(SET_MIN_CAPACITY) + } + + #[inline] + pub(crate) fn allocate_one(&mut self, value: T) -> Result, SetError> { + let layout = core::alloc::Layout::new::(); + // Allocate raw bytes for a single `T` + let obj = self.arena.allocate(layout)?; // Result, AllocError> + let raw_slice_ptr: *mut [u8] = obj.as_ptr(); + let raw = raw_slice_ptr as *mut u8 as *mut T; + + // SAFETY: `raw` points to allocated, properly aligned memory for `T`. + unsafe { ptr::write(raw, value) }; + + // SAFETY: cannot be null as it was just allocated. + Ok(unsafe { ptr::NonNull::new_unchecked(raw) }) + } + + pub fn try_insert(&mut self, value: T) -> Result, SetError> { + let hash = Hasher::default().hash_one(&value); + // SAFETY: hash computed by this set's hasher for value. + if let Some(existing) = unsafe { self.find_with_hash(hash, &value) } { + return Ok(existing); + } + // SAFETY: hash computed by this set's hasher, uniqueness is enforced + // by a prior find. + unsafe { self.insert_unique_uncontended_with_hash(hash, value) } + } + + pub fn len(&self) -> usize { + self.table.len() + } + + pub fn is_empty(&self) -> bool { + self.table.is_empty() + } + pub fn capacity(&self) -> usize { + self.table.capacity() + } + + /// Returns the `SetId` for `value` if it exists in the set, without inserting. + /// + /// This is primarily intended for tests and debugging. In typical usage + /// you should prefer `try_insert` which handles both existence checks and + /// insertion atomically in the intended access pattern. 
+ pub fn find(&self, value: &T) -> Option> { + let hash = Hasher::default().hash_one(value); + // SAFETY: `hash` was computed using this set's hasher over `&value`. + unsafe { self.find_with_hash(hash, value) } + } + + /// Returns a shared reference to the value for a given `SetId`. + /// + /// # Safety + /// - The `id` must have been obtained from this exact `Set` instance (or remain valid for + /// it). Using an id from another set, or after the backing arena is torn down, is undefined + /// behavior. + /// # Safety + /// - `id` must have been obtained from this exact `Set` instance and still refer to a live + /// element in its arena. + pub unsafe fn get(&self, id: SetId) -> &T { + // SAFETY: Caller guarantees the `SetId` belongs to this set and points + // to a live, properly aligned `T` in the arena. + unsafe { id.0.as_ref() } + } +} + +impl Drop for Set { + fn drop(&mut self) { + if mem::needs_drop::() { + for nn in self.table.iter() { + // SAFETY: Elements in the table were allocated and initialized + // via `allocate_one` and remain valid for the lifetime of this + // set (arena-backed). We only drop if `T` requires dropping. + unsafe { ptr::drop_in_place(nn.as_ptr()) }; + } + } + } +} + +impl Set { + fn try_with_capacity(capacity: usize) -> Result { + let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {}); + let mut table = HashTable::new(); + + // SAFETY: new empty table cannot require rehash, callback unreachable. + table.try_reserve(capacity, |_| unsafe { unreachable_unchecked() })?; + Ok(Self { arena, table }) + } + + unsafe fn find_with_hash(&self, hash: u64, key: &T) -> Option> { + let found = self + .table + // SAFETY: NonNull inside table points to live, properly aligned Ts. 
+ .find(hash, |nn| unsafe { nn.as_ref() == key })?; + Some(SetId(*found)) + } + + unsafe fn insert_unique_uncontended_with_hash( + &mut self, + hash: u64, + value: T, + ) -> Result, SetError> { + // Reserve table space BEFORE allocating the new value so we don't + // need to drop it on reserve failure. + // SAFETY: NonNull entries are valid; closure only hashes existing entries. + self.table + .try_reserve(1, |nnv| Hasher::default().hash_one(unsafe { nnv.as_ref() }))?; + let nn = self.allocate_one(value)?; + // SAFETY: reserve above guarantees no rehash occurs; closure unreachable. + self.table + .insert_unique(hash, nn, |_| unsafe { unreachable_unchecked() }); + Ok(SetId(nn)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + use std::collections::HashSet as StdHashSet; + use std::sync::{Arc, Weak}; + + proptest! { + #![proptest_config(ProptestConfig { + cases: if cfg!(miri) { 4 } else { 64 }, + .. ProptestConfig::default() + })] + + #[test] + fn proptest_matches_std_hashset(values in proptest::collection::vec(any::(), 0..if cfg!(miri) { 32 } else { 512 })) { + let mut set = Set::::try_new().unwrap(); + let mut shadow = StdHashSet::::new(); + + for v in &values { + shadow.insert(*v); + let _ = set.try_insert(*v).unwrap(); + } + + prop_assert_eq!(set.len(), shadow.len()); + + for &v in &shadow { + let id = set.find(&v).unwrap(); + // SAFETY: id just obtained from this set + let fetched = unsafe { set.get(id) }; + prop_assert_eq!(*fetched, v); + } + } + } + + #[test] + fn set_drops_elements_on_drop() { + let mut set = Set::>::try_new().unwrap(); + let mut weaks: Vec> = Vec::new(); + + let total = if cfg!(miri) { 8 } else { 64 }; + for i in 0..total { + let arc = Arc::new(i as u64); + weaks.push(Arc::downgrade(&arc)); + // Transfer ownership into the set + let _ = set.try_insert(arc).unwrap(); + } + + drop(set); + + for (idx, w) in weaks.iter().enumerate() { + assert!(w.upgrade().is_none(), "weak at {idx} still alive"); + } + } +} diff 
--git a/libdd-profiling/src/profiles/collections/slice_set.rs b/libdd-profiling/src/profiles/collections/slice_set.rs new file mode 100644 index 0000000000..e2aa3636f7 --- /dev/null +++ b/libdd-profiling/src/profiles/collections/slice_set.rs @@ -0,0 +1,131 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use super::{SetError, ThinSlice}; +use core::hash::{BuildHasher, Hash}; +use core::hint::unreachable_unchecked; +use hashbrown::HashTable; +use libdd_alloc::{ChainAllocator, VirtualAllocator}; + +use super::SetHasher as Hasher; + +/// Holds unique slices and provides handles to fetch them later. +pub struct SliceSet { + /// The bytes of each slice stored in `slices` are allocated here. + pub(crate) arena: ChainAllocator, + + /// The unordered hash set of unique slices. + /// The static lifetimes are a lie; they are tied to the `arena`, which is + /// only moved if the slice set is moved. + /// References to the underlying slices should generally not be handed, + /// but if they are, they should be bound to the slice set's lifetime. + pub(crate) slices: HashTable>, +} + +impl SliceSet { + const SIZE_HINT: usize = 1024 * 1024; + + pub fn try_with_capacity(capacity: usize) -> Result { + let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {}); + + let mut slices = HashTable::new(); + // SAFETY: we just made the empty hash table, so there's nothing that + // needs to be rehashed. + slices.try_reserve(capacity, |_| unsafe { unreachable_unchecked() })?; + + Ok(SliceSet { arena, slices }) + } + + /// # Safety + /// + /// The slice must not already exist within the set. + pub unsafe fn insert_unique_uncontended( + &mut self, + slice: &[T], + ) -> Result, SetError> { + let hash = Hasher::default().hash_one(slice); + self.insert_unique_uncontended_with_hash(hash, slice) + } + + /// # Safety + /// 1. 
The hash must be the same as if the slice was re-hashed with the hasher the slice set + /// would use. + /// 2. The slice must not already exist within the set. + #[inline(never)] + pub unsafe fn insert_unique_uncontended_with_hash( + &mut self, + hash: u64, + slice: &[T], + ) -> Result, SetError> { + let obj = ThinSlice::try_allocate_for(slice, &self.arena)?; + let uninit = unsafe { &mut *obj.as_ptr() }; + let new_slice = ThinSlice::try_from_slice_in(slice, uninit)?; + + self.slices + .try_reserve(1, |thin| Hasher::default().hash_one(thin.as_slice()))?; + + // Add it to the set. The memory was previously reserved. + // SAFETY: The try_reserve above means any necessary re-hashing has + // already been done, so the hash closure cannot be called. + self.slices + .insert_unique(hash, new_slice, |_| unsafe { unreachable_unchecked() }); + + Ok(new_slice) + } + + /// Adds the slice to the slice set if it isn't present already, and + /// returns a handle to the slice that can be used to retrieve it later. + pub fn try_insert(&mut self, slice: &[T]) -> Result, SetError> + where + T: Hash, + { + let hash = Hasher::default().hash_one(slice); + + // SAFETY: the slice's hash is correct, we use the same hasher as + // SliceSet uses. + if let Some(id) = unsafe { self.find_with_hash(hash, slice) } { + return Ok(id); + } + + // SAFETY: we just checked above that the slice isn't in the set. + unsafe { self.insert_unique_uncontended(slice) } + } + + /// # Safety + /// The hash must be the same as if the slice was re-hashed with the + /// hasher the slice set would use. + #[inline(never)] + pub(crate) unsafe fn find_with_hash( + &self, + hash: u64, + slice: &[T], + ) -> Option> + where + T: PartialEq, + { + let interned_slice = self + .slices + .find(hash, |thin_slice| thin_slice.as_slice() == slice)?; + Some(*interned_slice) + } + + /// Returns an iterator over all slices in the set. 
+ pub fn iter(&self) -> impl Iterator> + '_ { + self.slices.iter().copied() + } + + /// Returns the number of slices in the set. + pub fn len(&self) -> usize { + self.slices.len() + } + + /// Returns true if the set is empty. + pub fn is_empty(&self) -> bool { + self.slices.is_empty() + } + + /// Returns the capacity of the hash table. + pub fn capacity(&self) -> usize { + self.slices.capacity() + } +} diff --git a/libdd-profiling/src/profiles/collections/string_set.rs b/libdd-profiling/src/profiles/collections/string_set.rs new file mode 100644 index 0000000000..23a51b166c --- /dev/null +++ b/libdd-profiling/src/profiles/collections/string_set.rs @@ -0,0 +1,351 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use super::slice_set::SliceSet; +use super::SetError; +use super::ThinStr; +use std::ffi::c_void; +use std::hash::BuildHasher; +use std::ops::Deref; +use std::ptr::NonNull; + +use super::SetHasher as Hasher; + +/// Represents a handle to a string that can be retrieved by the string set. +/// The exact representation is not a public detail; it is only available so +/// that it is known for FFI size and alignment. +/// +/// Some [`StringRef`]s refer to well-known strings, which always exist in +/// every string table. +/// +/// The caller needs to ensure the string set it was created from always exists +/// when a StringId is dereferenced. +#[repr(transparent)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct StringRef(pub ThinStr<'static>); + +impl StringRef { + pub fn into_raw(self) -> NonNull { + self.0.into_raw() + } + + /// Re-creates a [`StringRef`] created by [`StringRef::into_raw`]. + /// + /// # Safety + /// + /// `this` needs to be created from [``StringRef::into_raw`] and the set + /// it belongs to should still be alive. 
+ pub unsafe fn from_raw(this: NonNull) -> Self { + Self(ThinStr::from_raw(this)) + } +} + +impl From<&StringRef> for StringRef { + fn from(value: &StringRef) -> Self { + *value + } +} + +impl Default for StringRef { + fn default() -> Self { + Self::EMPTY + } +} + +impl StringRef { + pub const EMPTY: StringRef = StringRef(ThinStr::new()); + pub const END_TIMESTAMP_NS: StringRef = StringRef(ThinStr::end_timestamp_ns()); + pub const LOCAL_ROOT_SPAN_ID: StringRef = StringRef(ThinStr::local_root_span_id()); + pub const TRACE_ENDPOINT: StringRef = StringRef(ThinStr::trace_endpoint()); + pub const SPAN_ID: StringRef = StringRef(ThinStr::span_id()); +} + +pub const WELL_KNOWN_STRING_REFS: [StringRef; 5] = [ + StringRef::EMPTY, + StringRef::END_TIMESTAMP_NS, + StringRef::LOCAL_ROOT_SPAN_ID, + StringRef::TRACE_ENDPOINT, + StringRef::SPAN_ID, +]; + +/// Holds unique strings and provides [`StringRef`]s to fetch them later. +/// This is a newtype around SliceSet to enforce UTF-8 invariants. +pub struct UnsyncStringSet(SliceSet); + +impl UnsyncStringSet { + pub fn try_with_capacity(capacity: usize) -> Result { + let mut set = Self(SliceSet::try_with_capacity(capacity)?); + let strings = &mut set.0.slices; + for id in WELL_KNOWN_STRING_REFS { + let hash = Hasher::default().hash_one(id.0.deref().as_bytes()); + strings.insert_unique(hash, id.0.into(), |t| Hasher::default().hash_one(t.deref())); + } + + Ok(set) + } + + /// Creates a new string set, which initially holds the empty string and + /// other well-known strings. The well-known strings are always + /// available and can be fetched using the [`WELL_KNOWN_STRING_REFS`]. 
+ pub fn try_new() -> Result { + Self::try_with_capacity(28) + } + + unsafe fn find_with_hash(&self, hash: u64, str: &str) -> Option { + let interned_str = self.0.slices.find(hash, |thin_slice| { + // SAFETY: We only store valid UTF-8 in string sets + let slice_str = unsafe { std::str::from_utf8_unchecked(thin_slice.as_slice()) }; + slice_str == str + })?; + Some(StringRef((*interned_str).into())) + } + + /// # Safety + /// 1. The hash must be the same as if the str was re-hashed with the hasher the string set + /// would use. + /// 2. The string must be unique within the set. + pub unsafe fn insert_unique_uncontended(&mut self, str: &str) -> Result { + let hash = Hasher::default().hash_one(str.as_bytes()); + self.insert_unique_uncontended_with_hash(hash, str) + } + + /// Inserts a string into the string set without checking for duplicates, using a pre-calculated + /// hash. + /// + /// # Safety + /// 1. The caller must ensure that the hash was computed using the same hasher the string set + /// would use. + /// 2. The string must be unique within the set. + pub unsafe fn insert_unique_uncontended_with_hash( + &mut self, + hash: u64, + str: &str, + ) -> Result { + let new_slice = self + .0 + .insert_unique_uncontended_with_hash(hash, str.as_bytes())?; + Ok(StringRef(new_slice.into())) + } + + /// Adds the string to the string set if it isn't present already, and + /// returns a handle to the string that can be used to retrieve it later. + pub fn try_insert(&mut self, str: &str) -> Result { + let hash = Hasher::default().hash_one(str.as_bytes()); + unsafe { self.try_insert_with_hash(hash, str) } + } + + /// Adds the string to the string set if it isn't present already, using a pre-calculated hash. + /// Returns a handle to the string that can be used to retrieve it later. + /// + /// # Safety + /// The caller must ensure that the hash was computed using the same hasher the string set would + /// use. 
+ pub unsafe fn try_insert_with_hash( + &mut self, + hash: u64, + str: &str, + ) -> Result { + // SAFETY: the string's hash is correct, we use the same hasher as + // StringSet uses. + if let Some(id) = self.find_with_hash(hash, str) { + return Ok(id); + } + + // SAFETY: we just checked above that the string isn't in the set. + self.insert_unique_uncontended_with_hash(hash, str) + } + + /// Returns an iterator over all strings in the set as [`StringRef`]s. + pub fn string_ids(&self) -> impl Iterator + '_ { + self.0.slices.iter().map(|slice| StringRef((*slice).into())) + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn capacity(&self) -> usize { + self.0.capacity() + } + + /// # Safety + /// The caller must ensure that the `StringId` was obtained from this set + /// (or is a well-known id) and that the set outlives the returned `&str`. + pub unsafe fn get_string(&self, id: StringRef) -> &str { + // SAFETY: the lifetime extension is safe as long as caller respects + // this function's safety. 
+ unsafe { core::mem::transmute::<&str, &str>(id.0.deref()) } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_string_set_basic_operations() { + let mut set = UnsyncStringSet::try_new().unwrap(); + + // Test inserting new strings + let id1 = set.try_insert("hello").unwrap(); + let id2 = set.try_insert("world").unwrap(); + let id3 = set.try_insert("hello").unwrap(); // duplicate + + // Verify duplicate returns same ID + assert_eq!(&*id1.0, &*id3.0); + assert_ne!(&*id1.0, &*id2.0); + + // Verify retrieval + unsafe { + assert_eq!(set.get_string(id1), "hello"); + assert_eq!(set.get_string(id2), "world"); + assert_eq!(set.get_string(id3), "hello"); + } + } + + #[test] + fn test_string_lengths_and_alignment() { + let mut set = UnsyncStringSet::try_new().unwrap(); + + // Test various string lengths that might cause alignment issues + let test_strings = [ + "", // 0 bytes + "a", // 1 byte + "ab", // 2 bytes + "abc", // 3 bytes + "abcd", // 4 bytes + "abcdefg", // 7 bytes + "abcdefgh", // 8 bytes (usize boundary on 64-bit) + "abcdefghijklmno", // 15 bytes + "abcdefghijklmnop", // 16 bytes + "abcdefghijklmnopqrstuvwxyz123456789", // 35 bytes + ]; + + let mut ids = Vec::new(); + for s in &test_strings { + let id = set.try_insert(s).unwrap(); + ids.push(id); + } + + // Verify all strings can be retrieved correctly + for (id, expected) in ids.iter().zip(&test_strings) { + unsafe { + assert_eq!(set.get_string(*id), *expected); + } + } + } + + #[test] + fn test_unicode_strings() { + let mut set = UnsyncStringSet::try_new().unwrap(); + + let unicode_strings = [ + "café", // Latin with accents + "🦀", // Emoji (4 bytes) + "こんにちは", // Japanese + "Здравствуй", // Cyrillic + "🔥💯✨", // Multiple emoji + "a\u{0000}b", // Embedded null + "line1\nline2", // Newline + "tab\there", // Tab + ]; + + let mut ids = Vec::new(); + for s in &unicode_strings { + let id = set.try_insert(s).unwrap(); + ids.push(id); + } + + // Verify all Unicode strings are preserved correctly 
+ for (id, expected) in ids.iter().zip(&unicode_strings) { + unsafe { + assert_eq!(set.get_string(*id), *expected); + } + } + } + + #[test] + fn test_capacity_and_growth() { + // Test with minimal capacity + let mut set = UnsyncStringSet::try_with_capacity(1).unwrap(); + + // Insert more strings than initial capacity to force growth + let test_strings: Vec = (0..50).map(|i| format!("growth_test_{}", i)).collect(); + + let mut ids = Vec::new(); + for s in &test_strings { + let id = set.try_insert(s).unwrap(); + ids.push(id); + } + + // Verify all strings are still accessible after growth + for (id, expected) in ids.iter().zip(&test_strings) { + unsafe { + assert_eq!(set.get_string(*id), expected); + } + } + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_large_strings() { + let mut set = UnsyncStringSet::try_new().unwrap(); + + // Test moderately large string + let large_string = "x".repeat(1024); + let id1 = set.try_insert(&large_string).unwrap(); + + unsafe { + assert_eq!(set.get_string(id1), large_string); + } + + // Test very large string + let very_large_string = "y".repeat(65536); + let id2 = set.try_insert(&very_large_string).unwrap(); + + unsafe { + assert_eq!(set.get_string(id2), very_large_string); + // Verify first string is still intact + assert_eq!(set.get_string(id1), large_string); + } + + // Test extremely large string (>2 MiB) to trigger different ChainAllocator path + let huge_string = "z".repeat(2 * 1024 * 1024 + 1000); // >2 MiB + let id3 = set.try_insert(&huge_string).unwrap(); + + unsafe { + assert_eq!(set.get_string(id3), huge_string); + // Verify previous strings are still intact + assert_eq!(set.get_string(id1), large_string); + assert_eq!(set.get_string(id2), very_large_string); + } + } + + #[test] + fn test_many_small_strings() { + const NUM_STRINGS: usize = if cfg!(miri) { 100 } else { 1000 }; + let mut set = UnsyncStringSet::try_new().unwrap(); + + // Insert many small strings to test fragmentation and growth + let mut ids = 
Vec::with_capacity(NUM_STRINGS); + let mut expected = Vec::with_capacity(NUM_STRINGS); + + for i in 0..NUM_STRINGS { + let s = format!("{}", i); + let id = set.try_insert(&s).unwrap(); + ids.push(id); + expected.push(s); + } + + // Verify all strings are still correct + for (id, expected_str) in ids.iter().zip(&expected) { + unsafe { + assert_eq!(set.get_string(*id), expected_str); + } + } + } +} diff --git a/libdd-profiling/src/profiles/collections/thin_str.rs b/libdd-profiling/src/profiles/collections/thin_str.rs new file mode 100644 index 0000000000..1850ccf40a --- /dev/null +++ b/libdd-profiling/src/profiles/collections/thin_str.rs @@ -0,0 +1,762 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use libdd_alloc::{AllocError, Allocator}; +use std::alloc::Layout; +use std::borrow::Borrow; +use std::ffi::c_void; +use std::marker::PhantomData; +use std::mem::MaybeUninit; +use std::ops::Deref; +use std::ptr::NonNull; +use std::{fmt, hash, ptr}; + +const USIZE_WIDTH: usize = core::mem::size_of::(); + +/// A struct which acts like a thin slice reference. It does this by storing +// the length of the slice just before the elements of the slice. +#[derive(Copy, Clone)] +#[repr(C)] +pub struct ThinSlice<'a, T: Copy> { + thin_ptr: ThinPtr, + + /// Since [`ThinSlice`] doesn't hold a reference but acts like one, + // indicate this to the compiler with phantom data. + // This takes up no space. + _marker: PhantomData<&'a [T]>, +} + +impl fmt::Debug for ThinSlice<'_, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(f) + } +} + +/// A struct which acts like a thin &str. It does this by storing the size +/// of the string just before the bytes of the string. 
+#[derive(Copy, Clone)] +#[repr(transparent)] +pub struct ThinStr<'a> { + inner: ThinSlice<'a, u8>, +} + +impl ThinStr<'_> { + pub fn into_raw(self) -> NonNull { + self.inner.thin_ptr.size_ptr.cast() + } + + /// Re-creates a [`ThinStr`] created by [`ThinStr::into_raw`]. + /// + /// # Safety + /// + /// `this` needs to be created from [``ThinStr::into_raw`] and the storage + /// it belongs to should still be alive. + pub unsafe fn from_raw(this: NonNull) -> Self { + // SAFETY: `this` must have been produced by `ThinStr::into_raw` for + // a compatible `ThinStr` allocation. After calling this function, the + // original raw pointer must not be used again. + let thin_ptr = ThinPtr { + size_ptr: this.cast(), + _marker: PhantomData, + }; + Self { + inner: ThinSlice { + thin_ptr, + _marker: PhantomData, + }, + } + } +} + +impl fmt::Debug for ThinStr<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(f) + } +} + +// SAFETY: ThinStr is safe to send between threads as long as the underlying +// arena/storage remains alive. The caller must ensure the arena outlives all +// ThinStr references. This is the design trade-off for better performance +// than individual reference counting. 
// SAFETY: these types are shared, read-only views into arena storage (like
// `&[T]`), so they are `Send`/`Sync` exactly when a shared reference to `T`
// would be, i.e. when `T: Sync`. The caller must ensure the arena outlives
// every view.
// NOTE(review): the original trait bounds were lost in extraction — confirm
// against the upstream source that `T: Sync` matches.
unsafe impl<T: Sync> Send for ThinPtr<T> {}
unsafe impl<T: Sync> Sync for ThinPtr<T> {}

unsafe impl<T: Sync> Send for ThinSlice<'_, T> {}
unsafe impl<T: Sync> Sync for ThinSlice<'_, T> {}

unsafe impl Send for ThinStr<'_> {}
unsafe impl Sync for ThinStr<'_> {}

impl ThinStr<'static> {
    /// Builds a `'static` [`ThinStr`] backed by a [`ConstString`]'s storage.
    /// Shared by all the well-known-string constructors below.
    const fn from_const_storage(thin_ptr: ThinPtr<u8>) -> ThinStr<'static> {
        ThinStr {
            inner: ThinSlice {
                thin_ptr,
                _marker: PhantomData,
            },
        }
    }

    /// The empty string, backed by static storage (no allocation).
    pub const fn new() -> ThinStr<'static> {
        Self::from_const_storage(EMPTY_INLINE_STRING.as_thin_ptr())
    }

    /// The well-known label key "end_timestamp_ns".
    pub const fn end_timestamp_ns() -> ThinStr<'static> {
        Self::from_const_storage(END_TIMESTAMP_NS.as_thin_ptr())
    }

    /// The well-known label key "local root span id".
    pub const fn local_root_span_id() -> ThinStr<'static> {
        Self::from_const_storage(LOCAL_ROOT_SPAN_ID.as_thin_ptr())
    }

    /// The well-known label key "trace endpoint".
    pub const fn trace_endpoint() -> ThinStr<'static> {
        Self::from_const_storage(TRACE_ENDPOINT.as_thin_ptr())
    }

    /// The well-known label key "span id".
    pub const fn span_id() -> ThinStr<'static> {
        Self::from_const_storage(SPAN_ID.as_thin_ptr())
    }
}

impl Default for ThinStr<'static> {
    /// Defaults to the empty string (static storage, no allocation).
    fn default() -> Self {
        Self::new()
    }
}

impl<const N: usize> Borrow<InlineString> for ConstString<N> {
    fn borrow(&self) -> &InlineString {
        let thin_ptr: ThinPtr<u8> = ThinPtr {
            size_ptr: NonNull::from(self).cast::<u8>(),
            _marker: PhantomData,
        };
        // SAFETY: the object is layout compatible and lifetime is safe, and
        // inline strings are valid UTF-8.
        unsafe { &*thin_ptr.inline_string_ptr().as_ptr() }
    }
}

/// A thin (single-word) pointer to a length-prefixed inline slice of `T`.
#[repr(transparent)]
#[derive(Clone, Copy)]
struct ThinPtr<T> {
    /// Points to the size prefix that precedes an inline slice of `T`.
    size_ptr: NonNull<u8>,
    _marker: PhantomData<T>,
}

/// Length-prefixed slice stored inline: `size` bytes, then `data`.
#[repr(C)]
pub struct InlineSlice<T> {
    /// Stores the len of `data` in native endian.
    size: [u8; core::mem::size_of::<usize>()],
    data: [T],
}

impl<T> Deref for InlineSlice<T> {
    type Target = [T];
    fn deref(&self) -> &Self::Target {
        &self.data
    }
}

/// Length-prefixed string stored inline: `size` bytes, then UTF-8 `data`.
#[repr(C)]
pub struct InlineString {
    /// Stores the len of `data` in native endian.
    size: [u8; core::mem::size_of::<usize>()],
    data: str,
}

impl Deref for InlineString {
    type Target = str;
    fn deref(&self) -> &Self::Target {
        &self.data
    }
}

impl<T> ThinPtr<T> {
    /// Reads the size prefix to get the length of the slice.
    const fn len(self) -> usize {
        // SAFETY: ThinPtr points to the size prefix of the slice.
        let size = unsafe { self.size_ptr.cast::<[u8; USIZE_WIDTH]>().as_ptr().read() };
        usize::from_ne_bytes(size)
    }

    /// Returns a wide pointer to an inline slice. The pointer is mut but you
    /// most likely shouldn't modify it.
    const fn inline_slice_ptr(self) -> NonNull<InlineSlice<T>> {
        let len = self.len();
        let slice = ptr::slice_from_raw_parts_mut(self.size_ptr.as_ptr(), len);
        // SAFETY: derived from a non-null pointer self.size_ptr.
        unsafe { NonNull::new_unchecked(slice as *mut [()] as *mut InlineSlice<T>) }
    }
}

impl ThinPtr<u8> {
    /// Returns a wide pointer to an inline string. The pointer is mut but you
    /// most likely shouldn't modify it.
    ///
    /// # Safety
    /// The bytes must be valid UTF-8 and originate from a valid `InlineString`
    /// layout created by this module.
    const unsafe fn inline_string_ptr(self) -> NonNull<InlineString> {
        let len = self.len();
        let slice = ptr::slice_from_raw_parts_mut(self.size_ptr.as_ptr(), len);
        // SAFETY: derived from a non-null pointer self.size_ptr.
        unsafe { NonNull::new_unchecked(slice as *mut [()] as *mut InlineString) }
    }
}

// Generic ThinSlice implementation
impl<'a, T: Copy> ThinSlice<'a, T> {
    /// Returns the length of the slice.
    pub fn len(&self) -> usize {
        self.thin_ptr.len()
    }

    /// Returns true if the slice is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the slice as a `&[T]`.
+ pub fn as_slice(&self) -> &[T] { + // SAFETY: ThinSlice is layout compatible with InlineSlice, and the + // lifetime is correct. + let inline_slice = unsafe { self.thin_ptr.inline_slice_ptr().as_ref() }; + &inline_slice.data + } + + /// Computes the layout for a slice of the given length. + pub fn layout_for(slice: &[T]) -> Result { + let len = slice.len(); + let element_size = core::mem::size_of::(); + let data_size = len.checked_mul(element_size).ok_or(AllocError)?; + let total_size = USIZE_WIDTH.checked_add(data_size).ok_or(AllocError)?; + Layout::from_size_align(total_size, 1).map_err(|_| AllocError) + } + + /// Allocates memory for a slice and returns a pointer to uninitialized memory. + pub fn try_allocate_for( + slice: &[T], + alloc: &A, + ) -> Result]>, AllocError> { + let layout = Self::layout_for(slice)?; + let obj = alloc.allocate(layout)?; + let ptr = obj.cast::>(); + Ok(NonNull::slice_from_raw_parts(ptr, obj.len())) + } + + /// Tries to create a [`ThinSlice`] in the uninitialized space. + /// + /// # Errors + /// + /// Returns an error if the spare capacity is not large enough. + pub fn try_from_slice_in( + slice: &[T], + spare_capacity: &'a mut [MaybeUninit], + ) -> Result { + let layout = Self::layout_for(slice)?; + if spare_capacity.len() < layout.size() { + return Err(AllocError); + } + + let allocation = spare_capacity.as_mut_ptr().cast::(); + + // Write the size prefix + let size_bytes = slice.len().to_ne_bytes(); + // SAFETY: we've verified the allocation is big enough and aligned. + unsafe { core::ptr::copy_nonoverlapping(size_bytes.as_ptr(), allocation, USIZE_WIDTH) }; + + // Write the data + let data = unsafe { allocation.add(USIZE_WIDTH).cast::() }; + // SAFETY: the allocation is big enough, locations are distinct, and + // the memory is safe for writing. 
+ unsafe { core::ptr::copy_nonoverlapping(slice.as_ptr(), data, slice.len()) }; + + let size_ptr = unsafe { NonNull::new_unchecked(allocation) }; + let thin_ptr = ThinPtr { + size_ptr, + _marker: PhantomData, + }; + let _marker = PhantomData; + Ok(ThinSlice { thin_ptr, _marker }) + } + + /// Creates a [`ThinSlice`] in the uninitialized space without checking capacity. + /// + /// # Safety + /// + /// The caller must ensure that `spare_capacity` has enough space for the slice + /// as determined by [`Self::layout_for`]. + pub unsafe fn from_slice_in_unchecked( + slice: &[T], + spare_capacity: &'a mut [MaybeUninit], + ) -> Self { + let allocation = spare_capacity.as_mut_ptr().cast::(); + + // Write the size prefix + let size_bytes = slice.len().to_ne_bytes(); + core::ptr::copy_nonoverlapping(size_bytes.as_ptr(), allocation, USIZE_WIDTH); + + // Write the data + let data = unsafe { allocation.add(USIZE_WIDTH).cast::() }; + core::ptr::copy_nonoverlapping(slice.as_ptr(), data, slice.len()); + + let size_ptr = NonNull::new_unchecked(allocation); + let thin_ptr = ThinPtr { + size_ptr, + _marker: PhantomData, + }; + let _marker = PhantomData; + ThinSlice { thin_ptr, _marker } + } + + /// Returns the memory layout of this slice. + pub fn layout(&self) -> Layout { + // layout_for only fails on overflow or invalid align; for valid T and lengths + // produced by this type, it should always succeed. In case of error, fall back to + // a conservative layout that matches the actual allocation. 
+ Self::layout_for(self.as_slice()).unwrap_or_else(|_| unsafe { + // Size = prefix + data length, with alignment of 1 + Layout::from_size_align_unchecked(USIZE_WIDTH + self.len(), 1) + }) + } +} + +impl Deref for ThinSlice<'_, T> { + type Target = [T]; + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl PartialEq for ThinSlice<'_, T> +where + T: PartialEq, +{ + fn eq(&self, other: &Self) -> bool { + self.as_slice() == other.as_slice() + } +} + +impl Eq for ThinSlice<'_, T> where T: Eq {} + +impl hash::Hash for ThinSlice<'_, T> +where + T: hash::Hash, +{ + fn hash(&self, state: &mut H) { + self.as_slice().hash(state) + } +} + +impl Borrow<[T]> for ThinSlice<'_, T> { + fn borrow(&self) -> &[T] { + self.as_slice() + } +} + +impl Borrow> for ThinSlice<'_, T> { + fn borrow(&self) -> &InlineSlice { + // SAFETY: ThinSlice is layout compatible with InlineSlice, and the + // lifetime is correct. + unsafe { self.thin_ptr.inline_slice_ptr().as_ref() } + } +} + +// String-specific ThinStr implementations that delegate to ThinSlice +impl<'a> ThinStr<'a> { + // Note: len(), is_empty(), and as_bytes() are available through Deref + + /// Computes the layout for a string of the given length. + pub fn layout_for(str: &str) -> Result { + ThinSlice::layout_for(str.as_bytes()) + } + + /// Allocates memory for a string and returns a pointer to uninitialized memory. + pub fn try_allocate_for( + str: &str, + alloc: &A, + ) -> Result]>, AllocError> { + ThinSlice::try_allocate_for(str.as_bytes(), alloc) + } + + /// Tries to create a [`ThinStr`] in the uninitialized space. + /// + /// # Errors + /// + /// Returns an error if the spare capacity is not large enough. + pub fn try_from_str_in( + str: &str, + spare_capacity: &'a mut [MaybeUninit], + ) -> Result { + let inner = ThinSlice::try_from_slice_in(str.as_bytes(), spare_capacity)?; + Ok(ThinStr { inner }) + } + + /// Creates a [`ThinStr`] in the uninitialized space without checking capacity. 
+ /// + /// # Safety + /// + /// The caller must ensure that `spare_capacity` has enough space for the string + /// as determined by [`Self::layout_for`]. + pub unsafe fn from_str_in_unchecked( + str: &str, + spare_capacity: &'a mut [MaybeUninit], + ) -> Self { + let inner = ThinSlice::from_slice_in_unchecked(str.as_bytes(), spare_capacity); + ThinStr { inner } + } + + /// Returns the memory layout of this string. + pub fn layout(&self) -> Layout { + self.inner.layout() + } +} + +impl Deref for ThinStr<'_> { + type Target = str; + fn deref(&self) -> &Self::Target { + let inline_string: &InlineString = self.borrow(); + &inline_string.data + } +} + +impl Borrow for ThinStr<'_> { + fn borrow(&self) -> &str { + self.deref() + } +} + +impl Borrow for ThinStr<'_> { + fn borrow(&self) -> &InlineString { + // SAFETY: as long as the lifetime is correct, then this is also safe. + // If the caller is lying about the lifetime (e.g. dynamic lifetimes) + // then the caller needs to be cautious about borrowing this, and + // ThinStr only stores valid UTF-8 strings. + unsafe { self.inner.thin_ptr.inline_string_ptr().as_ref() } + } +} + +impl PartialEq for ThinStr<'_> { + fn eq(&self, other: &Self) -> bool { + self.deref() == other.deref() + } +} + +impl Eq for ThinStr<'_> {} + +impl hash::Hash for ThinStr<'_> { + fn hash(&self, state: &mut H) { + // Hash as a string to maintain consistency with &str + self.deref().hash(state) + } +} + +impl<'a> From> for ThinStr<'a> { + fn from(inner: ThinSlice<'a, u8>) -> Self { + ThinStr { inner } + } +} + +impl<'a> From> for ThinSlice<'a, u8> { + fn from(thin_str: ThinStr<'a>) -> Self { + thin_str.inner + } +} + +/// [`ConstString`] is used to create the storage needed for static strings +/// that back [`ThinStr`]s. +#[repr(C)] +pub struct ConstString { + /// Stores the len of `data`. + size: [u8; core::mem::size_of::()], + data: [u8; N], +} + +impl ConstString { + const fn new(str: &str) -> Self { + // Meant for complile-time validation. 
+ #[allow(clippy::panic)] + if str.len() != N { + panic!("string length and storage mismatch for ConstString") + } + ConstString:: { + size: N.to_ne_bytes(), + data: { + let src = str.as_bytes(); + let mut dst = [0u8; N]; + let mut i = 0usize; + while i < N { + dst[i] = src[i]; + i += 1; + } + dst + }, + } + } + const fn as_thin_ptr(&self) -> ThinPtr { + let ptr = core::ptr::addr_of!(self.size).cast::(); + // SAFETY: derived from static address, and ThinStr does not allow + // modifications, so the mut-cast is also fine. + let size_ptr = unsafe { NonNull::new_unchecked(ptr.cast_mut()) }; + ThinPtr { + size_ptr, + _marker: PhantomData, + } + } +} + +static EMPTY_INLINE_STRING: ConstString<0> = ConstString::new(""); +static END_TIMESTAMP_NS: ConstString<16> = ConstString::new("end_timestamp_ns"); +static LOCAL_ROOT_SPAN_ID: ConstString<18> = ConstString::new("local root span id"); +static TRACE_ENDPOINT: ConstString<14> = ConstString::new("trace endpoint"); +static SPAN_ID: ConstString<7> = ConstString::new("span id"); + +#[cfg(test)] +mod tests { + use super::*; + use libdd_alloc::Global; + + const TEST_STRINGS: [&str; 5] = [ + "datadog", + "MyNamespace.MyClass.MyMethod(Int32 id, String name)", + "/var/run/datadog/apm.socket", + "[truncated]", + "Sidekiq::❨╯°□°❩╯︵┻━┻", + ]; + + #[test] + fn test_allocation_and_deallocation() { + let alloc = &Global; + + let mut thin_strs: Vec = TEST_STRINGS + .iter() + .copied() + .map(|str| { + let obj = ThinStr::try_allocate_for(str, alloc).unwrap(); + // SAFETY: just allocated the bytes, no other references exist, + // so we can safely turn it into `&mut [MaybeUninit]`. + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(str, uninit).unwrap(); + let actual = thin_str.deref(); + assert_eq!(str, actual); + thin_str + }) + .collect(); + + // This could detect out-of-bounds reads. 
+ for (thin_str, str) in thin_strs.iter().zip(TEST_STRINGS) { + let actual = thin_str.deref(); + assert_eq!(str, actual); + } + + for thin_str in thin_strs.drain(..) { + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } + } + + #[test] + fn test_empty_string() { + let alloc = &Global; + + let obj = ThinStr::try_allocate_for("", alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in("", uninit).unwrap(); + + assert_eq!(thin_str.deref(), ""); + assert_eq!(thin_str.deref().len(), 0); + + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } + + #[test] + fn test_single_byte_strings() { + let alloc = &Global; + let single_bytes = ["a", "z", "0", "9", "!", "~"]; + + for &s in &single_bytes { + let obj = ThinStr::try_allocate_for(s, alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(s, uninit).unwrap(); + + assert_eq!(thin_str.deref(), s); + assert_eq!(thin_str.deref().len(), 1); + + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } + } + + #[test] + fn test_boundary_lengths() { + let alloc = &Global; + + // Test strings around common boundary sizes + let test_cases = [ + ("", 0), + ("a", 1), + ("ab", 2), + ("abc", 3), + ("abcd", 4), + ("abcdefg", 7), + ("abcdefgh", 8), + ("abcdefghijklmno", 15), + ("abcdefghijklmnop", 16), + ("abcdefghijklmnopqrstuvwxyz123456", 32), + ("abcdefghijklmnopqrstuvwxyz1234567", 33), + ]; + + for (s, expected_len) in test_cases { + assert_eq!(s.len(), expected_len); + + let obj = ThinStr::try_allocate_for(s, alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(s, uninit).unwrap(); + + assert_eq!(thin_str.deref(), s); + assert_eq!(thin_str.deref().len(), expected_len); + + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } + } + + #[test] + fn 
test_unicode_edge_cases() { + let alloc = &Global; + + let unicode_cases = [ + "é", // 2-byte UTF-8 + "€", // 3-byte UTF-8 + "🦀", // 4-byte UTF-8 + "\u{0000}", // Null character + "\u{FFFD}", // Replacement character + "a\u{0000}b", // Embedded null + "\n\r\t", // Control characters + "\u{1F600}\u{1F601}", // Multiple emoji + ]; + + for s in unicode_cases { + let obj = ThinStr::try_allocate_for(s, alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(s, uninit).unwrap(); + + assert_eq!(thin_str.deref(), s); + assert_eq!(thin_str.deref().len(), s.len()); + + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } + } + + #[test] + fn test_capacity() { + // Test that try_from_str_in fails when there's not enough space + let test_string = "hello world"; + let mut small_buffer = [std::mem::MaybeUninit::uninit(); 5]; // Too small + + let result = ThinStr::try_from_str_in(test_string, &mut small_buffer); + assert!(result.is_err()); + + // Test with exactly the right amount of space + let required_size = test_string.len() + core::mem::size_of::(); + let mut buffer = vec![std::mem::MaybeUninit::uninit(); required_size]; + + let thin_str = ThinStr::try_from_str_in(test_string, &mut buffer).unwrap(); + assert_eq!(thin_str.deref(), test_string); + } + + proptest::proptest! 
{ + #![proptest_config(proptest::prelude::ProptestConfig { + // Reduce test cases under miri for faster execution + cases: if cfg!(miri) { 16 } else { 256 }, + ..proptest::prelude::ProptestConfig::default() + })] + + #[test] + fn test_thin_str_properties(test_string in ".*") { + use std::borrow::Borrow; + use std::hash::{Hash, Hasher}; + use std::collections::hash_map::DefaultHasher; + + let alloc = &Global; + + // Test layout calculation property + let layout = ThinStr::layout_for(&test_string).unwrap(); + let min_size = test_string.len() + core::mem::size_of::(); + assert!(layout.size() >= min_size); + assert!(layout.align() >= 1); + assert!(layout.align().is_power_of_two()); + + // Create ThinStr + let obj = ThinStr::try_allocate_for(&test_string, alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(&test_string, uninit).unwrap(); + + // Test borrowing properties + let borrowed_str: &str = thin_str.borrow(); + assert_eq!(borrowed_str, test_string); + + let borrowed_inline: &InlineString = thin_str.borrow(); + assert_eq!(borrowed_inline.deref(), test_string); + + // Test deref consistency + assert_eq!(thin_str.deref(), test_string); + assert_eq!(thin_str.deref().len(), test_string.len()); + + // Test hash consistency property + let mut hasher1 = DefaultHasher::new(); + thin_str.hash(&mut hasher1); + let hash1 = hasher1.finish(); + + let mut hasher2 = DefaultHasher::new(); + test_string.hash(&mut hasher2); + let hash2 = hasher2.finish(); + + assert_eq!(hash1, hash2); + + // Test equality property - create another ThinStr with same content + let obj2 = ThinStr::try_allocate_for(&test_string, alloc).unwrap(); + let uninit2 = unsafe { &mut *obj2.as_ptr() }; + let thin_str2 = ThinStr::try_from_str_in(&test_string, uninit2).unwrap(); + + // Should be equal even though they're different allocations + assert_eq!(thin_str, thin_str2); + + // Cleanup + unsafe { + alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, 
thin_str.layout()); + alloc.deallocate(thin_str2.inner.thin_ptr.size_ptr, thin_str2.layout()); + } + } + } + + #[test] + fn test_large_string() { + let alloc = &Global; + + // Test a reasonably large string + let large_string = "x".repeat(10000); + let obj = ThinStr::try_allocate_for(&large_string, alloc).unwrap(); + let uninit = unsafe { &mut *obj.as_ptr() }; + let thin_str = ThinStr::try_from_str_in(&large_string, uninit).unwrap(); + + assert_eq!(thin_str.deref(), large_string); + assert_eq!(thin_str.deref().len(), 10000); + + unsafe { alloc.deallocate(thin_str.inner.thin_ptr.size_ptr, thin_str.layout()) }; + } +} diff --git a/libdd-profiling/src/profiles/mod.rs b/libdd-profiling/src/profiles/mod.rs index 95ba786481..24780566b1 100644 --- a/libdd-profiling/src/profiles/mod.rs +++ b/libdd-profiling/src/profiles/mod.rs @@ -1,6 +1,7 @@ // Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +pub mod collections; mod compressor; pub use compressor::*;