From 77cdf3291d08ce5db729b0f34f0d0815449f064f Mon Sep 17 00:00:00 2001 From: statxc Date: Fri, 6 Mar 2026 21:01:22 +0000 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20use=20HashMap=20for=20create=5Fshar?= =?UTF-8?q?ed=5Fstring=20to=20fix=20O(N=C2=B2)=20performance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rust/flatbuffers/src/builder.rs | 34 +++++++++ .../rust_usage_test/tests/integration_test.rs | 73 +++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/rust/flatbuffers/src/builder.rs b/rust/flatbuffers/src/builder.rs index 84bb622580..bcf859404e 100644 --- a/rust/flatbuffers/src/builder.rs +++ b/rust/flatbuffers/src/builder.rs @@ -24,6 +24,9 @@ use core::marker::PhantomData; use core::ops::{Add, AddAssign, Deref, DerefMut, Index, IndexMut, Sub, SubAssign}; use core::ptr::write_bytes; +#[cfg(feature = "std")] +use std::collections::HashMap; + use crate::endian_scalar::emplace_scalar; use crate::primitives::*; use crate::push::{Push, PushAlignment}; @@ -139,6 +142,9 @@ pub struct FlatBufferBuilder<'fbb, A: Allocator = DefaultAllocator> { min_align: usize, force_defaults: bool, + #[cfg(feature = "std")] + strings_pool: HashMap<String, WIPOffset<&'fbb str>>, + #[cfg(not(feature = "std"))] strings_pool: Vec<WIPOffset<&'fbb str>>, _phantom: PhantomData<&'fbb ()>, @@ -197,6 +203,9 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> { min_align: 0, force_defaults: false, + #[cfg(feature = "std")] + strings_pool: HashMap::new(), + #[cfg(not(feature = "std"))] strings_pool: Vec::new(), _phantom: PhantomData, @@ -343,6 +352,31 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> { WIPOffset::new(o.value()) } + /// Create a utf8 string, and de-duplicate if already created. + /// + /// Uses a HashMap to track previously written strings, providing O(1) + /// amortized lookup and insertion. 
+ #[cfg(feature = "std")] + #[inline] + pub fn create_shared_string<'a: 'b, 'b>(&'a mut self, s: &'b str) -> WIPOffset<&'fbb str> { + self.assert_not_nested( + "create_shared_string can not be called when a table or vector is under construction", + ); + + if let Some(&offset) = self.strings_pool.get(s) { + return offset; + } + + let address = WIPOffset::new(self.create_byte_string(s.as_bytes()).value()); + self.strings_pool.insert(s.to_owned(), address); + address + } + + /// Create a utf8 string, and de-duplicate if already created. + /// + /// Uses a sorted Vec with binary search to track previously written + /// strings when in `no_std` mode. + #[cfg(not(feature = "std"))] #[inline] pub fn create_shared_string<'a: 'b, 'b>(&'a mut self, s: &'b str) -> WIPOffset<&'fbb str> { self.assert_not_nested( diff --git a/tests/rust_usage_test/tests/integration_test.rs b/tests/rust_usage_test/tests/integration_test.rs index d97bf88cff..38928d156e 100644 --- a/tests/rust_usage_test/tests/integration_test.rs +++ b/tests/rust_usage_test/tests/integration_test.rs @@ -3224,4 +3224,77 @@ fn test_shared_strings() { assert_eq!(string_vector.get(1), "foo"); } +#[test] +fn test_shared_strings_pool_deduplication() { + // Verifies that create_shared_string correctly deduplicates across many + // unique strings and that the resulting buffer contains valid data. + let mut builder = flatbuffers::FlatBufferBuilder::with_capacity(1024); + + // Insert multiple unique strings and verify each gets a distinct offset. + let animals = ["cat", "dog", "bird", "fish", "snake"]; + let offsets: Vec<_> = animals + .iter() + .map(|s| builder.create_shared_string(s)) + .collect(); + for i in 0..offsets.len() { + for j in (i + 1)..offsets.len() { + assert_ne!( + offsets[i].value(), + offsets[j].value(), + "unique strings '{}' and '{}' must have different offsets", + animals[i], + animals[j], + ); + } + } + + // Re-insert the same strings and verify they return the original offsets. 
+ for (i, s) in animals.iter().enumerate() { + let offset = builder.create_shared_string(s); + assert_eq!( + offset.value(), + offsets[i].value(), + "duplicate string '{}' must return the same offset", + s, + ); + } + + // Verify that deduplication still works after a reset: the same string + // created twice on the freshly reset builder must share a single offset. + builder.reset(); + let a = builder.create_shared_string("cat"); + let b = builder.create_shared_string("cat"); + assert_eq!(a.value(), b.value(), "same string after reset must still deduplicate"); + + // Verify that shared strings produce a valid, readable buffer. + builder.reset(); + let shared_name = builder.create_shared_string("goblin"); + let shared_name_dup = builder.create_shared_string("goblin"); + assert_eq!(shared_name.value(), shared_name_dup.value()); + + let enemy = my_game::example::Monster::create( + &mut builder, + &my_game::example::MonsterArgs { + name: Some(shared_name), + ..Default::default() + }, + ); + let main_name = builder.create_shared_string("goblin"); + assert_eq!(main_name.value(), shared_name.value()); + + let monster = my_game::example::Monster::create( + &mut builder, + &my_game::example::MonsterArgs { + name: Some(main_name), + enemy: Some(enemy), + ..Default::default() + }, + ); + builder.finish(monster, None); + + let m = my_game::example::root_as_monster(builder.finished_data()).unwrap(); + assert_eq!(m.name(), "goblin"); + assert_eq!(m.enemy().unwrap().name(), "goblin"); +} + } From b0f786707a3934a94f0c65eb7ccc2935d0104e74 Mon Sep 17 00:00:00 2001 From: statxc Date: Sat, 7 Mar 2026 04:39:48 +0000 Subject: [PATCH 2/2] refactor: clean up no_std binary_search_by with direct slice comparison --- rust/flatbuffers/src/builder.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/rust/flatbuffers/src/builder.rs b/rust/flatbuffers/src/builder.rs index bcf859404e..e6b9cfb41c 100644 --- a/rust/flatbuffers/src/builder.rs +++ 
b/rust/flatbuffers/src/builder.rs @@ -389,19 +389,15 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> { let found = self.strings_pool.binary_search_by(|offset| { let ptr = offset.value() as usize; - // Gets The pointer to the size of the string let str_memory = &buf[buf.len() - ptr..]; - // Gets the size of the written string from buffer - let size = - u32::from_le_bytes([str_memory[0], str_memory[1], str_memory[2], str_memory[3]]) - as usize; - // Size of the string size - let string_size: usize = 4; - // Fetches actual string bytes from index of string after string size - // to the size of string plus string size - let iter = str_memory[string_size..size + string_size].iter(); - // Compares bytes of fetched string and current writable string - iter.cloned().cmp(s.bytes()) + let size = u32::from_le_bytes([ + str_memory[0], + str_memory[1], + str_memory[2], + str_memory[3], + ]) as usize; + let stored = &str_memory[4..4 + size]; + stored.cmp(s.as_bytes()) }); match found {