From 5ac98798f22f80c0bc90a9139e2b46fe3d1d098f Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 00:41:20 +0300 Subject: [PATCH 1/8] perf: add optimized function to create offset with same length --- arrow-array/src/array/list_array.rs | 2 +- arrow-buffer/src/buffer/offset.rs | 60 +++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index ec7983c303f0..55a3a4c0d678 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -466,7 +466,7 @@ impl From for GenericListArray< _ => unreachable!(), }; - let offsets = OffsetBuffer::from_lengths(std::iter::repeat_n(size, value.len())); + let offsets = OffsetBuffer::from_repeated_length(size, value.len()); Self { data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index fe3a57a38248..de10e8993b8a 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -112,6 +112,9 @@ impl OffsetBuffer { /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]); /// ``` /// + /// If you want to create an [`OffsetBuffer`] where each slice has the same length, + /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead. + /// /// # Panics /// /// Panics on overflow @@ -133,6 +136,39 @@ impl OffsetBuffer { Self(out.into()) } + /// Create a new [`OffsetBuffer`] where each slice has the same length + /// `length`, repeated `n` times. + /// + /// + /// Example + /// ``` + /// # use arrow_buffer::OffsetBuffer; + /// let offsets = OffsetBuffer::::from_repeated_length(4, 3); + /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]); + /// ``` + /// + /// # Panics + /// + /// Panics on overflow + pub fn from_repeated_length(length: usize, n: usize) -> Self { + if n == 0 { + return Self::new_empty(); + } + + if length == 0 { + return Self::new_zeroed(n); + } + + // Check for overflow + O::from_usize(length * n).expect("offset overflow"); + + let offsets = (0..=n) + .map(|index| O::usize_as(index * length)) + .collect::>(); + + Self(ScalarBuffer::from(offsets)) + } + /// Get an Iterator over the lengths of this [`OffsetBuffer`] /// /// ``` @@ -283,6 +319,30 @@ mod tests { OffsetBuffer::::from_lengths([usize::MAX, 1]); } + #[test] + #[should_panic(expected = "offset overflow")] + fn from_repeated_lengths_offset_length_overflow() { + OffsetBuffer::::from_repeated_length(i32::MAX as usize, 1); + } + + #[test] + #[should_panic(expected = "offset overflow")] + fn from_repeated_lengths_offset_repeat_overflow() { + OffsetBuffer::::from_repeated_length(1, i32::MAX as usize); + } + + #[test] + #[should_panic(expected = "usize overflow")] + fn from_repeated_lengths_usize_length_overflow() { + OffsetBuffer::::from_repeated_length(usize::MAX, 1); + } + + #[test] + #[should_panic(expected = "usize overflow")] + fn from_repeated_lengths_usize_repeat_overflow() { + OffsetBuffer::::from_repeated_length(1, usize::MAX); + } + #[test] fn get_lengths() { let offsets = OffsetBuffer::::new(ScalarBuffer::::from(vec![0, 1, 4, 9])); From e172f2e8d8939fe36b3b7739a2608f5a4d07a254 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 00:58:54 +0300 Subject: [PATCH 2/8] add tests --- arrow-buffer/src/buffer/offset.rs | 88 +++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index de10e8993b8a..307f027f681c 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -159,6 +159,10 @@ impl OffsetBuffer { return Self::new_zeroed(n); } + // Check for overflow + // Making sure we don't overflow usize or O when calculating the total length + length.checked_mul(n).expect("usize overflow"); + // Check for overflow O::from_usize(length * n).expect("offset overflow"); @@ -322,23 +326,29 @@ mod tests { #[test] #[should_panic(expected = "offset overflow")] fn from_repeated_lengths_offset_length_overflow() { - OffsetBuffer::::from_repeated_length(i32::MAX as usize, 1); + OffsetBuffer::::from_repeated_length(i32::MAX as usize / 4, 5); } #[test] #[should_panic(expected = "offset overflow")] fn from_repeated_lengths_offset_repeat_overflow() { - OffsetBuffer::::from_repeated_length(1, i32::MAX as usize); + OffsetBuffer::::from_repeated_length(1, i32::MAX as usize + 1); } #[test] - #[should_panic(expected = "usize overflow")] + #[should_panic(expected = "offset overflow")] fn from_repeated_lengths_usize_length_overflow() { OffsetBuffer::::from_repeated_length(usize::MAX, 1); } #[test] #[should_panic(expected = "usize overflow")] + fn from_repeated_lengths_usize_length_usize_overflow() { + OffsetBuffer::::from_repeated_length(usize::MAX, 2); + } + + #[test] + #[should_panic(expected = "offset overflow")] fn from_repeated_lengths_usize_repeat_overflow() { OffsetBuffer::::from_repeated_length(1, usize::MAX); } @@ -383,4 +393,76 @@ mod tests { let default = OffsetBuffer::::default(); assert_eq!(default.as_ref(), &[0]); } + + #[test] + fn from_repeated_length_basic() { + // Basic case with length 4, repeated 3 times + let buffer = OffsetBuffer::::from_repeated_length(4, 3); + assert_eq!(buffer.as_ref(), &[0, 4, 8, 12]); + + // Verify the lengths are correct + let lengths: Vec = buffer.lengths().collect(); + assert_eq!(lengths, vec![4, 4, 4]); + } + + #[test] + fn from_repeated_length_single_repeat() { + // Length 5, repeated once + let buffer = OffsetBuffer::::from_repeated_length(5, 1); + assert_eq!(buffer.as_ref(), &[0, 5]); + + let lengths: Vec = buffer.lengths().collect(); + assert_eq!(lengths, vec![5]); + } + + #[test] + fn from_repeated_length_zero_repeats() { + let buffer = OffsetBuffer::::from_repeated_length(10, 0); + assert_eq!(buffer, OffsetBuffer::::new_empty()); + } + + #[test] + fn from_repeated_length_zero_length() { + // Zero length, repeated 5 times (all zeros) + let buffer = OffsetBuffer::::from_repeated_length(0, 5); + assert_eq!(buffer.as_ref(), &[0, 0, 0, 0, 0, 0]); + + // All lengths should be 0 + let lengths: Vec = buffer.lengths().collect(); + assert_eq!(lengths, vec![0, 0, 0, 0, 0]); + } + + #[test] + fn from_repeated_length_large_values() { + // Test with larger values that don't overflow + let buffer = OffsetBuffer::::from_repeated_length(1000, 100); + assert_eq!(buffer[0], 0); + + // Verify all lengths are 1000 + let lengths: Vec = buffer.lengths().collect(); + assert_eq!(lengths.len(), 100); + assert!(lengths.iter().all(|&len| len == 1000)); + } + + #[test] + fn from_repeated_length_unit_length() { + // Length 1, repeated multiple times + let buffer = OffsetBuffer::::from_repeated_length(1, 10); + assert_eq!(buffer.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + + let lengths: Vec = buffer.lengths().collect(); + assert_eq!(lengths, vec![1; 10]); + } + + #[test] + fn from_repeated_length_max_safe_values() { + // Test with maximum safe values for i32 + // i32::MAX / 3 ensures we don't overflow when repeated twice + let third_max = (i32::MAX / 3) as usize; + let buffer = OffsetBuffer::::from_repeated_length(third_max, 2); + assert_eq!( + buffer.as_ref(), + &[0, third_max as i32, (third_max * 2) as i32] + ); + } } From 3317a39583b0b738ef882e5b4395f44d6e541880 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 01:00:11 +0300 Subject: [PATCH 3/8] updated comment --- arrow-buffer/src/buffer/offset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 307f027f681c..66fa7dd22ec5 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -112,7 +112,7 @@ impl OffsetBuffer { /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]); /// ``` /// - /// If you want to create an [`OffsetBuffer`] where each slice has the same length, + /// If you want to create an [`OffsetBuffer`] where all lengths are the same, /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead. /// /// # Panics From 5f01d05c1cacfc5fb80edb14ae85b0182a28e3a3 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 09:49:52 +0300 Subject: [PATCH 4/8] perf: add `repeat_slice_n_times` to `MutableBuffer` this will be used in: - https://github.com/apache/arrow-rs/pull/8653 --- arrow-buffer/src/buffer/mutable.rs | 219 +++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 93d9d6b9ad84..5239b7e46736 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -222,6 +222,104 @@ impl MutableBuffer { } } + /// Adding to this mutable buffer `slice_to_repeat` repeated `repeat_count` times. + /// + /// # Example + /// + /// ## Repeat the same string bytes multiple times + /// ``` + /// # use arrow_buffer::buffer::MutableBuffer; + /// let mut buffer = MutableBuffer::new(0); + /// let bytes_to_repeat = b"ab"; + /// buffer.repeat_slice_n_times(bytes_to_repeat, 3); + /// assert_eq!(buffer.as_slice(), b"ababab"); + /// ``` + pub fn repeat_slice_n_times( + &mut self, + slice_to_repeat: &[T], + repeat_count: usize, + ) { + if repeat_count == 0 || slice_to_repeat.is_empty() { + return; + } + + let bytes_to_repeat = size_of_val(slice_to_repeat); + + // Ensure capacity + self.reserve(repeat_count * bytes_to_repeat); + + // For smaller number of repeats, just extend directly as the overhead of the + // doubling strategy is not worth it + if repeat_count <= 3 { + for _ in 0..repeat_count { + self.extend_from_slice(slice_to_repeat); + } + + return; + } + + // We will use doubling strategy to fill the buffer in log(repeat_count) steps + + // If we keep extending from ourself we will reach it pretty fast + let final_len_to_repeat = repeat_count * bytes_to_repeat; + + // Save the length before we do all the copies to know where to start from + let length_before = self.len; + + // Copy the initial slice once + self.extend_from_slice(slice_to_repeat); + + // This tracks how much bytes we have added by repeating so far + let mut added_repeats_length = bytes_to_repeat; + assert_eq!( + self.len - length_before, + added_repeats_length, + "should copy exactly the same number of bytes" + ); + + // Copy in doubling steps to make the number of calls logarithmic and fast (as we copy large chunks at a time) + while added_repeats_length * 2 <= final_len_to_repeat { + unsafe { + // Get to the start of the data before we started copying anything + let src = self.data.as_ptr().add(length_before) as *const u8; + + // Go to the current location to copy to (end of current data) + let dst = self.data.as_ptr().add(self.len); + + // SAFETY: the pointers are not overlapping as there is `added_repeats_length` exactly + // between them + std::ptr::copy_nonoverlapping(src, dst, added_repeats_length) + } + + // Advance the length by the amount of data we just copied (doubled) + self.len += added_repeats_length; + + // Double the amount of data we have added so far + added_repeats_length *= 2; + } + + // Handle the remainder in single copy. + + // the amount left to copy is guaranteed to be less than what we have already copied + let last_amount_to_copy = final_len_to_repeat - added_repeats_length; + assert!( + last_amount_to_copy <= final_len_to_repeat, + "the last copy should not overlap" + ); + + if last_amount_to_copy == 0 { + return; + } + + unsafe { + let src = self.data.as_ptr().add(length_before) as *const u8; + let dst = self.data.as_ptr().add(self.len); + + std::ptr::copy_nonoverlapping(src, dst, last_amount_to_copy) + } + self.len += last_amount_to_copy; + } + #[cold] fn reallocate(&mut self, capacity: usize) { let new_layout = Layout::from_size_align(capacity, self.layout.align()).unwrap(); @@ -1184,4 +1282,125 @@ mod tests { assert_eq!(pool.used(), 0); } } + + fn create_expected_repeated_slice( + slice_to_repeat: &[T], + repeat_count: usize, + ) -> Buffer { + let mut expected = MutableBuffer::new(size_of_val(slice_to_repeat) * repeat_count); + for _ in 0..repeat_count { + // Not using push_slice_repeated as this is the function under test + expected.extend_from_slice(slice_to_repeat); + } + expected.into() + } + + // Helper to test a specific repeat count with various slice sizes + fn test_repeat_count( + repeat_count: usize, + test_data: &[T], + ) { + let mut buffer = MutableBuffer::new(0); + buffer.repeat_slice_n_times(test_data, repeat_count); + + let expected = create_expected_repeated_slice(test_data, repeat_count); + let result: Buffer = buffer.into(); + + assert_eq!( + result, + expected, + "Failed for repeat_count={}, slice_len={}", + repeat_count, + test_data.len() + ); + } + + #[test] + fn test_repeat_slice_count_edge_cases() { + // Empty slice + test_repeat_count(100, &[] as &[i32]); + + // Zero repeats + test_repeat_count(0, &[1i32, 2, 3]); + } + + #[test] + fn test_small_repeats_counts() { + // test any special implementation for small repeat counts + let data = &[1u8, 2, 3, 4, 5]; + + for _ in 1..=10 { + test_repeat_count(2, data); + } + } + + #[test] + fn test_different_size_of_i32_repeat_slice() { + let data: &[i32] = &[1, 2, 3]; + let data_with_single_item: &[i32] = &[42]; + + for data in &[data, data_with_single_item] { + for item in 1..=9 { + let base_repeat_count = 2_usize.pow(item); + test_repeat_count(base_repeat_count - 1, data); + test_repeat_count(base_repeat_count, data); + test_repeat_count(base_repeat_count + 1, data); + } + } + } + + #[test] + fn test_different_size_of_u8_repeat_slice() { + let data: &[u8] = &[1, 2, 3]; + let data_with_single_item: &[u8] = &[10]; + + for data in &[data, data_with_single_item] { + for item in 1..=9 { + let base_repeat_count = 2_usize.pow(item); + test_repeat_count(base_repeat_count - 1, data); + test_repeat_count(base_repeat_count, data); + test_repeat_count(base_repeat_count + 1, data); + } + } + } + + #[test] + fn test_different_size_of_u16_repeat_slice() { + let data: &[u16] = &[1, 2, 3]; + let data_with_single_item: &[u16] = &[10]; + + for data in &[data, data_with_single_item] { + for item in 1..=9 { + let base_repeat_count = 2_usize.pow(item); + test_repeat_count(base_repeat_count - 1, data); + test_repeat_count(base_repeat_count, data); + test_repeat_count(base_repeat_count + 1, data); + } + } + } + + #[test] + fn test_various_slice_lengths() { + // Test different slice lengths with same repeat pattern + let repeat_count = 37; // Arbitrary non-power-of-2 + + // Single element + test_repeat_count(repeat_count, &[42i32]); + + // Small slices + test_repeat_count(repeat_count, &[1i32, 2]); + test_repeat_count(repeat_count, &[1i32, 2, 3]); + test_repeat_count(repeat_count, &[1i32, 2, 3, 4]); + test_repeat_count(repeat_count, &[1i32, 2, 3, 4, 5]); + + // Larger slices + let data_10: Vec = (0..10).collect(); + test_repeat_count(repeat_count, &data_10); + + let data_100: Vec = (0..100).collect(); + test_repeat_count(repeat_count, &data_100); + + let data_1000: Vec = (0..1000).collect(); + test_repeat_count(repeat_count, &data_1000); + } } From 72070dc54216b37c789d17f0464c8e655dc31194 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:59:42 +0300 Subject: [PATCH 5/8] simplify implementation --- arrow-buffer/src/buffer/mutable.rs | 51 ++++++++++-------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 5239b7e46736..986e221db7c6 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -258,11 +258,6 @@ impl MutableBuffer { return; } - // We will use doubling strategy to fill the buffer in log(repeat_count) steps - - // If we keep extending from ourself we will reach it pretty fast - let final_len_to_repeat = repeat_count * bytes_to_repeat; - // Save the length before we do all the copies to know where to start from let length_before = self.len; @@ -270,15 +265,24 @@ impl MutableBuffer { self.extend_from_slice(slice_to_repeat); // This tracks how much bytes we have added by repeating so far - let mut added_repeats_length = bytes_to_repeat; + let added_repeats_length = bytes_to_repeat; assert_eq!( self.len - length_before, added_repeats_length, "should copy exactly the same number of bytes" ); - // Copy in doubling steps to make the number of calls logarithmic and fast (as we copy large chunks at a time) - while added_repeats_length * 2 <= final_len_to_repeat { + // Number of times the slice was repeated + let mut already_repeated_times = 1; + + // We will use doubling strategy to fill the buffer in log(repeat_count) steps + while already_repeated_times < repeat_count { + // How many slices can we copy in this iteration + // (either double what we have, or just the remaining ones) + let number_of_slices_to_copy = + already_repeated_times.min(repeat_count - already_repeated_times); + let number_of_bytes_to_copy = number_of_slices_to_copy * bytes_to_repeat; + unsafe { // Get to the start of the data before we started copying anything let src = self.data.as_ptr().add(length_before) as *const u8; @@ -286,38 +290,15 @@ impl MutableBuffer { // Go to the current location to copy to (end of current data) let dst = self.data.as_ptr().add(self.len); - // SAFETY: the pointers are not overlapping as there is `added_repeats_length` exactly - // between them - std::ptr::copy_nonoverlapping(src, dst, added_repeats_length) + // SAFETY: the pointers are not overlapping as there is `number_of_bytes_to_copy` or less between them + std::ptr::copy_nonoverlapping(src, dst, number_of_bytes_to_copy) } // Advance the length by the amount of data we just copied (doubled) - self.len += added_repeats_length; - - // Double the amount of data we have added so far - added_repeats_length *= 2; - } - - // Handle the remainder in single copy. - - // the amount left to copy is guaranteed to be less than what we have already copied - let last_amount_to_copy = final_len_to_repeat - added_repeats_length; - assert!( - last_amount_to_copy <= final_len_to_repeat, - "the last copy should not overlap" - ); - - if last_amount_to_copy == 0 { - return; - } - - unsafe { - let src = self.data.as_ptr().add(length_before) as *const u8; - let dst = self.data.as_ptr().add(self.len); + self.len += number_of_bytes_to_copy; - std::ptr::copy_nonoverlapping(src, dst, last_amount_to_copy) + already_repeated_times += number_of_slices_to_copy; } - self.len += last_amount_to_copy; } #[cold] From 9948aa37c57fc777be54a56bf913a53770f950f2 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:39:29 +0300 Subject: [PATCH 6/8] feat: add `new_repeated` and `try_new_repeated` to `ByteArray` this will be used to improve performance of converting scalar to array --- arrow-array/src/array/byte_array.rs | 46 ++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index 63c439ef96d9..c70d1c96f3e5 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -190,6 +190,36 @@ impl GenericByteArray { Scalar::new(Self::from_iter_values(std::iter::once(value))) } + /// Create a new [`GenericByteArray`] where `value` is repeated `repeat_count` times. + /// + /// # Panics + /// This will panic if value's length multiplied by `repeat_count` overflows usize. + /// + pub fn new_repeated(value: impl AsRef, repeat_count: usize) -> Self { + Self::try_new_repeated(value, repeat_count).unwrap() + } + + /// Try to create [`GenericByteArray`] where `value` is repeated `repeat_count` times. + /// + /// Return an error if value's length multiplied by `repeat_count` overflows usize. + pub fn try_new_repeated(value: impl AsRef, repeat_count: usize) -> Result { + let s: &[u8] = value.as_ref().as_ref(); + let value_offsets = OffsetBuffer::try_from_repeated_length(s.len(), repeat_count)?; + let bytes: Buffer = { + let mut mutable_buffer = MutableBuffer::with_capacity(0); + mutable_buffer.repeat_slice_n_times(s, repeat_count); + + mutable_buffer.into() + }; + + Ok(Self { + data_type: T::DATA_TYPE, + value_data: bytes, + value_offsets, + nulls: None, + }) + } + /// Creates a [`GenericByteArray`] based on an iterator of values without nulls pub fn from_iter_values(iter: I) -> Self where @@ -593,7 +623,7 @@ where #[cfg(test)] mod tests { - use crate::{BinaryArray, StringArray}; + use crate::{Array, BinaryArray, StringArray}; use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer}; #[test] @@ -651,4 +681,18 @@ mod tests { BinaryArray::new(offsets, non_ascii_data, None); } + + #[test] + fn create_repeated() { + let arr = BinaryArray::new_repeated(b"hello", 3); + assert_eq!(arr.len(), 3); + assert_eq!(arr.value(0).as_ref(), b"hello"); + assert_eq!(arr.value(1).as_ref(), b"hello"); + assert_eq!(arr.value(2).as_ref(), b"hello"); + + let arr = StringArray::new_repeated("world", 2); + assert_eq!(arr.len(), 2); + assert_eq!(arr.value(0), "world"); + assert_eq!(arr.value(1), "world"); + } } From 78b223fb158dcb60366998a08b17b70c4e26e841 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:44:05 +0300 Subject: [PATCH 7/8] fix --- arrow-array/src/array/byte_array.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index c70d1c96f3e5..fb75e70a690c 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -196,15 +196,8 @@ impl GenericByteArray { /// This will panic if value's length multiplied by `repeat_count` overflows usize. /// pub fn new_repeated(value: impl AsRef, repeat_count: usize) -> Self { - Self::try_new_repeated(value, repeat_count).unwrap() - } - - /// Try to create [`GenericByteArray`] where `value` is repeated `repeat_count` times. - /// - /// Return an error if value's length multiplied by `repeat_count` overflows usize. - pub fn try_new_repeated(value: impl AsRef, repeat_count: usize) -> Result { let s: &[u8] = value.as_ref().as_ref(); - let value_offsets = OffsetBuffer::try_from_repeated_length(s.len(), repeat_count)?; + let value_offsets = OffsetBuffer::from_repeated_length(s.len(), repeat_count); let bytes: Buffer = { let mut mutable_buffer = MutableBuffer::with_capacity(0); mutable_buffer.repeat_slice_n_times(s, repeat_count); @@ -212,12 +205,12 @@ impl GenericByteArray { mutable_buffer.into() }; - Ok(Self { + Self { data_type: T::DATA_TYPE, value_data: bytes, value_offsets, nulls: None, - }) + } } /// Creates a [`GenericByteArray`] based on an iterator of values without nulls @@ -686,9 +679,9 @@ mod tests { fn create_repeated() { let arr = BinaryArray::new_repeated(b"hello", 3); assert_eq!(arr.len(), 3); - assert_eq!(arr.value(0).as_ref(), b"hello"); - assert_eq!(arr.value(1).as_ref(), b"hello"); - assert_eq!(arr.value(2).as_ref(), b"hello"); + assert_eq!(arr.value(0), b"hello"); + assert_eq!(arr.value(1), b"hello"); + assert_eq!(arr.value(2), b"hello"); let arr = StringArray::new_repeated("world", 2); assert_eq!(arr.len(), 2); From 670d3019e06228c2897fb9ee2bb905f74de0d08c Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 3 Nov 2025 18:52:28 +0200 Subject: [PATCH 8/8] add failing tests --- arrow-array/src/array/byte_array.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index fb75e70a690c..fbd8458846fc 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -688,4 +688,28 @@ mod tests { assert_eq!(arr.value(0), "world"); assert_eq!(arr.value(1), "world"); } + + #[test] + #[should_panic(expected = "usize overflow")] + fn create_repeated_usize_overflow_1() { + let _arr = BinaryArray::new_repeated(b"hello", (usize::MAX / "hello".len()) + 1); + } + + #[test] + #[should_panic(expected = "usize overflow")] + fn create_repeated_usize_overflow_2() { + let _arr = BinaryArray::new_repeated(b"hello", usize::MAX); + } + + #[test] + #[should_panic(expected = "offset overflow")] + fn create_repeated_i32_offset_overflow_1() { + let _arr = BinaryArray::new_repeated(b"hello", usize::MAX / "hello".len()); + } + + #[test] + #[should_panic(expected = "offset overflow")] + fn create_repeated_i32_offset_overflow_2() { + let _arr = BinaryArray::new_repeated(b"hello", ((i32::MAX as usize) / "hello".len()) + 1); + } }