From 67a1614d9e552d82e1169dff64d2a12adef8b58d Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 12 Jan 2026 16:44:02 +0800 Subject: [PATCH] fix: trait Array has been sealed in arrow new version --- python/src/arrow.rs | 3 +- rust/lance-arrow/src/bfloat16.rs | 129 ++++++++++------------ rust/lance-arrow/src/floats.rs | 60 ++++++++-- rust/lance-arrow/src/json.rs | 84 ++------------ rust/lance-index/src/vector/bq/builder.rs | 2 +- rust/lance-linalg/benches/dot.rs | 2 +- rust/lance-testing/src/datagen.rs | 6 +- 7 files changed, 126 insertions(+), 160 deletions(-) diff --git a/python/src/arrow.rs b/python/src/arrow.rs index d676d7ca436..f5a51de2aaf 100644 --- a/python/src/arrow.rs +++ b/python/src/arrow.rs @@ -76,7 +76,8 @@ pub fn bfloat16_array<'py>( values: Vec>, py: Python<'py>, ) -> PyResult> { - let array = BFloat16Array::from_iter(values.into_iter().map(|v| v.map(bf16::from_f32))); + let array = + BFloat16Array::from_iter(values.into_iter().map(|v| v.map(bf16::from_f32))).into_inner(); // Create a record batch with a single column and an annotated schema let field = Field::new("bfloat16", DataType::FixedSizeBinary(2), true).with_metadata( diff --git a/rust/lance-arrow/src/bfloat16.rs b/rust/lance-arrow/src/bfloat16.rs index ad2ee86d168..0364afe300f 100644 --- a/rust/lance-arrow/src/bfloat16.rs +++ b/rust/lance-arrow/src/bfloat16.rs @@ -6,10 +6,7 @@ use std::fmt::Formatter; use std::slice; -use arrow_array::{ - builder::BooleanBufferBuilder, iterator::ArrayIter, Array, ArrayAccessor, ArrayRef, - FixedSizeBinaryArray, -}; +use arrow_array::{builder::BooleanBufferBuilder, Array, FixedSizeBinaryArray}; use arrow_buffer::MutableBuffer; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field as ArrowField}; @@ -41,9 +38,7 @@ pub struct BFloat16Type {} /// An array of bfloat16 values /// -/// This implements the [`Array`] trait for bfloat16 values. Note that -/// bfloat16 is not the same thing as fp16 which is supported natively -/// by arrow-rs. +/// Note that bfloat16 is not the same thing as fp16 which is supported natively by arrow-rs. #[derive(Clone)] pub struct BFloat16Array { inner: FixedSizeBinaryArray, @@ -72,8 +67,27 @@ impl BFloat16Array { values.into() } + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn is_null(&self, i: usize) -> bool { + self.inner.is_null(i) + } + + pub fn null_count(&self) -> usize { + self.inner.null_count() + } + pub fn iter(&self) -> BFloat16Iter<'_> { - BFloat16Iter::new(self) + BFloat16Iter { + array: self, + index: 0, + } } pub fn value(&self, i: usize) -> bf16 { @@ -100,65 +114,6 @@ impl BFloat16Array { } } -impl ArrayAccessor for &BFloat16Array { - type Item = bf16; - - fn value(&self, index: usize) -> Self::Item { - BFloat16Array::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - BFloat16Array::value_unchecked(self, index) - } -} - -impl Array for BFloat16Array { - fn as_any(&self) -> &dyn std::any::Any { - self.inner.as_any() - } - - fn to_data(&self) -> arrow_data::ArrayData { - self.inner.to_data() - } - - fn into_data(self) -> arrow_data::ArrayData { - self.inner.into_data() - } - - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let inner_array: &dyn Array = &self.inner; - inner_array.slice(offset, length) - } - - fn nulls(&self) -> Option<&arrow_buffer::NullBuffer> { - self.inner.nulls() - } - - fn data_type(&self) -> &DataType { - self.inner.data_type() - } - - fn len(&self) -> usize { - self.inner.len() - } - - fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - fn offset(&self) -> usize { - self.inner.offset() - } - - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() - } - - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() - } -} - impl FromIterator> for BFloat16Array { fn from_iter>>(iter: I) -> Self { let mut buffer = MutableBuffer::new(10); @@ -242,7 +197,27 @@ impl PartialEq for BFloat16Array { } } -type BFloat16Iter<'a> = ArrayIter<&'a BFloat16Array>; +pub struct BFloat16Iter<'a> { + array: &'a BFloat16Array, + index: usize, +} + +impl<'a> Iterator for BFloat16Iter<'a> { + type Item = Option; + + fn next(&mut self) -> Option { + if self.index >= self.array.len() { + return None; + } + let i = self.index; + self.index += 1; + if self.array.is_null(i) { + Some(None) + } else { + Some(Some(self.array.value(i))) + } + } +} /// Methods that are lifted from arrow-rs temporarily until they are made public. mod from_arrow { @@ -290,17 +265,26 @@ mod from_arrow { } } -impl FloatArray for BFloat16Array { +impl FloatArray for FixedSizeBinaryArray { type FloatType = BFloat16Type; fn as_slice(&self) -> &[bf16] { + assert_eq!( + self.value_length(), + 2, + "BFloat16 arrays must use FixedSizeBinary(2) storage" + ); unsafe { slice::from_raw_parts( - self.inner.value_data().as_ptr() as *const bf16, - self.inner.value_data().len() / 2, + self.value_data().as_ptr() as *const bf16, + self.value_data().len() / 2, ) } } + + fn from_values(values: Vec) -> Self { + BFloat16Array::from(values).into_inner() + } } #[cfg(test)] @@ -327,6 +311,9 @@ mod tests { for (expected, value) in values.as_slice().iter().zip(array2.iter()) { assert_eq!(Some(*expected), value); } + + let arrow_array = array.into_inner(); + assert_eq!(arrow_array.as_slice(), values.as_slice()); } #[test] diff --git a/rust/lance-arrow/src/floats.rs b/rust/lance-arrow/src/floats.rs index a530612a875..fcacbdd7eb5 100644 --- a/rust/lance-arrow/src/floats.rs +++ b/rust/lance-arrow/src/floats.rs @@ -13,7 +13,7 @@ use std::{ use arrow_array::{ types::{Float16Type, Float32Type, Float64Type}, - Array, Float16Array, Float32Array, Float64Array, + Array, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, }; use arrow_schema::{DataType, Field}; use half::{bf16, f16}; @@ -95,7 +95,7 @@ pub trait ArrowFloatType: Debug { /// Returns empty array of this type. fn empty_array() -> Self::ArrayType { - Vec::::new().into() + >::from_values(Vec::new()) } } @@ -143,7 +143,7 @@ impl ArrowFloatType for BFloat16Type { const MIN: Self::Native = bf16::MIN; const MAX: Self::Native = bf16::MAX; - type ArrayType = BFloat16Array; + type ArrayType = FixedSizeBinaryArray; } impl ArrowFloatType for Float16Type { @@ -180,13 +180,22 @@ impl ArrowFloatType for Float64Type { /// /// This is similar to [`arrow_array::PrimitiveArray`] but applies to all float types (including bfloat16) /// and is implemented as a trait and not a struct -pub trait FloatArray: - Array + Clone + From> + 'static -{ +pub trait FloatArray: Array + Clone + 'static { type FloatType: ArrowFloatType; /// Returns a reference to the underlying data as a slice. fn as_slice(&self) -> &[T::Native]; + + /// Construct an array from a vector of values. + fn from_values(values: Vec) -> Self; + + /// Construct an array from an iterator of values. + fn from_iter_values(values: impl IntoIterator) -> Self + where + Self: Sized, + { + Self::from_values(values.into_iter().collect()) + } } impl FloatArray for Float16Array { @@ -195,6 +204,10 @@ impl FloatArray for Float16Array { fn as_slice(&self) -> &[::Native] { self.values() } + + fn from_values(values: Vec<::Native>) -> Self { + Self::from(values) + } } impl FloatArray for Float32Array { @@ -203,6 +216,10 @@ impl FloatArray for Float32Array { fn as_slice(&self) -> &[::Native] { self.values() } + + fn from_values(values: Vec<::Native>) -> Self { + Self::from(values) + } } impl FloatArray for Float64Array { @@ -211,6 +228,10 @@ impl FloatArray for Float64Array { fn as_slice(&self) -> &[::Native] { self.values() } + + fn from_values(values: Vec<::Native>) -> Self { + Self::from(values) + } } /// Convert a float32 array to another float array @@ -219,9 +240,10 @@ impl FloatArray for Float64Array { /// and need to be converted to the appropriate float type for the index. pub fn coerce_float_vector(input: &Float32Array, float_type: FloatType) -> Result> { match float_type { - FloatType::BFloat16 => Ok(Arc::new(BFloat16Array::from_iter_values( - input.values().iter().map(|v| bf16::from_f32(*v)), - ))), + FloatType::BFloat16 => Ok(Arc::new( + BFloat16Array::from_iter_values(input.values().iter().map(|v| bf16::from_f32(*v))) + .into_inner(), + )), FloatType::Float16 => Ok(Arc::new(Float16Array::from_iter_values( input.values().iter().map(|v| f16::from_f32(*v)), ))), @@ -231,3 +253,23 @@ pub fn coerce_float_vector(input: &Float32Array, float_type: FloatType) -> Resul ))), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_coerce_float_vector_bfloat16() { + let input = Float32Array::from(vec![1.0f32, 2.0, 3.0]); + let array = coerce_float_vector(&input, FloatType::BFloat16).unwrap(); + + assert_eq!(array.data_type(), &DataType::FixedSizeBinary(2)); + + let fixed = array + .as_any() + .downcast_ref::() + .unwrap(); + let expected: Vec = input.values().iter().map(|v| bf16::from_f32(*v)).collect(); + assert_eq!(fixed.as_slice(), expected.as_slice()); + } +} diff --git a/rust/lance-arrow/src/json.rs b/rust/lance-arrow/src/json.rs index 0b5e84ae5a8..d11240ff7a4 100644 --- a/rust/lance-arrow/src/json.rs +++ b/rust/lance-arrow/src/json.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use arrow_array::builder::LargeBinaryBuilder; use arrow_array::{Array, ArrayRef, LargeBinaryArray, LargeStringArray, RecordBatch, StringArray}; -use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, Field as ArrowField, Schema}; use crate::ARROW_EXT_NAME_KEY; @@ -140,8 +139,8 @@ impl JsonArray { pub fn to_arrow_json(&self) -> ArrayRef { let mut builder = arrow_array::builder::StringBuilder::new(); - for i in 0..self.len() { - if self.is_null(i) { + for i in 0..self.inner.len() { + if self.inner.is_null(i) { builder.append_null(); } else { let jsonb_bytes = self.inner.value(i); @@ -153,53 +152,17 @@ impl JsonArray { // Return as UTF-8 string array (Arrow represents JSON as strings) Arc::new(builder.finish()) } -} - -impl Array for JsonArray { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn to_data(&self) -> ArrayData { - self.inner.to_data() - } - - fn into_data(self) -> ArrayData { - self.inner.into_data() - } - fn data_type(&self) -> &DataType { - &DataType::LargeBinary - } - - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - Arc::new(Self { - inner: self.inner.slice(offset, length), - }) - } - - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.inner.len() } - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { self.inner.is_empty() } - fn offset(&self) -> usize { - self.inner.offset() - } - - fn nulls(&self) -> Option<&arrow_buffer::NullBuffer> { - self.inner.nulls() - } - - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() - } - - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() + pub fn is_null(&self, i: usize) -> bool { + self.inner.is_null(i) } } @@ -712,39 +675,14 @@ mod tests { let json_array = JsonArray::try_from_iter(vec![Some(r#"{"a": 1}"#), Some(r#"{"b": 2}"#)]).unwrap(); - // Test as_any - let any_ref = json_array.as_any(); - assert!(any_ref.downcast_ref::().is_some()); - - // Test data_type - assert_eq!(json_array.data_type(), &DataType::LargeBinary); - - // Test len (already covered, but included for completeness) + // Wrapper methods assert_eq!(json_array.len(), 2); - - // Test is_empty assert!(!json_array.is_empty()); + assert!(!json_array.is_null(0)); - // Test offset - assert_eq!(json_array.offset(), 0); - - // Test get_buffer_memory_size - assert!(json_array.get_buffer_memory_size() > 0); - - // Test get_array_memory_size - assert!(json_array.get_array_memory_size() > 0); - - // Test to_data - let data = json_array.clone().to_data(); - assert_eq!(data.len(), 2); - - // Test into_data - let data = json_array.clone().into_data(); - assert_eq!(data.len(), 2); - - // Test slice - let sliced = json_array.slice(0, 1); - assert_eq!(sliced.len(), 1); + // Underlying Arrow array + assert_eq!(json_array.inner().data_type(), &DataType::LargeBinary); + assert_eq!(json_array.inner().len(), 2); } #[test] diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index bfb2bfbc3d9..47a40c55801 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -58,7 +58,7 @@ impl RabitQuantizer { let rotate_mat = match T::FLOAT_TYPE { FloatType::Float16 | FloatType::Float32 | FloatType::Float64 => { - let rotate_mat = T::ArrayType::from(rotate_mat); + let rotate_mat = >::from_values(rotate_mat); FixedSizeListArray::try_new_from_values(rotate_mat, code_dim).unwrap() } _ => unimplemented!("RabitQ does not support data type: {:?}", T::FLOAT_TYPE), diff --git a/rust/lance-linalg/benches/dot.rs b/rust/lance-linalg/benches/dot.rs index 47354ac79f9..8175591e6b6 100644 --- a/rust/lance-linalg/benches/dot.rs +++ b/rust/lance-linalg/benches/dot.rs @@ -39,7 +39,7 @@ where let type_name = std::any::type_name::(); c.bench_function(format!("Dot({type_name}, arrow_artiy)").as_str(), |b| { b.iter(|| { - T::ArrayType::from( + >::from_values( target .as_slice() .chunks(DIMENSION) diff --git a/rust/lance-testing/src/datagen.rs b/rust/lance-testing/src/datagen.rs index 4cc1d504594..40204e4a73b 100644 --- a/rust/lance-testing/src/datagen.rs +++ b/rust/lance-testing/src/datagen.rs @@ -209,10 +209,8 @@ where { let mut rng = StdRng::from_seed(seed); - T::ArrayType::from( - repeat_with(|| T::Native::from_f32(rng.random::()).unwrap()) - .take(n) - .collect::>(), + >::from_iter_values( + repeat_with(|| T::Native::from_f32(rng.random::()).unwrap()).take(n), ) }