Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,20 @@ pub static BLOB_DESC_LANCE_FIELD: LazyLock<Field> =
pub static BLOB_V2_DESC_FIELDS: LazyLock<Fields> = LazyLock::new(|| {
Fields::from(vec![
ArrowField::new("kind", DataType::UInt8, false),
ArrowField::new("position", DataType::UInt64, true),
ArrowField::new("size", DataType::UInt64, true),
ArrowField::new("blob_id", DataType::UInt32, true),
ArrowField::new("blob_uri", DataType::Utf8, true),
ArrowField::new("position", DataType::UInt64, false),
ArrowField::new("size", DataType::UInt64, false),
ArrowField::new("blob_id", DataType::UInt32, false),
ArrowField::new("blob_uri", DataType::Utf8, false),
])
});

pub static BLOB_V2_DESC_TYPE: LazyLock<DataType> =
LazyLock::new(|| DataType::Struct(BLOB_V2_DESC_FIELDS.clone()));

pub static BLOB_V2_DESC_FIELD: LazyLock<ArrowField> = LazyLock::new(|| {
ArrowField::new("description", BLOB_V2_DESC_TYPE.clone(), true).with_metadata(HashMap::from([
ArrowField::new("description", BLOB_V2_DESC_TYPE.clone(), false).with_metadata(HashMap::from([
(lance_arrow::BLOB_META_KEY.to_string(), "true".to_string()),
("lance-encoding:packed".to_string(), "true".to_string()),
]))
});

Expand Down Expand Up @@ -415,3 +416,34 @@ pub fn lance_supports_nulls(datatype: &DataType) -> bool {
| DataType::FixedSizeList(_, _)
)
}

/// Physical storage mode for blob v2 descriptors (one byte, stored in the packed struct column).
///
/// The enum is `#[repr(u8)]` with explicit discriminants. Those numeric values are what the
/// blob v2 encoder appends to the descriptor's `kind` field (via `BlobKind::X as u8`) and what
/// `TryFrom<u8>` accepts when converting back, so renumbering a variant changes the stored byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum BlobKind {
/// Stored in the main data file's out-of-line buffer; `position`/`size` point into that file.
Inline = 0,
/// Stored in a shared packed blob file; `position`/`size` locate the slice, `blob_id` selects the file.
Packed = 1,
/// Stored in a dedicated raw blob file; `blob_id` identifies the file, `size` is the full file length.
Dedicated = 2,
/// Not stored by Lance; `blob_uri` holds an absolute external URI, offsets are zero.
External = 3,
}

impl TryFrom<u8> for BlobKind {
type Error = Error;

fn try_from(value: u8) -> Result<Self> {
match value {
0 => Ok(Self::Inline),
1 => Ok(Self::Packed),
2 => Ok(Self::Dedicated),
3 => Ok(Self::External),
other => Err(Error::InvalidInput {
source: format!("Unknown blob kind {other:?}").into(),
location: location!(),
}),
}
}
}
4 changes: 3 additions & 1 deletion rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -513,15 +513,17 @@ impl Field {
///
/// If the field is not a blob, return the field itself.
pub fn into_unloaded_with_version(mut self, version: BlobVersion) -> Self {
if self.data_type().is_binary_like() && self.is_blob() {
if self.is_blob() {
match version {
BlobVersion::V2 => {
self.logical_type = BLOB_V2_DESC_LANCE_FIELD.logical_type.clone();
self.children = BLOB_V2_DESC_LANCE_FIELD.children.clone();
self.metadata = BLOB_V2_DESC_LANCE_FIELD.metadata.clone();
}
BlobVersion::V1 => {
self.logical_type = BLOB_DESC_LANCE_FIELD.logical_type.clone();
self.children = BLOB_DESC_LANCE_FIELD.children.clone();
self.metadata = BLOB_DESC_LANCE_FIELD.metadata.clone();
}
}
}
Expand Down
138 changes: 106 additions & 32 deletions rust/lance-encoding/src/encodings/logical/blob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use crate::{
format::ProtobufUtils21,
repdef::{DefinitionInterpretation, RepDefBuilder},
};
use lance_core::datatypes::BlobKind;

/// Blob structural encoder - stores large binary data in external buffers
///
Expand Down Expand Up @@ -267,7 +268,7 @@ impl FieldEncoder for BlobV2StructuralEncoder {
&mut self,
array: ArrayRef,
external_buffers: &mut OutOfLineBuffers,
_repdef: RepDefBuilder,
mut repdef: RepDefBuilder,
row_number: u64,
num_rows: u64,
) -> Result<Vec<EncodeTask>> {
Expand All @@ -280,6 +281,11 @@ impl FieldEncoder for BlobV2StructuralEncoder {
};

let struct_arr = array.as_struct();
if let Some(validity) = struct_arr.nulls() {
repdef.add_validity_bitmap(validity.clone());
} else {
repdef.add_no_null(struct_arr.len());
}
let mut data_idx = None;
let mut uri_idx = None;
for (idx, field) in fields.iter().enumerate() {
Expand Down Expand Up @@ -310,12 +316,6 @@ impl FieldEncoder for BlobV2StructuralEncoder {
location: location!(),
});
}
if uri_is_set {
return Err(Error::NotSupported {
source: "External blob (uri) is not supported yet".into(),
location: location!(),
});
}
}

let binary_array = data_col;
Expand All @@ -327,34 +327,41 @@ impl FieldEncoder for BlobV2StructuralEncoder {
let mut blob_id_builder = PrimitiveBuilder::<UInt32Type>::with_capacity(binary_array.len());
let mut uri_builder = StringBuilder::with_capacity(binary_array.len(), 0);

for i in 0..binary_array.len() {
let is_null_row = match array.data_type() {
DataType::Struct(_) => array.is_null(i),
_ => binary_array.is_null(i),
};
if is_null_row {
kind_builder.append_null();
position_builder.append_null();
size_builder.append_null();
blob_id_builder.append_null();
uri_builder.append_null();
Comment on lines -330 to -340
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these changes forwards / backwards compatible?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I believe so. No blob v2 files have been written so far. It's part of file format 2.2

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I had forgotten we added a dedicated encoder / decoder for v2.

for i in 0..struct_arr.len() {
if struct_arr.is_null(i) {
// Packed struct does not support nullable fields; use empty/default values instead.
kind_builder.append_value(BlobKind::Inline as u8);
position_builder.append_value(0);
size_builder.append_value(0);
blob_id_builder.append_value(0);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like blob_id is always set to zero. Can you remind me what this is for again?

Copy link
Copy Markdown
Collaborator Author

@Xuanwo Xuanwo Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blob_id == 0 means it's empty (the valid value starts from 1).

And blob_id represents the id used by packed & dedicated blobs, which are both not implemented yet.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the !data_is_set branch then? I thought an inline blob was "data column is not null and uri is null" and a dedicated / packed blob is "data column is null and uri is not null"?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see the enum now. So the !data_is_set approach is external.

uri_builder.append_value("");
continue;
}

let value = binary_array.value(i);
kind_builder.append_value(0);

if value.is_empty() {
let data_is_set = !data_col.is_null(i);
if data_is_set {
let value = binary_array.value(i);
kind_builder.append_value(BlobKind::Inline as u8);
if value.is_empty() {
position_builder.append_value(0);
size_builder.append_value(0);
} else {
let position =
external_buffers.add_buffer(LanceBuffer::from(Buffer::from(value)));
position_builder.append_value(position);
size_builder.append_value(value.len() as u64);
}
blob_id_builder.append_value(0);
uri_builder.append_value("");
} else {
// external uri
let uri = uri_col.value(i);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check if the URI is empty?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, we can check before writing.

kind_builder.append_value(BlobKind::External as u8);
position_builder.append_value(0);
size_builder.append_value(0);
} else {
let position = external_buffers.add_buffer(LanceBuffer::from(Buffer::from(value)));
position_builder.append_value(position);
size_builder.append_value(value.len() as u64);
blob_id_builder.append_value(0);
uri_builder.append_value(uri);
}

blob_id_builder.append_null();
uri_builder.append_null();
}

let children: Vec<ArrayRef> = vec![
Expand All @@ -374,7 +381,7 @@ impl FieldEncoder for BlobV2StructuralEncoder {
self.descriptor_encoder.maybe_encode(
descriptor_array,
external_buffers,
RepDefBuilder::default(),
repdef,
row_number,
num_rows,
)
Expand Down Expand Up @@ -402,9 +409,16 @@ mod tests {
use crate::{
compression::DefaultCompressionStrategy,
encoder::{ColumnIndexSequence, EncodingOptions},
testing::{check_round_trip_encoding_of_data, TestCases},
testing::{
check_round_trip_encoding_of_data, check_round_trip_encoding_of_data_with_expected,
TestCases,
},
version::LanceFileVersion,
};
use arrow_array::{
ArrayRef, LargeBinaryArray, StringArray, StructArray, UInt32Array, UInt64Array, UInt8Array,
};
use arrow_array::LargeBinaryArray;
use arrow_schema::{DataType, Field as ArrowField};

#[test]
fn test_blob_encoder_creation() {
Expand Down Expand Up @@ -487,4 +501,64 @@ mod tests {
// Use the standard test harness
check_round_trip_encoding_of_data(vec![array], &TestCases::default(), blob_metadata).await;
}

#[tokio::test]
async fn test_blob_v2_external_round_trip() {
let blob_metadata =
HashMap::from([(lance_arrow::BLOB_META_KEY.to_string(), "true".to_string())]);

let data_field = Arc::new(ArrowField::new("data", DataType::LargeBinary, true));
let uri_field = Arc::new(ArrowField::new("uri", DataType::Utf8, true));

let data_array = LargeBinaryArray::from(vec![Some(b"inline".as_ref()), None, None]);
let uri_array = StringArray::from(vec![
None,
Some("file:///tmp/external.bin"),
Some("s3://bucket/blob"),
]);

let struct_array = StructArray::from(vec![
(data_field, Arc::new(data_array) as ArrayRef),
(uri_field, Arc::new(uri_array) as ArrayRef),
]);

let expected_descriptor = StructArray::from(vec![
(
Arc::new(ArrowField::new("kind", DataType::UInt8, false)),
Arc::new(UInt8Array::from(vec![
BlobKind::Inline as u8,
BlobKind::External as u8,
BlobKind::External as u8,
])) as ArrayRef,
),
(
Arc::new(ArrowField::new("position", DataType::UInt64, false)),
Arc::new(UInt64Array::from(vec![0, 0, 0])) as ArrayRef,
),
(
Arc::new(ArrowField::new("size", DataType::UInt64, false)),
Arc::new(UInt64Array::from(vec![6, 0, 0])) as ArrayRef,
),
(
Arc::new(ArrowField::new("blob_id", DataType::UInt32, false)),
Arc::new(UInt32Array::from(vec![0, 0, 0])) as ArrayRef,
),
(
Arc::new(ArrowField::new("blob_uri", DataType::Utf8, false)),
Arc::new(StringArray::from(vec![
"",
"file:///tmp/external.bin",
"s3://bucket/blob",
])) as ArrayRef,
),
]);

check_round_trip_encoding_of_data_with_expected(
vec![Arc::new(struct_array)],
Some(Arc::new(expected_descriptor)),
&TestCases::default().with_min_file_version(LanceFileVersion::V2_2),
blob_metadata,
)
.await;
}
}
Loading
Loading