From 5f03219cb592e51e3181ff59879e3ce12f49ed4b Mon Sep 17 00:00:00 2001
From: Xuanwo
Date: Wed, 10 Dec 2025 21:30:51 +0800
Subject: [PATCH] refactor: use the same path for dedicated and packed blob

Signed-off-by: Xuanwo
---
 rust/lance-core/src/utils/blob.rs | 29 ++++++++---------------------
 rust/lance/src/dataset/blob.rs    | 16 ++++++++--------
 2 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/rust/lance-core/src/utils/blob.rs b/rust/lance-core/src/utils/blob.rs
index 2e38c95ee93..b51a53895f6 100644
--- a/rust/lance-core/src/utils/blob.rs
+++ b/rust/lance-core/src/utils/blob.rs
@@ -3,38 +3,25 @@
 
 use object_store::path::Path;
 
-/// Format a dedicated blob sidecar path for a data file.
+/// Format a blob sidecar path for a data file.
 ///
-/// Layout: `<base>/<data_file_key>/<blob_id:08x>.raw`
+/// Layout: `<base>/<data_file_key>/<blob_id:08x>.blob`
 /// - `base` is typically the dataset's data directory.
 /// - `data_file_key` is the stem of the data file (without extension).
-pub fn dedicated_blob_path(base: &Path, data_file_key: &str, blob_id: u32) -> Path {
-    let file_name = format!("{:08x}.raw", blob_id);
+/// - `blob_id` is the hex-encoded identifier assigned during write.
+pub fn blob_path(base: &Path, data_file_key: &str, blob_id: u32) -> Path {
+    let file_name = format!("{:08x}.blob", blob_id);
     base.child(data_file_key).child(file_name.as_str())
 }
-/// Format a packed blob sidecar path for a data file.
-///
-/// Layout: `<base>/<data_file_key>/<blob_id:08x>.pack`
-pub fn pack_blob_path(base: &Path, data_file_key: &str, blob_id: u32) -> Path {
-    let file_name = format!("{:08x}.pack", blob_id);
-    base.child(data_file_key).child(file_name.as_str())
-}
 
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn test_dedicated_blob_path_formatting() {
-        let base = Path::from("base");
-        let path = dedicated_blob_path(&base, "deadbeef", 2);
-        assert_eq!(path.to_string(), "base/deadbeef/00000002.raw");
-    }
-
-    #[test]
-    fn test_pack_blob_path_formatting() {
+    fn test_blob_path_formatting() {
         let base = Path::from("base");
-        let path = pack_blob_path(&base, "cafebabe", 3);
-        assert_eq!(path.to_string(), "base/cafebabe/00000003.pack");
+        let path = blob_path(&base, "deadbeef", 2);
+        assert_eq!(path.to_string(), "base/deadbeef/00000002.blob");
     }
 }
diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs
index d56b9e2fb8a..7abf9c7d5f1 100644
--- a/rust/lance/src/dataset/blob.rs
+++ b/rust/lance/src/dataset/blob.rs
@@ -20,7 +20,7 @@ use super::take::TakeBuilder;
 use super::{Dataset, ProjectionRequest};
 use arrow_array::StructArray;
 use lance_core::datatypes::{BlobKind, BlobVersion};
-use lance_core::utils::blob::{dedicated_blob_path, pack_blob_path};
+use lance_core::utils::blob::blob_path;
 use lance_core::{utils::address::RowAddress, Error, Result};
 use lance_io::traits::Reader;
 
@@ -37,8 +37,8 @@ const INLINE_MAX: usize = 64 * 1024; // 64KB inline cutoff
 const DEDICATED_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB dedicated cutoff
 const PACK_FILE_MAX_SIZE: usize = 1024 * 1024 * 1024; // 1GiB per .pack sidecar
 
-// Maintains rolling `.pack` sidecar files for packed blobs.
-// Layout: data/{data_file_key}/{blob_id:08x}.pack where each file is an
+// Maintains rolling `.blob` sidecar files for packed blobs.
+// Layout: data/{data_file_key}/{blob_id:08x}.blob where each file is an
 // unframed concatenation of blob payloads; descriptors store (blob_id,
 // position, size) to locate each slice. A dedicated struct keeps path state
 // and rolling size separate from the per-batch preprocessor logic, so we can
@@ -67,7 +67,7 @@ impl PackWriter {
     }
 
     async fn start_new_pack(&mut self, blob_id: u32) -> Result<()> {
-        let path = pack_blob_path(&self.data_dir, &self.data_file_key, blob_id);
+        let path = blob_path(&self.data_dir, &self.data_file_key, blob_id);
         let writer = self.object_store.create(&path).await?;
         self.writer = Some(writer);
         self.current_blob_id = Some(blob_id);
@@ -75,7 +75,7 @@ impl PackWriter {
         Ok(())
     }
 
-    /// Append `data` to the current `.pack` file, rolling to a new file when
+    /// Append `data` to the current `.blob` file, rolling to a new file when
     /// `max_pack_size` would be exceeded.
     ///
     /// alloc_blob_id: called only when a new pack file is opened; returns the
@@ -156,7 +156,7 @@ impl BlobPreprocessor {
     }
 
    async fn write_dedicated(&mut self, blob_id: u32, data: &[u8]) -> Result {
-        let path = dedicated_blob_path(&self.data_dir, &self.data_file_key, blob_id);
+        let path = blob_path(&self.data_dir, &self.data_file_key, blob_id);
        let mut writer = self.object_store.create(&path).await?;
         writer.write_all(data).await?;
         writer.shutdown().await?;
@@ -732,7 +732,7 @@ async fn collect_blob_files_v2(
 
                 })?;
                 let data_file_key = data_file_key_from_path(data_file.path.as_str());
-                let path = dedicated_blob_path(&dataset.data_dir(), data_file_key, blob_id);
+                let path = blob_path(&dataset.data_dir(), data_file_key, blob_id);
                 files.push(BlobFile::new_dedicated(dataset.clone(), path, size));
             }
             BlobKind::Packed => {
@@ -754,7 +754,7 @@ async fn collect_blob_files_v2(
                     location: location!(),
                 })?;
                 let data_file_key = data_file_key_from_path(data_file.path.as_str());
-                let path = pack_blob_path(&dataset.data_dir(), data_file_key, blob_id);
+                let path = blob_path(&dataset.data_dir(), data_file_key, blob_id);
                 files.push(BlobFile::new_packed(dataset.clone(), path, position, size));
             }
             BlobKind::External => {