From 5c7054311a3a786be1e773bcc0f55ea94a30b0cb Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 16:16:21 +0800 Subject: [PATCH 01/24] feat: binary copy for compaction --- rust/lance-datagen/src/generator.rs | 5 +- rust/lance-file/src/writer.rs | 16 +- rust/lance/Cargo.toml | 4 + rust/lance/benches/binary_copy.rs | 140 +++ rust/lance/src/dataset/optimize.rs | 1517 ++++++++++++++++++++++++++- 5 files changed, 1653 insertions(+), 29 deletions(-) create mode 100644 rust/lance/benches/binary_copy.rs diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index bc319c1ed2e..4c56d5dbad6 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -2083,7 +2083,8 @@ pub mod array { use arrow_array::{ ArrowNativeTypeOp, BooleanArray, Date32Array, Date64Array, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, - TimestampMicrosecondArray, TimestampNanosecondArray, TimestampSecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, }; use arrow_schema::{IntervalUnit, TimeUnit}; use chrono::Utc; @@ -2518,7 +2519,7 @@ pub mod array { )) } DataType::Timestamp(TimeUnit::Millisecond, _) => { - Box::new(FnGen::::new_known_size( + Box::new(FnGen::::new_known_size( data_type, sample_fn, 1, width, )) } diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index d32cd6712e8..9099c3ecb29 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -485,6 +485,18 @@ impl FileWriter { self.schema_metadata.insert(key.into(), value.into()); } + pub fn initialize_with_external_metadata( + &mut self, + schema: lance_core::datatypes::Schema, + column_metadata: Vec, + rows_written: u64, + ) { + self.schema = Some(schema); + self.num_columns = column_metadata.len() as u32; + self.column_metadata = column_metadata; + self.rows_written = rows_written; + } + /// Adds a global buffer to 
the file /// /// The global buffer can contain any arbitrary bytes. It will be written to the disk @@ -585,7 +597,9 @@ impl FileWriter { .collect::>(); self.write_pages(encoding_tasks).await?; - self.finish_writers().await?; + if !self.column_writers.is_empty() { + self.finish_writers().await?; + } // 3. write global buffers (we write the schema here) let global_buffer_offsets = self.write_global_buffers().await?; diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index c422a5bcf45..5c77ca850d0 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -162,5 +162,9 @@ harness = false name = "random_access" harness = false +[[bench]] +name = "binary_copy" +harness = false + [lints] workspace = true diff --git a/rust/lance/benches/binary_copy.rs b/rust/lance/benches/binary_copy.rs new file mode 100644 index 00000000000..9808976b59f --- /dev/null +++ b/rust/lance/benches/binary_copy.rs @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +#![allow(clippy::print_stdout)] + +use std::sync::Arc; +use std::time::Duration; + +use arrow_array::types::{Float32Type, Float64Type, Int32Type, Int64Type}; +use arrow_schema::{DataType, Field, Fields, TimeUnit}; +use criterion::{criterion_group, criterion_main, Criterion}; +use lance::dataset::{optimize::CompactionOptions, Dataset, WriteParams}; +use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; +use tempfile::TempDir; + +const ROW_NUM: usize = 5_000_000; + +fn bench_binary_copy(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + + let temp = rt.block_on(prepare_dataset_on_disk(ROW_NUM)); + let dataset_path = temp.path().join("binary-copy-bench.lance"); + let dataset = rt.block_on(async { Dataset::open(dataset_path.to_str().unwrap()).await.unwrap() }); + let dataset = Arc::new(dataset); + + let mut group = c.benchmark_group("binary_copy_compaction"); + group.sample_size(1); + 
group.measurement_time(Duration::from_secs(600)); + + group.bench_function("full_compaction", |b| { + let dataset = dataset.clone(); + b.to_async(&rt).iter(move || { + let dataset = dataset.clone(); + async move { + let mut ds = dataset.checkout_version(1).await.unwrap(); + ds.restore().await.unwrap(); + let options = CompactionOptions { enable_binary_copy: false, ..Default::default() }; + let _metrics = lance::dataset::optimize::compact_files(&mut ds, options, None) + .await + .unwrap(); + } + }); + }); + + group.bench_function("binary_copy_compaction", |b| { + let dataset = dataset.clone(); + b.to_async(&rt).iter(move || { + let dataset = dataset.clone(); + async move { + let mut ds = dataset.checkout_version(1).await.unwrap(); + ds.restore().await.unwrap(); + let options = CompactionOptions { enable_binary_copy: true, ..Default::default() }; + let _metrics = lance::dataset::optimize::compact_files(&mut ds, options, None) + .await + .unwrap(); + } + }); + }); + + group.finish(); +} + +async fn prepare_dataset_on_disk(row_num: usize) -> TempDir { + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(16), true), + Field::new("bin", DataType::Binary, true), + ]); + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", DataType::Binary, true), + ]); + + let reader = gen_batch() + .col("vec1", array::rand_vec::(Dimension::from(12))) + .col("vec2", array::rand_vec::(Dimension::from(8))) + .col("i32", array::step::()) + .col("i64", array::step::()) + .col("f32", array::rand::()) + .col("f64", array::rand::()) + .col("bool", array::rand_boolean()) + .col("date32", array::rand_date32()) + .col("date64", array::rand_date64()) + .col( + "ts_ms", + 
array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), + ) + .col("utf8", array::rand_utf8(lance_datagen::ByteCount::from(16), false)) + .col("large_utf8", array::random_sentence(1, 6, true)) + .col("bin", array::rand_fixedbin(lance_datagen::ByteCount::from(24), false)) + .col("large_bin", array::rand_fixedbin(lance_datagen::ByteCount::from(24), true)) + .col( + "varbin", + array::rand_varbin( + lance_datagen::ByteCount::from(8), + lance_datagen::ByteCount::from(32), + ), + ) + .col("fsb16", array::rand_fsb(16)) + .col("struct_simple", array::rand_struct(inner_fields.clone())) + .col("struct_nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields.clone()), true), + ) + .into_reader_rows(RowCount::from(row_num as u64), BatchCount::from(10)); + + let tmp = TempDir::new().unwrap(); + let path = tmp.path().join("binary-copy-bench.lance"); + let uri = path.to_str().unwrap(); + + Dataset::write( + reader, + uri, + Some(WriteParams { + max_rows_per_file: (row_num / 100) as usize, + ..Default::default() + }), + ) + .await + .expect("failed to write dataset"); + + tmp +} + +#[cfg(target_os = "linux")] +criterion_group!( + name=benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_binary_copy); +#[cfg(not(target_os = "linux"))] +criterion_group!( + name=benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_binary_copy); +criterion_main!(benches); diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 8d98310194b..030a8463cd9 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -91,19 +91,21 @@ use super::rowids::load_row_id_sequences; use super::transaction::{Operation, RewriteGroup, RewrittenIndex, Transaction}; use super::utils::make_rowid_capture_stream; use super::{write_fragments_internal, WriteMode, WriteParams}; +use 
crate::dataset::utils::CapturedRowIds; use crate::io::commit::{commit_transaction, migrate_fragments}; use crate::Dataset; use crate::Result; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::SendableRecordBatchStream; use futures::{StreamExt, TryStreamExt}; +use lance_arrow::DataTypeExt; use lance_core::datatypes::BlobHandling; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::utils::tracing::{DATASET_COMPACTING_EVENT, TRACE_DATASET_EVENTS}; use lance_core::Error; use lance_index::frag_reuse::FragReuseGroup; use lance_index::DatasetIndexExt; -use lance_table::format::{Fragment, RowIdMeta}; +use lance_table::format::{DataFile, Fragment, RowIdMeta}; use roaring::{RoaringBitmap, RoaringTreemap}; use serde::{Deserialize, Serialize}; use snafu::location; @@ -111,9 +113,24 @@ use tracing::info; pub mod remapping; +use super::rowids::load_row_id_sequence; +use crate::dataset::fragment::write::generate_random_filename; use crate::index::frag_reuse::build_new_frag_reuse_index; use crate::io::deletion::read_dataset_deletion_file; +use lance_core::datatypes::Schema; +use lance_encoding::decoder::{ColumnInfo, PageEncoding, PageInfo as DecPageInfo}; +use lance_encoding::version::LanceFileVersion; +use lance_file::format::pbfile; +use lance_file::reader::FileReader as LFReader; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_io::object_writer::ObjectWriter; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::traits::Writer; +use lance_table::rowids::{write_row_ids, RowIdSequence}; +use prost::Message; +use prost_types::Any; pub use remapping::{IgnoreRemap, IndexRemapper, IndexRemapperOptions, RemappedIndex}; +use tokio::io::AsyncWriteExt; /// Options to be passed to [compact_files]. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] @@ -156,6 +173,11 @@ pub struct CompactionOptions { /// not be remapped during this compaction operation. 
Instead, the fragment reuse index /// is updated and will be used to perform remapping later. pub defer_index_remap: bool, + /// Whether to enable binary copy optimization when eligible. + /// Defaults to false. + pub enable_binary_copy: bool, + pub enable_binary_copy_force: bool, + pub binary_copy_read_batch_bytes: Option, } impl Default for CompactionOptions { @@ -170,6 +192,9 @@ impl Default for CompactionOptions { max_bytes_per_file: None, batch_size: None, defer_index_remap: false, + enable_binary_copy: false, + enable_binary_copy_force: false, + binary_copy_read_batch_bytes: Some(16 * 1024 * 1024), } } } @@ -183,6 +208,95 @@ impl CompactionOptions { } } +/// Determine if page-level binary copy can safely merge the provided fragments. +/// +/// Preconditions checked in order: +/// - Feature flag `enable_binary_copy` is enabled +/// - Dataset storage format is non-legacy +/// - Fragment list is non-empty +/// - All data files share identical Lance file versions +/// - No fragment has a deletion file +/// TODO need to support schema evolution case like add column and drop column +/// - All data files share identical schema mappings (`fields`, `column_indices`) +fn can_use_binary_copy( + dataset: &Dataset, + options: &CompactionOptions, + fragments: &[Fragment], +) -> bool { + use lance_file::version::LanceFileVersion; + if !options.enable_binary_copy { + return false; + } + + // not support blob column for now + let has_blob_columns = dataset + .schema() + .fields_pre_order() + .any(|field| field.is_blob()); + if has_blob_columns { + return false; + } + + // Check dataset storage version is supported + // Binary copy is not supported for legacy Lance file format + let storage_ok = dataset + .manifest + .data_storage_format + .lance_file_version() + .map(|v| !matches!(v.resolve(), LanceFileVersion::Legacy)) + .unwrap_or(false); + if !storage_ok { + return false; + } + + if fragments.is_empty() { + return false; + } + + // Establish version baseline from first 
data file + let first_data_file_version = LanceFileVersion::try_from_major_minor( + fragments[0].files[0].file_major_version, + fragments[0].files[0].file_minor_version, + ) + .map(|v| v.resolve()) + .unwrap(); + // Capture schema mapping baseline from first data file + let ref_fields = &fragments[0].files[0].fields; + let ref_cols = &fragments[0].files[0].column_indices; + // Single-pass verification across fragments and their files + let mut is_same_version = true; + + for fragment in fragments { + // Reject fragments with deletions (binary copy does not materialize deletions) + if fragment.deletion_file.is_some() { + return false; + } + + // Check version and schema mapping equality for each data file + for data_file in &fragment.files { + let version_ok = LanceFileVersion::try_from_major_minor( + data_file.file_major_version, + data_file.file_minor_version, + ) + .map(|v| v.resolve()) + .is_ok_and(|v| v == first_data_file_version); + + if !version_ok { + is_same_version = false; + } + // Schema mapping must match exactly across all files + if data_file.fields != *ref_fields || data_file.column_indices != *ref_cols { + return false; + } + } + } + + if !is_same_version { + return false; + } + true +} + /// Metrics returned by [compact_files]. #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct CompactionMetrics { @@ -226,6 +340,13 @@ pub async fn compact_files( let compaction_plan: CompactionPlan = plan_compaction(dataset, &options).await?; + if compaction_plan.tasks().is_empty() && options.enable_binary_copy_force { + return Err(Error::NotSupported { + source: "not execute binary copy compaction task".into(), + location: location!(), + }); + } + // If nothing to compact, don't make a commit. if compaction_plan.tasks().is_empty() { return Ok(CompactionMetrics::default()); @@ -322,6 +443,64 @@ impl CompactionPlan { } } +/// Build a scan reader for rewrite and optionally capture row IDs. 
+/// +/// Parameters: +/// - `dataset`: Dataset handle used to create the scanner. +/// - `fragments`: When `with_frags` is true, restrict the scan to these old fragments +/// and preserve insertion order. +/// - `batch_size`: Optional batch size; if provided, set it on the scanner to control +/// read batching. +/// - `with_frags`: Whether to scan only the specified old fragments and force +/// in-order reading. +/// - `capture_row_ids`: When index remapping is needed, include and capture the +/// `_rowid` column from the stream. +/// +/// Returns: +/// - `SendableRecordBatchStream`: The batch stream (with `_rowid` removed if captured) +/// to feed the rewrite path. +/// - `Option>`: A receiver to obtain captured row IDs after the +/// stream completes; `None` if not capturing. +async fn prepare_reader( + dataset: &Dataset, + fragments: &[Fragment], + batch_size: Option, + with_frags: bool, + capture_row_ids: bool, +) -> Result<( + SendableRecordBatchStream, + Option>, +)> { + let mut scanner = dataset.scan(); + let has_blob_columns = dataset + .schema() + .fields_pre_order() + .any(|field| field.is_blob()); + if has_blob_columns { + scanner.blob_handling(BlobHandling::AllBinary); + } + if let Some(bs) = batch_size { + scanner.batch_size(bs); + } + if with_frags { + scanner + .with_fragments(fragments.to_vec()) + .scan_in_order(true); + } + if capture_row_ids { + scanner.with_row_id(); + let data = SendableRecordBatchStream::from(scanner.try_into_stream().await?); + let (data_no_row_ids, rx) = + make_rowid_capture_stream(data, dataset.manifest.uses_stable_row_ids())?; + Ok((data_no_row_ids, Some(rx))) + } else { + Ok(( + SendableRecordBatchStream::from(scanner.try_into_stream().await?), + None, + )) + } +} + /// A single group of fragments to compact, which is a view into the compaction /// plan. We keep the `replace_range` indices so we can map the result of the /// compact back to the fragments it replaces. 
@@ -672,6 +851,7 @@ async fn rewrite_files( .sum::(); // If we aren't using stable row ids, then we need to remap indices. let needs_remapping = !dataset.manifest.uses_stable_row_ids(); + let mut new_fragments: Vec; let mut scanner = dataset.scan(); let has_blob_columns = dataset .schema() @@ -683,7 +863,6 @@ async fn rewrite_files( if let Some(batch_size) = options.batch_size { scanner.batch_size(batch_size); } - // Generate an ID for logging purposes let task_id = uuid::Uuid::new_v4(); log::info!( "Compaction task {}: Begin compacting {} rows across {} fragments", @@ -691,19 +870,35 @@ async fn rewrite_files( num_rows, fragments.len() ); - scanner - .with_fragments(fragments.clone()) - .scan_in_order(true); - let (row_ids_rx, reader) = if needs_remapping { - scanner.with_row_id(); - let data = SendableRecordBatchStream::from(scanner.try_into_stream().await?); - let (data_no_row_ids, row_id_rx) = - make_rowid_capture_stream(data, dataset.manifest.uses_stable_row_ids())?; - (Some(row_id_rx), data_no_row_ids) + let can_binary_copy = can_use_binary_copy(dataset.as_ref(), options, &fragments); + if !can_binary_copy && options.enable_binary_copy_force { + return Err(Error::NotSupported { + source: format!("compaction task {}: binary copy is not supported", task_id).into(), + location: location!(), + }); + } + let mut row_ids_rx; + + let (reader, rx_initial) = if !can_binary_copy { + prepare_reader( + dataset.as_ref(), + &fragments, + options.batch_size, + true, + needs_remapping, + ) + .await? } else { - let data = SendableRecordBatchStream::from(scanner.try_into_stream().await?); - (None, data) + prepare_reader( + dataset.as_ref(), + &fragments, + options.batch_size, + false, + false, + ) + .await? 
}; + row_ids_rx = rx_initial; let mut rows_read = 0; let schema = reader.schema(); @@ -732,16 +927,73 @@ async fn rewrite_files( params.enable_stable_row_ids = true; } - let (mut new_fragments, _) = write_fragments_internal( - Some(dataset.as_ref()), - dataset.object_store.clone(), - &dataset.base, - dataset.schema().clone(), - reader, - params, - None, // Compaction doesn't use target_bases - ) - .await?; + if can_binary_copy { + new_fragments = rewrite_files_binary_copy( + dataset.as_ref(), + &fragments, + ¶ms, + options.binary_copy_read_batch_bytes, + ) + .await?; + + if new_fragments.is_empty() && options.enable_binary_copy_force { + return Err(Error::NotSupported { + source: format!("compaction task {}: binary copy is not supported", task_id).into(), + location: location!(), + }); + } + + if new_fragments.is_empty() { + // rollback to common compaction if binary copy not supported + let (reader_fallback, rx_fb) = prepare_reader( + dataset.as_ref(), + &fragments, + options.batch_size, + true, + needs_remapping, + ) + .await?; + row_ids_rx = rx_fb; + let (frags, _) = write_fragments_internal( + Some(dataset.as_ref()), + dataset.object_store.clone(), + &dataset.base, + dataset.schema().clone(), + reader_fallback, + params, + None, + ) + .await?; + new_fragments = frags; + } else if needs_remapping { + let (tx, rx) = std::sync::mpsc::channel(); + let mut addrs = RoaringTreemap::new(); + for frag in &fragments { + let frag_id = frag.id as u32; + let count = frag.physical_rows.unwrap_or(0); + for i in 0..count { + let addr = + lance_core::utils::address::RowAddress::new_from_parts(frag_id, i as u32); + addrs.insert(u64::from(addr)); + } + } + let captured = CapturedRowIds::AddressStyle(addrs); + let _ = tx.send(captured); + row_ids_rx = Some(rx); + } + } else { + let (frags, _) = write_fragments_internal( + Some(dataset.as_ref()), + dataset.object_store.clone(), + &dataset.base, + dataset.schema().clone(), + reader, + params, + None, + ) + .await?; + new_fragments = 
frags; + } log::info!("Compaction task {}: file written", task_id); @@ -768,9 +1020,9 @@ async fn rewrite_files( (Some(row_id_map), None) } } else { - log::info!("Compaction task {}: rechunking stable row ids", task_id); - rechunk_stable_row_ids(dataset.as_ref(), &mut new_fragments, &fragments).await?; if dataset.manifest.uses_stable_row_ids() { + log::info!("Compaction task {}: rechunking stable row ids", task_id); + rechunk_stable_row_ids(dataset.as_ref(), &mut new_fragments, &fragments).await?; recalc_versions_for_rewritten_fragments( dataset.as_ref(), &mut new_fragments, @@ -973,6 +1225,515 @@ async fn recalc_versions_for_rewritten_fragments( Ok(()) } +async fn rewrite_files_binary_copy( + dataset: &Dataset, + fragments: &[Fragment], + params: &WriteParams, + read_batch_bytes_opt: Option, +) -> Result> { + // Binary copy algorithm overview: + // - Reads page and buffer regions directly from source files in bounded batches + // - Appends them to a new output file with alignment, updating offsets + // - Recomputes page priorities by adding the cumulative row count to preserve order + // - For v2_0, enforces single-page structural header columns when closing a file + // - Writes a new footer (schema descriptor, column metadata, offset tables, version) + // - Optionally carries forward stable row ids and persists them inline in fragment metadata + // Merge small Lance files into larger ones by page-level binary copy. + let schema = dataset.schema().clone(); + let full_field_ids = schema.field_ids(); + + // The previous checks have ensured that the file versions of all files are consistent. 
+ let version = LanceFileVersion::try_from_major_minor( + fragments[0].files[0].file_major_version, + fragments[0].files[0].file_minor_version, + ) + .unwrap() + .resolve(); + // v2_0 compatibility: column layout differs across file versions + // - v2_0 materializes BOTH leaf columns and non-leaf structural headers (e.g., Struct / List) + // which means the ColumnInfo set includes all fields in pre-order traversal. + // - v2_1+ materializes ONLY leaf columns. Non-leaf structural headers are not stored as columns. + // As a result, the ColumnInfo set contains leaf fields only. + // To correctly align copy layout, we derive `column_count` by version: + // - v2_0: use total number of fields in pre-order (leaf + non-leaf headers) + // - v2_1+: use only the number of leaf fields + let leaf_count = schema.fields_pre_order().filter(|f| f.is_leaf()).count(); + let column_count = if version == LanceFileVersion::V2_0 { + schema.fields_pre_order().count() + } else { + leaf_count + }; + + // v2_0 compatibility: build a map to identify non-leaf structural header columns + // - In v2_0 these headers exist as columns and must have a single page + // - In v2_1+ these headers are not stored as columns and this map is unused + let mut is_non_leaf_column: Vec = vec![false; column_count]; + if version == LanceFileVersion::V2_0 { + for (col_idx, field) in schema.fields_pre_order().enumerate() { + // Only mark non-packed Struct fields (lists remain as leaf data carriers) + let is_non_leaf = field.data_type().is_struct() && !field.is_packed_struct(); + is_non_leaf_column[col_idx] = is_non_leaf; + } + } + + let mut out: Vec = Vec::new(); + let mut current_writer: Option = None; + let mut current_filename: Option = None; + let mut current_pos: u64 = 0; + let mut current_page_table: Vec = Vec::new(); + + // Column-list> + let mut col_pages: Vec> = std::iter::repeat_with(Vec::::new) + .take(column_count) + .collect(); + let mut col_buffers: Vec> = vec![Vec::new(); column_count]; + let mut 
total_rows_in_current: u64 = 0; + let max_rows_per_file = params.max_rows_per_file as u64; + let uses_stable_row_ids = dataset.manifest.uses_stable_row_ids(); + let mut current_row_ids = RowIdSequence::new(); + + // Align all writes to 64-byte boundaries to honor typical IO alignment and + // keep buffer offsets valid across concatenated pages. + const ALIGN: usize = 64; + static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); + let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); + // Visit each fragment and all of its data files (a fragment may contain multiple files) + for frag in fragments.iter() { + let mut frag_row_ids_offset: u64 = 0; + let frag_row_ids = if uses_stable_row_ids { + Some(load_row_id_sequence(dataset, frag).await?) + } else { + None + }; + for df in frag.files.iter() { + let object_store = if let Some(base_id) = df.base_id { + dataset.object_store_for_base(base_id).await? + } else { + dataset.object_store.clone() + }; + let full_path = dataset.data_file_dir(df)?.child(df.path.as_str()); + let scan_scheduler = ScanScheduler::new( + object_store.clone(), + SchedulerConfig::max_bandwidth(&object_store), + ); + let file_scheduler = scan_scheduler + .open_file_with_priority(&full_path, 0, &df.file_size_bytes) + .await?; + let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; + let src_collum_infos = file_meta.column_infos.clone(); + // Initialize current_page_table + if current_page_table.is_empty() { + current_page_table = src_collum_infos + .iter() + .map(|column_index| ColumnInfo { + index: column_index.index, + buffer_offsets_and_sizes: Arc::from( + Vec::<(u64, u64)>::new().into_boxed_slice(), + ), + page_infos: Arc::from(Vec::::new().into_boxed_slice()), + encoding: column_index.encoding.clone(), + }) + .collect(); + } + + // Iterate through each column of the current data file of the current fragment + for (col_idx, src_column_info) in src_collum_infos.iter().enumerate() { + // v2_0 compatibility: 
special handling for non-leaf structural header columns + // - v2_0 expects structural header columns to have a SINGLE page; they carry layout + // metadata only and are not true data carriers. + // - When merging multiple input files via binary copy, naively appending pages would + // yield multiple pages for the same structural header column, violating v2_0 rules. + // - To preserve v2_0 invariants, we skip pages beyond the first one for these columns. + // - During finalization we also normalize the single remaining page’s `num_rows` to the + // total number of rows in the output file and reset `priority` to 0. + // - For v2_1+ this logic does not apply because non-leaf headers are not stored as columns. + let is_non_leaf = col_idx < is_non_leaf_column.len() && is_non_leaf_column[col_idx]; + if is_non_leaf && !col_pages[col_idx].is_empty() { + continue; + } + + if current_writer.is_none() { + let filename = format!("{}.lance", generate_random_filename()); + let path = dataset.base.child(super::DATA_DIR).child(filename.as_str()); + let writer = dataset.object_store.create(&path).await?; + current_writer = Some(writer); + current_filename = Some(filename); + current_pos = 0; + } + + let read_batch_bytes: u64 = read_batch_bytes_opt.unwrap_or(16 * 1024 * 1024) as u64; + + let mut page_index = 0; + + // Iterate through each page of the current column in the current data file of the current fragment + while page_index < src_column_info.page_infos.len() { + let mut batch_ranges: Vec> = Vec::new(); + let mut batch_counts: Vec = Vec::new(); + let mut batch_bytes: u64 = 0; + let mut batch_pages: usize = 0; + // Build a single read batch by coalescing consecutive pages up to + // `read_batch_bytes` budget: + // - Accumulate total bytes (`batch_bytes`) and page count (`batch_pages`). + // - For each page, append its buffer ranges to `batch_ranges` and record + // the number of buffers in `batch_counts` so returned bytes can be + // mapped back to page boundaries. 
+ // - Stop when adding the next page would exceed the byte budget, then + // issue one I/O request for the collected ranges. + // - Advance `page_index` to reflect pages scheduled in this batch. + for current_page in &src_column_info.page_infos[page_index..] { + let page_bytes: u64 = current_page + .buffer_offsets_and_sizes + .iter() + .map(|(_, size)| *size) + .sum(); + let would_exceed = + batch_pages > 0 && (batch_bytes + page_bytes > read_batch_bytes); + if would_exceed { + break; + } + batch_counts.push(current_page.buffer_offsets_and_sizes.len()); + for (offset, size) in current_page.buffer_offsets_and_sizes.iter() { + batch_ranges.push((*offset)..(*offset + *size)); + } + batch_bytes += page_bytes; + batch_pages += 1; + page_index += 1; + } + + let bytes_vec = if batch_ranges.is_empty() { + Vec::new() + } else { + // read many buffers at once + file_scheduler.submit_request(batch_ranges, 0).await? + }; + let mut bytes_iter = bytes_vec.into_iter(); + + for (local_idx, buffer_count) in batch_counts.iter().enumerate() { + // Reconstruct the absolute page index within the source column: + // - `page_index` now points to the page position + // - `batch_pages` is how many pages we included in this batch + // - `local_idx` enumerates pages inside the batch [0..batch_pages) + // Therefore `page_index - batch_pages + local_idx` yields the exact + // source page we are currently materializing, allowing us to access + // its metadata (encoding, row count, buffers) for the new page entry. 
+ let page = + &src_column_info.page_infos[page_index - batch_pages + local_idx]; + let mut new_offsets = Vec::with_capacity(*buffer_count); + for _ in 0..*buffer_count { + if let Some(bytes) = bytes_iter.next() { + let writer = current_writer.as_mut().unwrap(); + let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; + if pad != 0 { + writer.write_all(&zero_buf[..pad]).await?; + current_pos += pad as u64; + } + let start = current_pos; + writer.write_all(&bytes).await?; + current_pos += bytes.len() as u64; + new_offsets.push((start, bytes.len() as u64)); + } + } + + // manual clone encoding + let encoding = if page.encoding.is_structural() { + PageEncoding::Structural(page.encoding.as_structural().clone()) + } else { + PageEncoding::Legacy(page.encoding.as_legacy().clone()) + }; + // `priority` acts as the global row offset for this page, ensuring + // downstream iterators maintain the correct logical order across + // merged inputs. + let new_page_info = DecPageInfo { + num_rows: page.num_rows, + priority: page.priority + total_rows_in_current, + encoding, + buffer_offsets_and_sizes: Arc::from(new_offsets.into_boxed_slice()), + }; + col_pages[col_idx].push(new_page_info); + } + } // finished scheduling & copying pages for this column in the current source file + + // Copy column-level buffers (outside page data) with alignment + if !src_column_info.buffer_offsets_and_sizes.is_empty() { + let ranges: Vec> = src_column_info + .buffer_offsets_and_sizes + .iter() + .map(|(offset, size)| (*offset)..(*offset + *size)) + .collect(); + let bytes_vec = file_scheduler.submit_request(ranges, 0).await?; + for bytes in bytes_vec.into_iter() { + let writer = current_writer.as_mut().unwrap(); + let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; + if pad != 0 { + writer.write_all(&zero_buf[..pad]).await?; + current_pos += pad as u64; + } + let start = current_pos; + writer.write_all(&bytes).await?; + current_pos += bytes.len() as u64; + 
col_buffers[col_idx].push((start, bytes.len() as u64)); + } + } + } // finished all columns in the current source file + + if uses_stable_row_ids { + // When stable row IDs are enabled, incorporate the fragment's row IDs + if let Some(seq) = frag_row_ids.as_ref() { + // Number of rows in the current source file + let count = file_meta.num_rows as usize; + + // Take the subsequence of row IDs corresponding to this file + let slice = seq.slice(frag_row_ids_offset as usize, count); + + // Materialize the slice into a Vec for conversion + // NOTE: This allocation can be avoided by extending with `slice` directly. + let ids_vec: Vec = slice.iter().collect(); + + // Append these row IDs to the accumulated sequence for the current output + current_row_ids.extend(RowIdSequence::from(ids_vec.as_slice())); + + // Advance the offset so the next file reads the subsequent row IDs + frag_row_ids_offset += count as u64; + } + } + + // Accumulate rows for the current output file and flush when reaching the threshold + total_rows_in_current += file_meta.num_rows; + if total_rows_in_current >= max_rows_per_file { + // v2_0 compatibility: enforce single-page structural headers before file close + // - We truncate to a single page and rewrite the page’s `num_rows` to match the output + // file’s row count so downstream decoders see a consistent header. 
+ let mut final_cols: Vec> = Vec::with_capacity(column_count); + for (i, column_info) in current_page_table.iter().enumerate() { + // For v2_0 struct headers, force a single page and set num_rows to total + let mut pages_vec = std::mem::take(&mut col_pages[i]); + if version == LanceFileVersion::V2_0 + && is_non_leaf_column.get(i).copied().unwrap_or(false) + && !pages_vec.is_empty() + { + pages_vec[0].num_rows = total_rows_in_current; + pages_vec[0].priority = 0; + pages_vec.truncate(1); + } + let pages_arc = Arc::from(pages_vec.into_boxed_slice()); + let buffers_vec = std::mem::take(&mut col_buffers[i]); + final_cols.push(Arc::new(ColumnInfo::new( + column_info.index, + pages_arc, + buffers_vec, + column_info.encoding.clone(), + ))); + } + let writer = current_writer.take().unwrap(); + flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; + + // Register the newly closed output file as a fragment data file + let (maj, min) = version.to_numbers(); + let mut fragment_out = Fragment::new(0); + let mut data_file_out = + DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); + // v2_0 vs v2_1+ field-to-column index mapping + // - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping + // - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index + let is_structural = version >= LanceFileVersion::V2_1; + let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); + let mut curr_col_idx: i32 = 0; + for field in schema.fields_pre_order() { + if field.is_packed_struct() || field.children.is_empty() || !is_structural { + field_column_indices.push(curr_col_idx); + curr_col_idx += 1; + } else { + field_column_indices.push(-1); + } + } + data_file_out.fields = full_field_ids.clone(); + data_file_out.column_indices = field_column_indices; + fragment_out.files.push(data_file_out); + fragment_out.physical_rows = Some(total_rows_in_current as usize); + if uses_stable_row_ids { 
+ fragment_out.row_id_meta = + Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); + } + // Reset state for next output file + current_writer = None; + current_pos = 0; + current_page_table.clear(); + for v in col_pages.iter_mut() { + v.clear(); + } + for v in col_buffers.iter_mut() { + v.clear(); + } + out.push(fragment_out); + total_rows_in_current = 0; + if uses_stable_row_ids { + current_row_ids = RowIdSequence::new(); + } + } + } + } // Complete the writing of all fragments, except for some data remaining in memory + + if total_rows_in_current > 0 { + // Flush remaining rows as a final output file + // v2_0 compatibility: same single-page enforcement applies for the final file close + let mut final_cols: Vec<Arc<ColumnInfo>> = Vec::with_capacity(column_count); + for (i, ci) in current_page_table.iter().enumerate() { + // For v2_0 struct headers, force a single page and set num_rows to total + let mut pages_vec = std::mem::take(&mut col_pages[i]); + if version == LanceFileVersion::V2_0 + && is_non_leaf_column.get(i).copied().unwrap_or(false) + && !pages_vec.is_empty() + { + pages_vec[0].num_rows = total_rows_in_current; + pages_vec[0].priority = 0; + pages_vec.truncate(1); + } + let pages_arc = Arc::from(pages_vec.into_boxed_slice()); + let buffers_vec = std::mem::take(&mut col_buffers[i]); + final_cols.push(Arc::new(ColumnInfo::new( + ci.index, + pages_arc, + buffers_vec, + ci.encoding.clone(), + ))); + } + if current_writer.is_none() { + let filename = format!("{}.lance", generate_random_filename()); + let path = dataset.base.child(super::DATA_DIR).child(filename.as_str()); + let writer = dataset.object_store.create(&path).await?; + current_writer = Some(writer); + current_filename = Some(filename); + } + let writer = current_writer.take().unwrap(); + flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; + // Register the final file + let (maj, min) = version.to_numbers(); + let mut frag = Fragment::new(0); + let mut df =
DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); + // v2_0 vs v2_1+ field-to-column index mapping for the final file + let is_structural = version >= LanceFileVersion::V2_1; + let mut field_column_indices: Vec<i32> = Vec::with_capacity(full_field_ids.len()); + let mut curr_col_idx: i32 = 0; + for field in schema.fields_pre_order() { + if field.is_packed_struct() || field.children.is_empty() || !is_structural { + field_column_indices.push(curr_col_idx); + curr_col_idx += 1; + } else { + field_column_indices.push(-1); + } + } + df.fields = full_field_ids.clone(); + df.column_indices = field_column_indices; + frag.files.push(df); + frag.physical_rows = Some(total_rows_in_current as usize); + if uses_stable_row_ids { + frag.row_id_meta = Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); + } + out.push(frag); + } + Ok(out) +} + +/// Finalizes a compacted data file by writing the Lance footer via `FileWriter`. +/// +/// This function does not manually craft the footer. Instead it: +/// - Pads the current `ObjectWriter` position to a 64‑byte boundary (required for v2_1+ readers). +/// - Converts the collected per‑column info (`final_cols`) into `ColumnMetadata`. +/// - Constructs a `lance_file::writer::FileWriter` with the active `schema`, column metadata, +/// and `total_rows_in_current`. +/// - Calls `FileWriter::finish()` to emit column metadata, offset tables, global buffers +/// (schema descriptor), version, and to close the writer. +/// +/// Preconditions: +/// - All page data and column‑level buffers referenced by `final_cols` have already been written +/// to `writer`; otherwise offsets in the footer will be invalid. +/// +/// Version notes: +/// - v2_0 structural single‑page enforcement is handled when building `final_cols`; this function +/// only performs consistent finalization.
+async fn flush_footer( + mut writer: ObjectWriter, + schema: &Schema, + final_cols: &[Arc<ColumnInfo>], + total_rows_in_current: u64, + version: LanceFileVersion, +) -> Result<()> { + if version >= LanceFileVersion::V2_1 { + const ALIGN: usize = 64; + static ZERO_BUFFER: std::sync::OnceLock<Vec<u8>> = std::sync::OnceLock::new(); + let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); + let pos = writer.tell().await? as u64; + let pad = (ALIGN as u64 - (pos % ALIGN as u64)) % ALIGN as u64; + if pad != 0 { + writer.write_all(&zero_buf[..pad as usize]).await?; + } + } + let mut col_metadatas = Vec::with_capacity(final_cols.len()); + for col in final_cols { + let pages = col + .page_infos + .iter() + .map(|page_info| { + let encoded_encoding = match &page_info.encoding { + PageEncoding::Legacy(array_encoding) => { + Any::from_msg(array_encoding)?.encode_to_vec() + } + PageEncoding::Structural(page_layout) => { + Any::from_msg(page_layout)?.encode_to_vec() + } + }; + let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = page_info + .buffer_offsets_and_sizes + .as_ref() + .iter() + .cloned() + .unzip(); + Ok(pbfile::column_metadata::Page { + buffer_offsets, + buffer_sizes, + encoding: Some(pbfile::Encoding { + location: Some(pbfile::encoding::Location::Direct( + pbfile::DirectEncoding { + encoding: encoded_encoding, + }, + )), + }), + length: page_info.num_rows, + priority: page_info.priority, + }) + }) + .collect::<Result<Vec<_>>>()?; + let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = + col.buffer_offsets_and_sizes.iter().cloned().unzip(); + let encoded_col_encoding = Any::from_msg(&col.encoding)?.encode_to_vec(); + let column = pbfile::ColumnMetadata { + pages, + buffer_offsets, + buffer_sizes, + encoding: Some(pbfile::Encoding { + location: Some(pbfile::encoding::Location::Direct(pbfile::DirectEncoding { + encoding: encoded_col_encoding, + })), + }), + }; + col_metadatas.push(column); + } + let mut file_writer = FileWriter::new_lazy( + writer, + FileWriterOptions { + format_version:
Some(version), + ..Default::default() + }, + ); + file_writer.initialize_with_external_metadata( + schema.clone(), + col_metadatas, + total_rows_in_current, + ); + file_writer.finish().await?; + Ok(()) +} + /// Commit the results of file compaction. /// /// It is not required that all tasks are passed to this method. If some failed, @@ -1081,13 +1842,14 @@ mod tests { use self::remapping::RemappedIndex; use super::*; + use crate::dataset; use crate::dataset::index::frag_reuse::cleanup_frag_reuse_index; use crate::dataset::optimize::remapping::{transpose_row_addrs, transpose_row_ids_from_digest}; use crate::dataset::WriteDestination; use crate::index::frag_reuse::{load_frag_reuse_index_details, open_frag_reuse_index}; use crate::index::vector::{StageParams, VectorIndexParams}; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; - use arrow_array::types::{Float32Type, Int32Type, Int64Type}; + use arrow_array::types::{Float32Type, Float64Type, Int32Type, Int64Type}; use arrow_array::{ ArrayRef, Float32Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, PrimitiveArray, RecordBatch, RecordBatchIterator, @@ -1578,6 +2340,709 @@ mod tests { assert_eq!(fragment_ids, vec![3, 7, 8, 9, 10]); } + #[tokio::test] + async fn test_binary_copy_merge_small_files() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_merge_small_files(version).await; + } + } + + async fn do_test_binary_copy_merge_small_files(version: LanceFileVersion) { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let reader2 = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 2_500, + max_rows_per_group: 1_000, + data_storage_version: Some(version), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, 
Some(write_params.clone())) + .await + .unwrap(); + dataset.append(reader2, Some(write_params)).await.unwrap(); + + let before = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + let debug = dataset.manifest.clone(); + assert!(metrics.fragments_added >= 1); + assert_eq!( + dataset.count_rows(None).await.unwrap() as usize, + before.num_rows() + ); + let after = dataset.scan().try_into_batch().await.unwrap(); + assert_eq!(before, after); + } + + #[tokio::test] + async fn test_binary_copy_with_defer_remap() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_with_defer_remap(version).await; + } + } + + async fn do_test_binary_copy_with_defer_remap(version: LanceFileVersion) { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; + use std::sync::Arc; + + let fixed_list_dt = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4); + + let meta_fields = Fields::from(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int32, true), + Field::new("c", fixed_list_dt.clone(), true), + ]); + + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(8), true), + ]); + + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", DataType::Binary, true), + ]); + + let reader = gen_batch() + .col("vec", array::rand_vec::(Dimension::from(16))) + .col("i", array::step::()) + 
.col("meta", array::rand_struct(meta_fields)) + .col("nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields), true), + ) + .into_reader_rows(RowCount::from(6_000), BatchCount::from(1)); + + let mut dataset = Dataset::write( + reader, + "memory://test/binary_copy_nested", + Some(WriteParams { + max_rows_per_file: 1_000, + data_storage_version: Some(version), + ..Default::default() + }), + ) + .await + .unwrap(); + + let before_batch = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + defer_index_remap: true, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let after_batch = dataset.scan().try_into_batch().await.unwrap(); + + assert_eq!(before_batch, after_batch); + } + + #[tokio::test] + async fn test_binary_copy_with_stable_row_ids_enabled() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_with_stable_row_ids_enabled(version).await; + } + } + + async fn do_test_binary_copy_with_stable_row_ids_enabled(version: LanceFileVersion) { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector}; + let mut data_gen = BatchGenerator::new() + .col(Box::new( + RandomVector::new().vec_width(8).named("vec".to_owned()), + )) + .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); + + let mut dataset = Dataset::write( + data_gen.batch(4_000), + "memory://test/binary_copy_stable_row_ids", + Some(WriteParams { + enable_stable_row_ids: true, + data_storage_version: Some(version), + max_rows_per_file: 500, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("scalar".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); + dataset + 
+ .create_index( + &["vec"], + IndexType::Vector, + Some("vector".into()), + &params, + false, + ) + .await + .unwrap(); + + async fn index_set(dataset: &Dataset) -> HashSet<Uuid> { + dataset + .load_indices() + .await + .unwrap() + .iter() + .map(|index| index.uuid) + .collect() + } + let indices = index_set(&dataset).await; + + async fn vector_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + let query = Float32Array::from(vec![0.0f32; 8]); + scanner + .nearest("vec", &query, 10) + .unwrap() + .project(&["i"]) + .unwrap(); + scanner.try_into_batch().await.unwrap() + } + + async fn scalar_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + scanner.filter("i = 100").unwrap().project(&["i"]).unwrap(); + scanner.try_into_batch().await.unwrap() + } + + let before_vec_result = vector_query(&dataset).await; + let before_scalar_result = scalar_query(&dataset).await; + + let before_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let current_indices = index_set(&dataset).await; + assert_eq!(indices, current_indices); + + let after_vec_result = vector_query(&dataset).await; + assert_eq!(before_vec_result, after_vec_result); + + let after_scalar_result = scalar_query(&dataset).await; + assert_eq!(before_scalar_result, after_scalar_result); + + let after_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap(); + + assert_eq!(before_batch, after_batch); + } + + #[tokio::test] + async fn test_binary_copy_without_stable_row_ids_remap() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_without_stable_row_ids_remap(version).await;
+ } + } + + async fn do_test_binary_copy_without_stable_row_ids_remap(version: LanceFileVersion) { + let mut data_gen = BatchGenerator::new() + .col(Box::new( + RandomVector::new().vec_width(8).named("vec".to_owned()), + )) + .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); + + let mut dataset = Dataset::write( + data_gen.batch(4_000), + "memory://test/binary_copy_no_stable", + Some(WriteParams { + enable_stable_row_ids: false, + data_storage_version: Some(version), + max_rows_per_file: 500, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("scalar".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vector".into()), + &params, + false, + ) + .await + .unwrap(); + + async fn vector_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + let query = Float32Array::from(vec![0.0f32; 8]); + scanner + .nearest("vec", &query, 10) + .unwrap() + .project(&["i"]) + .unwrap(); + scanner.try_into_batch().await.unwrap() + } + + async fn scalar_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + scanner.filter("i = 100").unwrap().project(&["i"]).unwrap(); + scanner.try_into_batch().await.unwrap() + } + + let before_vec_result = vector_query(&dataset).await; + let before_scalar_result = scalar_query(&dataset).await; + let before_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let after_vec_result = vector_query(&dataset).await; + assert_eq!(before_vec_result, after_vec_result); + + let
after_scalar_result = scalar_query(&dataset).await; + assert_eq!(before_scalar_result, after_scalar_result); + + let after_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + assert_eq!(before_batch, after_batch); + } + + #[tokio::test] + async fn test_perf_binary_copy_vs_full() { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; + use std::time::Instant; + + let row_num = 5_000_000; + + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(16), true), + Field::new("bin", DataType::Binary, true), + ]); + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", DataType::Binary, true), + ]); + + let reader_full = gen_batch() + .col("vec1", array::rand_vec::(Dimension::from(12))) + .col("vec2", array::rand_vec::(Dimension::from(8))) + .col("i32", array::step::()) + .col("i64", array::step::()) + .col("f32", array::rand::()) + .col("f64", array::rand::()) + .col("bool", array::rand_boolean()) + .col("date32", array::rand_date32()) + .col("date64", array::rand_date64()) + .col( + "ts_ms", + array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), + ) + .col( + "utf8", + array::rand_utf8(lance_datagen::ByteCount::from(16), false), + ) + .col("large_utf8", array::random_sentence(1, 6, true)) + .col( + "bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), false), + ) + .col( + "large_bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), true), + ) + .col( + "varbin", + array::rand_varbin( + lance_datagen::ByteCount::from(8), + 
lance_datagen::ByteCount::from(32), + ), + ) + .col("fsb16", array::rand_fsb(16)) + .col( + "fsl4", + array::cycle_vec(array::rand::(), Dimension::from(4)), + ) + .col("struct_simple", array::rand_struct(inner_fields.clone())) + .col("struct_nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields.clone()), true), + ) + .into_reader_rows(RowCount::from(row_num), BatchCount::from(10)); + + let full_dir = TempStrDir::default(); + let a = full_dir.as_into_string().into(); + println!("full_dir: {:?}", a); + let mut dataset = Dataset::write( + reader_full, + &*full_dir, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: (row_num / 100) as usize, + ..Default::default() + }), + ) + .await + .unwrap(); + + let opt_full = CompactionOptions { + enable_binary_copy: false, + ..Default::default() + }; + let opt_binary = CompactionOptions { + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + + let t0 = Instant::now(); + let _ = compact_files(&mut dataset, opt_full, None).await.unwrap(); + let d_full = t0.elapsed(); + let before = dataset.count_rows(None).await.unwrap(); + + let versions = dataset.versions().await.unwrap(); + let mut dataset = dataset.checkout_version(1).await.unwrap(); + dataset.restore().await.unwrap(); + let t1 = Instant::now(); + let _ = compact_files(&mut dataset, opt_binary, None).await.unwrap(); + let d_bin = t1.elapsed(); + let after = dataset.count_rows(None).await.unwrap(); + + println!( + "perf: full_compaction={:?}, binary_copy={:?}, speedup={:.2}x", + d_full, + d_bin, + (d_full.as_secs_f64() / d_bin.as_secs_f64()) + ); + + assert_eq!(before, after); + } + + #[tokio::test] + async fn test_can_use_binary_copy_schema_consistency_ok() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader1 = RecordBatchIterator::new(vec![Ok(data.slice(0, 5_000))], data.schema()); + let reader2 
= RecordBatchIterator::new(vec![Ok(data.slice(5_000, 5_000))], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let mut dataset = Dataset::write(reader1, test_uri, Some(write_params.clone())) + .await + .unwrap(); + dataset.append(reader2, Some(write_params)).await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(can_use_binary_copy(&dataset, &options, &frags)); + } + + #[tokio::test] + async fn test_can_use_binary_copy_schema_mismatch() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let mut frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + // Introduce a column index mismatch in the first data file + if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { + if let Some(first) = df.column_indices.get_mut(0) { + *first = -*first - 1; + } else { + df.column_indices.push(-1); + } + } + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + + // Also introduce a version mismatch and ensure rejection + if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { + df.file_minor_version = if df.file_minor_version == 1 { 2 } else { 1 }; + } + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + } + + #[tokio::test] + async fn test_can_use_binary_copy_reject_deletions() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let 
data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + dataset.delete("a < 10").await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + } + + #[tokio::test] + async fn test_binary_copy_fallback_to_common_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 500, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + dataset.delete("a < 100").await.unwrap(); + + let before = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000, + enable_binary_copy: true, + ..Default::default() + }; + + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let after = dataset.scan().try_into_batch().await.unwrap(); + assert_eq!(before, after); + } + + #[tokio::test] + async fn test_binary_copy_compaction_with_complex_schema() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_compaction_with_complex_schema(version).await; + } + } + + async fn do_test_binary_copy_compaction_with_complex_schema(version: LanceFileVersion) { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use 
lance_core::utils::tempfile::TempStrDir; + use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; + + let row_num = 1_000; + + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(16), true), + Field::new("bin", DataType::Binary, true), + ]); + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", DataType::Binary, true), + ]); + + let reader_full = gen_batch() + .col("vec1", array::rand_vec::(Dimension::from(12))) + .col("vec2", array::rand_vec::(Dimension::from(8))) + .col("i32", array::step::()) + .col("i64", array::step::()) + .col("f32", array::rand::()) + .col("f64", array::rand::()) + .col("bool", array::rand_boolean()) + .col("date32", array::rand_date32()) + .col("date64", array::rand_date64()) + .col( + "ts_ms", + array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), + ) + .col( + "utf8", + array::rand_utf8(lance_datagen::ByteCount::from(16), false), + ) + .col("large_utf8", array::random_sentence(1, 6, true)) + .col( + "bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), false), + ) + .col( + "large_bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), true), + ) + .col( + "varbin", + array::rand_varbin( + lance_datagen::ByteCount::from(8), + lance_datagen::ByteCount::from(32), + ), + ) + .col("fsb16", array::rand_fsb(16)) + .col( + "fsl4", + array::cycle_vec(array::rand::(), Dimension::from(4)), + ) + .col("struct_simple", array::rand_struct(inner_fields.clone())) + .col("struct_nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields.clone()), true), + ) + .into_reader_rows(RowCount::from(row_num), 
BatchCount::from(10)); + + let full_dir = TempStrDir::default(); + let mut dataset = Dataset::write( + reader_full, + &*full_dir, + Some(WriteParams { + enable_stable_row_ids: true, + data_storage_version: Some(version), + max_rows_per_file: (row_num / 100) as usize, + ..Default::default() + }), + ) + .await + .unwrap(); + + let opt_full = CompactionOptions { + enable_binary_copy: false, + ..Default::default() + }; + let opt_binary = CompactionOptions { + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + + let _ = compact_files(&mut dataset, opt_full, None).await.unwrap(); + let before = dataset.count_rows(None).await.unwrap(); + let batch_before = dataset.scan().try_into_batch().await.unwrap(); + + let versions = dataset.versions().await.unwrap(); + let mut dataset = dataset.checkout_version(1).await.unwrap(); + + // rollback and trigger another binary copy compaction + dataset.restore().await.unwrap(); + let _ = compact_files(&mut dataset, opt_binary, None).await.unwrap(); + let after = dataset.count_rows(None).await.unwrap(); + let batch_after = dataset.scan().try_into_batch().await.unwrap(); + + assert_eq!(before, after); + assert_eq!(batch_before, batch_after); + } + #[rstest] #[tokio::test] async fn test_compact_data_files( From 79dc8232fc5d9f796f28517e5fd10ec026aa3f56 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 17:04:06 +0800 Subject: [PATCH 02/24] feat: binary copy for compaction --- rust/lance/src/dataset/optimize.rs | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 030a8463cd9..1e9d6ca7c31 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -852,17 +852,6 @@ async fn rewrite_files( // If we aren't using stable row ids, then we need to remap indices. 
let needs_remapping = !dataset.manifest.uses_stable_row_ids(); let mut new_fragments: Vec; - let mut scanner = dataset.scan(); - let has_blob_columns = dataset - .schema() - .fields_pre_order() - .any(|field| field.is_blob()); - if has_blob_columns { - scanner.blob_handling(BlobHandling::AllBinary); - } - if let Some(batch_size) = options.batch_size { - scanner.batch_size(batch_size); - } let task_id = uuid::Uuid::new_v4(); log::info!( "Compaction task {}: Begin compacting {} rows across {} fragments", @@ -2755,6 +2744,7 @@ mod tests { &*full_dir, Some(WriteParams { enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), max_rows_per_file: (row_num / 100) as usize, ..Default::default() }), From 36d0a9c67a47817d2c7f1a9903087730170596f4 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 17:25:10 +0800 Subject: [PATCH 03/24] feat: binary copy for compaction --- rust/lance/benches/binary_copy.rs | 10 ++- rust/lance/src/dataset/optimize.rs | 119 ----------------------------- 2 files changed, 6 insertions(+), 123 deletions(-) diff --git a/rust/lance/benches/binary_copy.rs b/rust/lance/benches/binary_copy.rs index 9808976b59f..d8337e5c4c4 100644 --- a/rust/lance/benches/binary_copy.rs +++ b/rust/lance/benches/binary_copy.rs @@ -12,6 +12,7 @@ use criterion::{criterion_group, criterion_main, Criterion}; use lance::dataset::{optimize::CompactionOptions, Dataset, WriteParams}; use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; use tempfile::TempDir; +use lance_encoding::version::LanceFileVersion; const ROW_NUM: usize = 5_000_000; @@ -24,8 +25,8 @@ fn bench_binary_copy(c: &mut Criterion) { let dataset = Arc::new(dataset); let mut group = c.benchmark_group("binary_copy_compaction"); - group.sample_size(1); - group.measurement_time(Duration::from_secs(600)); + group.sample_size(3); + group.measurement_time(Duration::from_secs(3600)); group.bench_function("full_compaction", |b| { let dataset = dataset.clone(); @@ -118,6 
+119,7 @@ async fn prepare_dataset_on_disk(row_num: usize) -> TempDir { uri, Some(WriteParams { max_rows_per_file: (row_num / 100) as usize, + data_storage_version: Some(LanceFileVersion::V2_2), ..Default::default() }), ) @@ -130,11 +132,11 @@ async fn prepare_dataset_on_disk(row_num: usize) -> TempDir { #[cfg(target_os = "linux")] criterion_group!( name=benches; - config = Criterion::default().significance_level(0.1).sample_size(10); + config = Criterion::default().significance_level(0.1).sample_size(1); targets = bench_binary_copy); #[cfg(not(target_os = "linux"))] criterion_group!( name=benches; - config = Criterion::default().significance_level(0.1).sample_size(10); + config = Criterion::default().significance_level(0.1).sample_size(1); targets = bench_binary_copy); criterion_main!(benches); diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 1e9d6ca7c31..f39746a420b 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -2666,125 +2666,6 @@ mod tests { assert_eq!(before_batch, after_batch); } - #[tokio::test] - async fn test_perf_binary_copy_vs_full() { - use arrow_schema::{DataType, Field, Fields, TimeUnit}; - use lance_core::utils::tempfile::TempStrDir; - use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; - use std::time::Instant; - - let row_num = 5_000_000; - - let inner_fields = Fields::from(vec![ - Field::new("x", DataType::UInt32, true), - Field::new("y", DataType::LargeUtf8, true), - ]); - let nested_fields = Fields::from(vec![ - Field::new("inner", DataType::Struct(inner_fields.clone()), true), - Field::new("fsb", DataType::FixedSizeBinary(16), true), - Field::new("bin", DataType::Binary, true), - ]); - let event_fields = Fields::from(vec![ - Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), - Field::new("payload", DataType::Binary, true), - ]); - - let reader_full = gen_batch() - .col("vec1", array::rand_vec::(Dimension::from(12))) 
- .col("vec2", array::rand_vec::(Dimension::from(8))) - .col("i32", array::step::()) - .col("i64", array::step::()) - .col("f32", array::rand::()) - .col("f64", array::rand::()) - .col("bool", array::rand_boolean()) - .col("date32", array::rand_date32()) - .col("date64", array::rand_date64()) - .col( - "ts_ms", - array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), - ) - .col( - "utf8", - array::rand_utf8(lance_datagen::ByteCount::from(16), false), - ) - .col("large_utf8", array::random_sentence(1, 6, true)) - .col( - "bin", - array::rand_fixedbin(lance_datagen::ByteCount::from(24), false), - ) - .col( - "large_bin", - array::rand_fixedbin(lance_datagen::ByteCount::from(24), true), - ) - .col( - "varbin", - array::rand_varbin( - lance_datagen::ByteCount::from(8), - lance_datagen::ByteCount::from(32), - ), - ) - .col("fsb16", array::rand_fsb(16)) - .col( - "fsl4", - array::cycle_vec(array::rand::(), Dimension::from(4)), - ) - .col("struct_simple", array::rand_struct(inner_fields.clone())) - .col("struct_nested", array::rand_struct(nested_fields)) - .col( - "events", - array::rand_list_any(array::rand_struct(event_fields.clone()), true), - ) - .into_reader_rows(RowCount::from(row_num), BatchCount::from(10)); - - let full_dir = TempStrDir::default(); - let a = full_dir.as_into_string().into(); - println!("full_dir: {:?}", a); - let mut dataset = Dataset::write( - reader_full, - &*full_dir, - Some(WriteParams { - enable_stable_row_ids: true, - data_storage_version: Some(LanceFileVersion::V2_2), - max_rows_per_file: (row_num / 100) as usize, - ..Default::default() - }), - ) - .await - .unwrap(); - - let opt_full = CompactionOptions { - enable_binary_copy: false, - ..Default::default() - }; - let opt_binary = CompactionOptions { - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - - let t0 = Instant::now(); - let _ = compact_files(&mut dataset, opt_full, None).await.unwrap(); - let d_full = t0.elapsed(); - let 
before = dataset.count_rows(None).await.unwrap(); - - let versions = dataset.versions().await.unwrap(); - let mut dataset = dataset.checkout_version(1).await.unwrap(); - dataset.restore().await.unwrap(); - let t1 = Instant::now(); - let _ = compact_files(&mut dataset, opt_binary, None).await.unwrap(); - let d_bin = t1.elapsed(); - let after = dataset.count_rows(None).await.unwrap(); - - println!( - "perf: full_compaction={:?}, binary_copy={:?}, speedup={:.2}x", - d_full, - d_bin, - (d_full.as_secs_f64() / d_bin.as_secs_f64()) - ); - - assert_eq!(before, after); - } - #[tokio::test] async fn test_can_use_binary_copy_schema_consistency_ok() { let test_dir = TempStrDir::default(); From a7711f84165b50bf853c210058fd8709c62bb898 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 17:49:53 +0800 Subject: [PATCH 04/24] feat: binary copy for compaction --- rust/lance/benches/binary_copy.rs | 142 ------------------------------ 1 file changed, 142 deletions(-) delete mode 100644 rust/lance/benches/binary_copy.rs diff --git a/rust/lance/benches/binary_copy.rs b/rust/lance/benches/binary_copy.rs deleted file mode 100644 index d8337e5c4c4..00000000000 --- a/rust/lance/benches/binary_copy.rs +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -#![allow(clippy::print_stdout)] - -use std::sync::Arc; -use std::time::Duration; - -use arrow_array::types::{Float32Type, Float64Type, Int32Type, Int64Type}; -use arrow_schema::{DataType, Field, Fields, TimeUnit}; -use criterion::{criterion_group, criterion_main, Criterion}; -use lance::dataset::{optimize::CompactionOptions, Dataset, WriteParams}; -use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; -use tempfile::TempDir; -use lance_encoding::version::LanceFileVersion; - -const ROW_NUM: usize = 5_000_000; - -fn bench_binary_copy(c: &mut Criterion) { - let rt = tokio::runtime::Runtime::new().unwrap(); - - let temp = 
rt.block_on(prepare_dataset_on_disk(ROW_NUM)); - let dataset_path = temp.path().join("binary-copy-bench.lance"); - let dataset = rt.block_on(async { Dataset::open(dataset_path.to_str().unwrap()).await.unwrap() }); - let dataset = Arc::new(dataset); - - let mut group = c.benchmark_group("binary_copy_compaction"); - group.sample_size(3); - group.measurement_time(Duration::from_secs(3600)); - - group.bench_function("full_compaction", |b| { - let dataset = dataset.clone(); - b.to_async(&rt).iter(move || { - let dataset = dataset.clone(); - async move { - let mut ds = dataset.checkout_version(1).await.unwrap(); - ds.restore().await.unwrap(); - let options = CompactionOptions { enable_binary_copy: false, ..Default::default() }; - let _metrics = lance::dataset::optimize::compact_files(&mut ds, options, None) - .await - .unwrap(); - } - }); - }); - - group.bench_function("binary_copy_compaction", |b| { - let dataset = dataset.clone(); - b.to_async(&rt).iter(move || { - let dataset = dataset.clone(); - async move { - let mut ds = dataset.checkout_version(1).await.unwrap(); - ds.restore().await.unwrap(); - let options = CompactionOptions { enable_binary_copy: true, ..Default::default() }; - let _metrics = lance::dataset::optimize::compact_files(&mut ds, options, None) - .await - .unwrap(); - } - }); - }); - - group.finish(); -} - -async fn prepare_dataset_on_disk(row_num: usize) -> TempDir { - let inner_fields = Fields::from(vec![ - Field::new("x", DataType::UInt32, true), - Field::new("y", DataType::LargeUtf8, true), - ]); - let nested_fields = Fields::from(vec![ - Field::new("inner", DataType::Struct(inner_fields.clone()), true), - Field::new("fsb", DataType::FixedSizeBinary(16), true), - Field::new("bin", DataType::Binary, true), - ]); - let event_fields = Fields::from(vec![ - Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), - Field::new("payload", DataType::Binary, true), - ]); - - let reader = gen_batch() - .col("vec1", 
array::rand_vec::(Dimension::from(12))) - .col("vec2", array::rand_vec::(Dimension::from(8))) - .col("i32", array::step::()) - .col("i64", array::step::()) - .col("f32", array::rand::()) - .col("f64", array::rand::()) - .col("bool", array::rand_boolean()) - .col("date32", array::rand_date32()) - .col("date64", array::rand_date64()) - .col( - "ts_ms", - array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), - ) - .col("utf8", array::rand_utf8(lance_datagen::ByteCount::from(16), false)) - .col("large_utf8", array::random_sentence(1, 6, true)) - .col("bin", array::rand_fixedbin(lance_datagen::ByteCount::from(24), false)) - .col("large_bin", array::rand_fixedbin(lance_datagen::ByteCount::from(24), true)) - .col( - "varbin", - array::rand_varbin( - lance_datagen::ByteCount::from(8), - lance_datagen::ByteCount::from(32), - ), - ) - .col("fsb16", array::rand_fsb(16)) - .col("struct_simple", array::rand_struct(inner_fields.clone())) - .col("struct_nested", array::rand_struct(nested_fields)) - .col( - "events", - array::rand_list_any(array::rand_struct(event_fields.clone()), true), - ) - .into_reader_rows(RowCount::from(row_num as u64), BatchCount::from(10)); - - let tmp = TempDir::new().unwrap(); - let path = tmp.path().join("binary-copy-bench.lance"); - let uri = path.to_str().unwrap(); - - Dataset::write( - reader, - uri, - Some(WriteParams { - max_rows_per_file: (row_num / 100) as usize, - data_storage_version: Some(LanceFileVersion::V2_2), - ..Default::default() - }), - ) - .await - .expect("failed to write dataset"); - - tmp -} - -#[cfg(target_os = "linux")] -criterion_group!( - name=benches; - config = Criterion::default().significance_level(0.1).sample_size(1); - targets = bench_binary_copy); -#[cfg(not(target_os = "linux"))] -criterion_group!( - name=benches; - config = Criterion::default().significance_level(0.1).sample_size(1); - targets = bench_binary_copy); -criterion_main!(benches); From a5bd940b389e2c94035103a6042a42ac17b5d6e7 Mon Sep 17 
00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 17:56:10 +0800 Subject: [PATCH 05/24] feat: binary copy for compaction --- rust/lance/Cargo.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 5c77ca850d0..c422a5bcf45 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -162,9 +162,5 @@ harness = false name = "random_access" harness = false -[[bench]] -name = "binary_copy" -harness = false - [lints] workspace = true From e0bc0f94fabd111ac3c64e12a38ec8a512ea8ec0 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 8 Dec 2025 18:26:06 +0800 Subject: [PATCH 06/24] feat: binary copy for compaction --- rust/lance/src/dataset/optimize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index f39746a420b..198e0a276fa 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -2463,7 +2463,7 @@ mod tests { let mut dataset = Dataset::write( data_gen.batch(4_000), - "memory://test/binary_copy_stable_row_ids", + format!("memory://test/binary_copy_stable_row_ids_{}", version).as_str(), Some(WriteParams { enable_stable_row_ids: true, data_storage_version: Some(version), From c68397434728618d8734b4a8240c61f8269ad9ca Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 9 Dec 2025 11:21:32 +0800 Subject: [PATCH 07/24] feat: binary copy for compaction --- .../src/object_store/providers/aws.rs | 16 ++++++++++---- rust/lance/src/dataset/optimize.rs | 22 ++++++++++++++----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 9bd93bf029a..4ef1cf88e9c 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -214,10 +214,18 @@ async fn resolve_s3_region( client_options = client_options.with_config(*client_key, value.clone()); } 
} - - let bucket_region = - object_store::aws::resolve_bucket_region(bucket, &client_options).await?; - Ok(Some(bucket_region)) + match object_store::aws::resolve_bucket_region(bucket, &client_options).await { + Ok(bucket_region) => Ok(Some(bucket_region)), + Err(e) => { + log::debug!( + "Failed to resolve S3 bucket region for '{}': {:?}; defaulting to provider chain", + bucket, + e + ); + // Fallback to region provider chain; let downstream choose a default + Ok(None) + } + } } else { Ok(None) } diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 198e0a276fa..d77864af539 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -1831,7 +1831,6 @@ mod tests { use self::remapping::RemappedIndex; use super::*; - use crate::dataset; use crate::dataset::index::frag_reuse::cleanup_frag_reuse_index; use crate::dataset::optimize::remapping::{transpose_row_addrs, transpose_row_ids_from_digest}; use crate::dataset::WriteDestination; @@ -2363,7 +2362,6 @@ mod tests { ..Default::default() }; let metrics = compact_files(&mut dataset, options, None).await.unwrap(); - let debug = dataset.manifest.clone(); assert!(metrics.fragments_added >= 1); assert_eq!( dataset.count_rows(None).await.unwrap() as usize, @@ -2562,7 +2560,22 @@ mod tests { .await .unwrap(); - assert_eq!(before_batch, after_batch); + let before_idx = arrow_ord::sort::sort_to_indices( + before_batch.column_by_name(lance_core::ROW_ID).unwrap(), + None, + None, + ) + .unwrap(); + let after_idx = arrow_ord::sort::sort_to_indices( + after_batch.column_by_name(lance_core::ROW_ID).unwrap(), + None, + None, + ) + .unwrap(); + let before = arrow::compute::take_record_batch(&before_batch, &before_idx).unwrap(); + let after = arrow::compute::take_record_batch(&after_batch, &after_idx).unwrap(); + + assert_eq!(before, after); } #[tokio::test] @@ -2705,7 +2718,7 @@ mod tests { max_rows_per_file: 1_000, ..Default::default() }; - let mut dataset = 
Dataset::write(reader, test_uri, Some(write_params)) + let dataset = Dataset::write(reader, test_uri, Some(write_params)) .await .unwrap(); @@ -2901,7 +2914,6 @@ mod tests { let before = dataset.count_rows(None).await.unwrap(); let batch_before = dataset.scan().try_into_batch().await.unwrap(); - let versions = dataset.versions().await.unwrap(); let mut dataset = dataset.checkout_version(1).await.unwrap(); // rollback and trigger another binary copy compaction From 0e6d74b40154a27d7a9a2228227788c270bc56a0 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 11 Dec 2025 17:18:13 +0800 Subject: [PATCH 08/24] code review --- rust/lance-file/src/writer.rs | 9 + .../src/object_store/providers/aws.rs | 16 +- rust/lance-table/src/rowids.rs | 6 + rust/lance/src/dataset/optimize.rs | 163 ++++++++++-------- 4 files changed, 111 insertions(+), 83 deletions(-) diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 9099c3ecb29..a18daa57383 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -485,6 +485,15 @@ impl FileWriter { self.schema_metadata.insert(key.into(), value.into()); } + /// Prepare the writer when column data and metadata were produced externally. + /// + /// This is useful for flows that copy already-encoded pages (e.g., binary copy + /// during compaction) where the column buffers have been written directly and we + /// only need to write the footer and schema metadata. The provided + /// `column_metadata` must describe the buffers already persisted by the + /// underlying `ObjectWriter`, and `rows_written` should reflect the total number + /// of rows in those buffers. Call this on a lazily created writer before + /// invoking [`finish`]. 
pub fn initialize_with_external_metadata( &mut self, schema: lance_core::datatypes::Schema, diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 4ef1cf88e9c..9bd93bf029a 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -214,18 +214,10 @@ async fn resolve_s3_region( client_options = client_options.with_config(*client_key, value.clone()); } } - match object_store::aws::resolve_bucket_region(bucket, &client_options).await { - Ok(bucket_region) => Ok(Some(bucket_region)), - Err(e) => { - log::debug!( - "Failed to resolve S3 bucket region for '{}': {:?}; defaulting to provider chain", - bucket, - e - ); - // Fallback to region provider chain; let downstream choose a default - Ok(None) - } - } + + let bucket_region = + object_store::aws::resolve_bucket_region(bucket, &client_options).await?; + Ok(Some(bucket_region)) } else { Ok(None) } diff --git a/rust/lance-table/src/rowids.rs b/rust/lance-table/src/rowids.rs index 81671e871d3..3434c06dc5d 100644 --- a/rust/lance-table/src/rowids.rs +++ b/rust/lance-table/src/rowids.rs @@ -102,6 +102,12 @@ impl From<&[u64]> for RowIdSequence { } } +impl FromIterator for RowIdSequence { + fn from_iter>(iter: T) -> Self { + Self(vec![U64Segment::from_iter(iter)]) + } +} + impl RowIdSequence { pub fn new() -> Self { Self::default() diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index d77864af539..19029f57f7d 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -176,7 +176,13 @@ pub struct CompactionOptions { /// Whether to enable binary copy optimization when eligible. /// Defaults to false. pub enable_binary_copy: bool, + /// Whether to force binary copy optimization. If true, compaction will fail + /// if binary copy is not supported for the given fragments. + /// Defaults to false. 
pub enable_binary_copy_force: bool, + /// The batch size in bytes for reading during binary copy operations. + /// Controls how much data is read at once when performing binary copy. + /// Defaults to 16MB (16 * 1024 * 1024). pub binary_copy_read_batch_bytes: Option, } @@ -216,7 +222,7 @@ impl CompactionOptions { /// - Fragment list is non-empty /// - All data files share identical Lance file versions /// - No fragment has a deletion file -/// TODO need to support schema evolution case like add column and drop column +/// TODO: Need to support schema evolution case like add column and drop column /// - All data files share identical schema mappings (`fields`, `column_indices`) fn can_use_binary_copy( dataset: &Dataset, @@ -253,13 +259,15 @@ fn can_use_binary_copy( return false; } - // Establish version baseline from first data file - let first_data_file_version = LanceFileVersion::try_from_major_minor( - fragments[0].files[0].file_major_version, - fragments[0].files[0].file_minor_version, - ) - .map(|v| v.resolve()) - .unwrap(); + // Establish version baseline from the dataset manifest + let storage_file_version = match dataset + .manifest + .data_storage_format + .lance_file_version() + { + Ok(version) => version.resolve(), + Err(_) => return false, + }; // Capture schema mapping baseline from first data file let ref_fields = &fragments[0].files[0].fields; let ref_cols = &fragments[0].files[0].column_indices; @@ -279,7 +287,7 @@ fn can_use_binary_copy( data_file.file_minor_version, ) .map(|v| v.resolve()) - .is_ok_and(|v| v == first_data_file_version); + .is_ok_and(|v| v == storage_file_version); if !version_ok { is_same_version = false; @@ -342,7 +350,7 @@ pub async fn compact_files( if compaction_plan.tasks().is_empty() && options.enable_binary_copy_force { return Err(Error::NotSupported { - source: "not execute binary copy compaction task".into(), + source: "cannot execute binary copy compaction task".into(), location: location!(), }); } @@ -866,41 +874,36 @@ 
async fn rewrite_files( location: location!(), }); } - let mut row_ids_rx; + let mut row_ids_rx: Option> = None; + let mut reader: Option = None; - let (reader, rx_initial) = if !can_binary_copy { - prepare_reader( + if !can_binary_copy { + let (prepared_reader, rx_initial) = prepare_reader( dataset.as_ref(), &fragments, options.batch_size, true, needs_remapping, ) - .await? - } else { - prepare_reader( - dataset.as_ref(), - &fragments, - options.batch_size, - false, - false, - ) - .await? - }; - row_ids_rx = rx_initial; - - let mut rows_read = 0; - let schema = reader.schema(); - let reader = reader.inspect_ok(move |batch| { - rows_read += batch.num_rows(); - log::info!( - "Compaction task {}: Read progress {}/{}", - task_id, - rows_read, - num_rows, - ); - }); - let reader = Box::pin(RecordBatchStreamAdapter::new(schema, reader)); + .await?; + row_ids_rx = rx_initial; + + let mut rows_read = 0; + let schema = prepared_reader.schema(); + let reader_with_progress = prepared_reader.inspect_ok(move |batch| { + rows_read += batch.num_rows(); + log::info!( + "Compaction task {}: Read progress {}/{}", + task_id, + rows_read, + num_rows, + ); + }); + reader = Some(Box::pin(RecordBatchStreamAdapter::new( + schema, + reader_with_progress, + ))); + } let mut params = WriteParams { max_rows_per_file: options.target_rows_per_fragment, @@ -932,29 +935,7 @@ async fn rewrite_files( }); } - if new_fragments.is_empty() { - // rollback to common compaction if binary copy not supported - let (reader_fallback, rx_fb) = prepare_reader( - dataset.as_ref(), - &fragments, - options.batch_size, - true, - needs_remapping, - ) - .await?; - row_ids_rx = rx_fb; - let (frags, _) = write_fragments_internal( - Some(dataset.as_ref()), - dataset.object_store.clone(), - &dataset.base, - dataset.schema().clone(), - reader_fallback, - params, - None, - ) - .await?; - new_fragments = frags; - } else if needs_remapping { + if needs_remapping { let (tx, rx) = std::sync::mpsc::channel(); let mut addrs = 
RoaringTreemap::new(); for frag in &fragments { @@ -976,7 +957,7 @@ async fn rewrite_files( dataset.object_store.clone(), &dataset.base, dataset.schema().clone(), - reader, + reader.expect("reader must be prepared for non-binary-copy path"), params, None, ) @@ -1309,10 +1290,10 @@ async fn rewrite_files_binary_copy( .open_file_with_priority(&full_path, 0, &df.file_size_bytes) .await?; let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; - let src_collum_infos = file_meta.column_infos.clone(); + let src_colum_infos = file_meta.column_infos.clone(); // Initialize current_page_table if current_page_table.is_empty() { - current_page_table = src_collum_infos + current_page_table = src_colum_infos .iter() .map(|column_index| ColumnInfo { index: column_index.index, @@ -1326,7 +1307,7 @@ async fn rewrite_files_binary_copy( } // Iterate through each column of the current data file of the current fragment - for (col_idx, src_column_info) in src_collum_infos.iter().enumerate() { + for (col_idx, src_column_info) in src_colum_infos.iter().enumerate() { // v2_0 compatibility: special handling for non-leaf structural header columns // - v2_0 expects structural header columns to have a SINGLE page; they carry layout // metadata only and are not true data carriers. @@ -1474,12 +1455,8 @@ async fn rewrite_files_binary_copy( // Take the subsequence of row IDs corresponding to this file let slice = seq.slice(frag_row_ids_offset as usize, count); - // Materialize the slice into a Vec for conversion - // NOTE: This allocation can be avoided by extending with `slice` directly. 
- let ids_vec: Vec = slice.iter().collect(); - // Append these row IDs to the accumulated sequence for the current output - current_row_ids.extend(RowIdSequence::from(ids_vec.as_slice())); + current_row_ids.extend(slice.iter().into_iter().collect()); // Advance the offset so the next file reads the subsequent row IDs frag_row_ids_offset += count as u64; @@ -1560,7 +1537,7 @@ async fn rewrite_files_binary_copy( } } } - } // Complete the writing of all fragments, except for some data remaining in memory + } // Finished writing all fragments; any remaining data in memory will be flushed below if total_rows_in_current > 0 { // Flush remaining rows as a final output file @@ -2748,6 +2725,50 @@ mod tests { assert!(!can_use_binary_copy(&dataset, &options, &frags)); } + #[tokio::test] + async fn test_can_use_binary_copy_version_mismatch() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 500, + data_storage_version: Some(LanceFileVersion::V2_0), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + // Append additional data and then mark its files as a newer format version (v2.1). + let reader_append = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + + dataset.append(reader_append, None).await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let mut frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!( + frags.len() >= 2, + "expected multiple fragments for version mismatch test" + ); + + // Simulate mixed file versions by marking the second fragment as v2.1. 
+ let (v21_major, v21_minor) = LanceFileVersion::V2_1.to_numbers(); + for file in &mut frags[1].files { + file.file_major_version = v21_major; + file.file_minor_version = v21_minor; + } + + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + } + #[tokio::test] async fn test_can_use_binary_copy_reject_deletions() { let test_dir = TempStrDir::default(); From 041a79b1e339fdd83825ac1b3a5eefbd3fbdc399 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 11 Dec 2025 19:09:34 +0800 Subject: [PATCH 09/24] code review --- rust/lance/src/dataset/optimize.rs | 1177 +---------------- .../lance/src/dataset/optimize/binary_copy.rs | 555 ++++++++ .../src/dataset/optimize/tests/binary_copy.rs | 774 +++++++++++ 3 files changed, 1337 insertions(+), 1169 deletions(-) create mode 100644 rust/lance/src/dataset/optimize/binary_copy.rs create mode 100644 rust/lance/src/dataset/optimize/tests/binary_copy.rs diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 19029f57f7d..40eb5339877 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -98,39 +98,26 @@ use crate::Result; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::SendableRecordBatchStream; use futures::{StreamExt, TryStreamExt}; -use lance_arrow::DataTypeExt; use lance_core::datatypes::BlobHandling; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::utils::tracing::{DATASET_COMPACTING_EVENT, TRACE_DATASET_EVENTS}; use lance_core::Error; use lance_index::frag_reuse::FragReuseGroup; use lance_index::DatasetIndexExt; -use lance_table::format::{DataFile, Fragment, RowIdMeta}; +use lance_table::format::{Fragment, RowIdMeta}; use roaring::{RoaringBitmap, RoaringTreemap}; use serde::{Deserialize, Serialize}; use snafu::location; use tracing::info; +mod binary_copy; pub mod remapping; use super::rowids::load_row_id_sequence; -use 
crate::dataset::fragment::write::generate_random_filename; use crate::index::frag_reuse::build_new_frag_reuse_index; use crate::io::deletion::read_dataset_deletion_file; -use lance_core::datatypes::Schema; -use lance_encoding::decoder::{ColumnInfo, PageEncoding, PageInfo as DecPageInfo}; -use lance_encoding::version::LanceFileVersion; -use lance_file::format::pbfile; -use lance_file::reader::FileReader as LFReader; -use lance_file::writer::{FileWriter, FileWriterOptions}; -use lance_io::object_writer::ObjectWriter; -use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; -use lance_io::traits::Writer; -use lance_table::rowids::{write_row_ids, RowIdSequence}; -use prost::Message; -use prost_types::Any; +use binary_copy::rewrite_files_binary_copy; pub use remapping::{IgnoreRemap, IndexRemapper, IndexRemapperOptions, RemappedIndex}; -use tokio::io::AsyncWriteExt; /// Options to be passed to [compact_files]. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] @@ -260,11 +247,7 @@ fn can_use_binary_copy( } // Establish version baseline from the dataset manifest - let storage_file_version = match dataset - .manifest - .data_storage_format - .lance_file_version() - { + let storage_file_version = match dataset.manifest.data_storage_format.lance_file_version() { Ok(version) => version.resolve(), Err(_) => return false, }; @@ -1195,511 +1178,6 @@ async fn recalc_versions_for_rewritten_fragments( Ok(()) } -async fn rewrite_files_binary_copy( - dataset: &Dataset, - fragments: &[Fragment], - params: &WriteParams, - read_batch_bytes_opt: Option, -) -> Result> { - // Binary copy algorithm overview: - // - Reads page and buffer regions directly from source files in bounded batches - // - Appends them to a new output file with alignment, updating offsets - // - Recomputes page priorities by adding the cumulative row count to preserve order - // - For v2_0, enforces single-page structural header columns when closing a file - // - Writes a new footer (schema descriptor, 
column metadata, offset tables, version) - // - Optionally carries forward stable row ids and persists them inline in fragment metadata - // Merge small Lance files into larger ones by page-level binary copy. - let schema = dataset.schema().clone(); - let full_field_ids = schema.field_ids(); - - // The previous checks have ensured that the file versions of all files are consistent. - let version = LanceFileVersion::try_from_major_minor( - fragments[0].files[0].file_major_version, - fragments[0].files[0].file_minor_version, - ) - .unwrap() - .resolve(); - // v2_0 compatibility: column layout differs across file versions - // - v2_0 materializes BOTH leaf columns and non-leaf structural headers (e.g., Struct / List) - // which means the ColumnInfo set includes all fields in pre-order traversal. - // - v2_1+ materializes ONLY leaf columns. Non-leaf structural headers are not stored as columns. - // As a result, the ColumnInfo set contains leaf fields only. - // To correctly align copy layout, we derive `column_count` by version: - // - v2_0: use total number of fields in pre-order (leaf + non-leaf headers) - // - v2_1+: use only the number of leaf fields - let leaf_count = schema.fields_pre_order().filter(|f| f.is_leaf()).count(); - let column_count = if version == LanceFileVersion::V2_0 { - schema.fields_pre_order().count() - } else { - leaf_count - }; - - // v2_0 compatibility: build a map to identify non-leaf structural header columns - // - In v2_0 these headers exist as columns and must have a single page - // - In v2_1+ these headers are not stored as columns and this map is unused - let mut is_non_leaf_column: Vec = vec![false; column_count]; - if version == LanceFileVersion::V2_0 { - for (col_idx, field) in schema.fields_pre_order().enumerate() { - // Only mark non-packed Struct fields (lists remain as leaf data carriers) - let is_non_leaf = field.data_type().is_struct() && !field.is_packed_struct(); - is_non_leaf_column[col_idx] = is_non_leaf; - } - } - - let 
mut out: Vec = Vec::new(); - let mut current_writer: Option = None; - let mut current_filename: Option = None; - let mut current_pos: u64 = 0; - let mut current_page_table: Vec = Vec::new(); - - // Column-list> - let mut col_pages: Vec> = std::iter::repeat_with(Vec::::new) - .take(column_count) - .collect(); - let mut col_buffers: Vec> = vec![Vec::new(); column_count]; - let mut total_rows_in_current: u64 = 0; - let max_rows_per_file = params.max_rows_per_file as u64; - let uses_stable_row_ids = dataset.manifest.uses_stable_row_ids(); - let mut current_row_ids = RowIdSequence::new(); - - // Align all writes to 64-byte boundaries to honor typical IO alignment and - // keep buffer offsets valid across concatenated pages. - const ALIGN: usize = 64; - static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); - let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); - // Visit each fragment and all of its data files (a fragment may contain multiple files) - for frag in fragments.iter() { - let mut frag_row_ids_offset: u64 = 0; - let frag_row_ids = if uses_stable_row_ids { - Some(load_row_id_sequence(dataset, frag).await?) - } else { - None - }; - for df in frag.files.iter() { - let object_store = if let Some(base_id) = df.base_id { - dataset.object_store_for_base(base_id).await? 
- } else { - dataset.object_store.clone() - }; - let full_path = dataset.data_file_dir(df)?.child(df.path.as_str()); - let scan_scheduler = ScanScheduler::new( - object_store.clone(), - SchedulerConfig::max_bandwidth(&object_store), - ); - let file_scheduler = scan_scheduler - .open_file_with_priority(&full_path, 0, &df.file_size_bytes) - .await?; - let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; - let src_colum_infos = file_meta.column_infos.clone(); - // Initialize current_page_table - if current_page_table.is_empty() { - current_page_table = src_colum_infos - .iter() - .map(|column_index| ColumnInfo { - index: column_index.index, - buffer_offsets_and_sizes: Arc::from( - Vec::<(u64, u64)>::new().into_boxed_slice(), - ), - page_infos: Arc::from(Vec::::new().into_boxed_slice()), - encoding: column_index.encoding.clone(), - }) - .collect(); - } - - // Iterate through each column of the current data file of the current fragment - for (col_idx, src_column_info) in src_colum_infos.iter().enumerate() { - // v2_0 compatibility: special handling for non-leaf structural header columns - // - v2_0 expects structural header columns to have a SINGLE page; they carry layout - // metadata only and are not true data carriers. - // - When merging multiple input files via binary copy, naively appending pages would - // yield multiple pages for the same structural header column, violating v2_0 rules. - // - To preserve v2_0 invariants, we skip pages beyond the first one for these columns. - // - During finalization we also normalize the single remaining page’s `num_rows` to the - // total number of rows in the output file and reset `priority` to 0. - // - For v2_1+ this logic does not apply because non-leaf headers are not stored as columns. 
- let is_non_leaf = col_idx < is_non_leaf_column.len() && is_non_leaf_column[col_idx]; - if is_non_leaf && !col_pages[col_idx].is_empty() { - continue; - } - - if current_writer.is_none() { - let filename = format!("{}.lance", generate_random_filename()); - let path = dataset.base.child(super::DATA_DIR).child(filename.as_str()); - let writer = dataset.object_store.create(&path).await?; - current_writer = Some(writer); - current_filename = Some(filename); - current_pos = 0; - } - - let read_batch_bytes: u64 = read_batch_bytes_opt.unwrap_or(16 * 1024 * 1024) as u64; - - let mut page_index = 0; - - // Iterate through each page of the current column in the current data file of the current fragment - while page_index < src_column_info.page_infos.len() { - let mut batch_ranges: Vec> = Vec::new(); - let mut batch_counts: Vec = Vec::new(); - let mut batch_bytes: u64 = 0; - let mut batch_pages: usize = 0; - // Build a single read batch by coalescing consecutive pages up to - // `read_batch_bytes` budget: - // - Accumulate total bytes (`batch_bytes`) and page count (`batch_pages`). - // - For each page, append its buffer ranges to `batch_ranges` and record - // the number of buffers in `batch_counts` so returned bytes can be - // mapped back to page boundaries. - // - Stop when adding the next page would exceed the byte budget, then - // issue one I/O request for the collected ranges. - // - Advance `page_index` to reflect pages scheduled in this batch. - for current_page in &src_column_info.page_infos[page_index..] 
{ - let page_bytes: u64 = current_page - .buffer_offsets_and_sizes - .iter() - .map(|(_, size)| *size) - .sum(); - let would_exceed = - batch_pages > 0 && (batch_bytes + page_bytes > read_batch_bytes); - if would_exceed { - break; - } - batch_counts.push(current_page.buffer_offsets_and_sizes.len()); - for (offset, size) in current_page.buffer_offsets_and_sizes.iter() { - batch_ranges.push((*offset)..(*offset + *size)); - } - batch_bytes += page_bytes; - batch_pages += 1; - page_index += 1; - } - - let bytes_vec = if batch_ranges.is_empty() { - Vec::new() - } else { - // read many buffers at once - file_scheduler.submit_request(batch_ranges, 0).await? - }; - let mut bytes_iter = bytes_vec.into_iter(); - - for (local_idx, buffer_count) in batch_counts.iter().enumerate() { - // Reconstruct the absolute page index within the source column: - // - `page_index` now points to the page position - // - `batch_pages` is how many pages we included in this batch - // - `local_idx` enumerates pages inside the batch [0..batch_pages) - // Therefore `page_index - batch_pages + local_idx` yields the exact - // source page we are currently materializing, allowing us to access - // its metadata (encoding, row count, buffers) for the new page entry. 
- let page = - &src_column_info.page_infos[page_index - batch_pages + local_idx]; - let mut new_offsets = Vec::with_capacity(*buffer_count); - for _ in 0..*buffer_count { - if let Some(bytes) = bytes_iter.next() { - let writer = current_writer.as_mut().unwrap(); - let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; - if pad != 0 { - writer.write_all(&zero_buf[..pad]).await?; - current_pos += pad as u64; - } - let start = current_pos; - writer.write_all(&bytes).await?; - current_pos += bytes.len() as u64; - new_offsets.push((start, bytes.len() as u64)); - } - } - - // manual clone encoding - let encoding = if page.encoding.is_structural() { - PageEncoding::Structural(page.encoding.as_structural().clone()) - } else { - PageEncoding::Legacy(page.encoding.as_legacy().clone()) - }; - // `priority` acts as the global row offset for this page, ensuring - // downstream iterators maintain the correct logical order across - // merged inputs. - let new_page_info = DecPageInfo { - num_rows: page.num_rows, - priority: page.priority + total_rows_in_current, - encoding, - buffer_offsets_and_sizes: Arc::from(new_offsets.into_boxed_slice()), - }; - col_pages[col_idx].push(new_page_info); - } - } // finished scheduling & copying pages for this column in the current source file - - // Copy column-level buffers (outside page data) with alignment - if !src_column_info.buffer_offsets_and_sizes.is_empty() { - let ranges: Vec> = src_column_info - .buffer_offsets_and_sizes - .iter() - .map(|(offset, size)| (*offset)..(*offset + *size)) - .collect(); - let bytes_vec = file_scheduler.submit_request(ranges, 0).await?; - for bytes in bytes_vec.into_iter() { - let writer = current_writer.as_mut().unwrap(); - let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; - if pad != 0 { - writer.write_all(&zero_buf[..pad]).await?; - current_pos += pad as u64; - } - let start = current_pos; - writer.write_all(&bytes).await?; - current_pos += bytes.len() as u64; - 
col_buffers[col_idx].push((start, bytes.len() as u64)); - } - } - } // finished all columns in the current source file - - if uses_stable_row_ids { - // When stable row IDs are enabled, incorporate the fragment's row IDs - if let Some(seq) = frag_row_ids.as_ref() { - // Number of rows in the current source file - let count = file_meta.num_rows as usize; - - // Take the subsequence of row IDs corresponding to this file - let slice = seq.slice(frag_row_ids_offset as usize, count); - - // Append these row IDs to the accumulated sequence for the current output - current_row_ids.extend(slice.iter().into_iter().collect()); - - // Advance the offset so the next file reads the subsequent row IDs - frag_row_ids_offset += count as u64; - } - } - - // Accumulate rows for the current output file and flush when reaching the threshold - total_rows_in_current += file_meta.num_rows; - if total_rows_in_current >= max_rows_per_file { - // v2_0 compatibility: enforce single-page structural headers before file close - // - We truncate to a single page and rewrite the page’s `num_rows` to match the output - // file’s row count so downstream decoders see a consistent header. 
- let mut final_cols: Vec> = Vec::with_capacity(column_count); - for (i, column_info) in current_page_table.iter().enumerate() { - // For v2_0 struct headers, force a single page and set num_rows to total - let mut pages_vec = std::mem::take(&mut col_pages[i]); - if version == LanceFileVersion::V2_0 - && is_non_leaf_column.get(i).copied().unwrap_or(false) - && !pages_vec.is_empty() - { - pages_vec[0].num_rows = total_rows_in_current; - pages_vec[0].priority = 0; - pages_vec.truncate(1); - } - let pages_arc = Arc::from(pages_vec.into_boxed_slice()); - let buffers_vec = std::mem::take(&mut col_buffers[i]); - final_cols.push(Arc::new(ColumnInfo::new( - column_info.index, - pages_arc, - buffers_vec, - column_info.encoding.clone(), - ))); - } - let writer = current_writer.take().unwrap(); - flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; - - // Register the newly closed output file as a fragment data file - let (maj, min) = version.to_numbers(); - let mut fragment_out = Fragment::new(0); - let mut data_file_out = - DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); - // v2_0 vs v2_1+ field-to-column index mapping - // - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping - // - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index - let is_structural = version >= LanceFileVersion::V2_1; - let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); - let mut curr_col_idx: i32 = 0; - for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.children.is_empty() || !is_structural { - field_column_indices.push(curr_col_idx); - curr_col_idx += 1; - } else { - field_column_indices.push(-1); - } - } - data_file_out.fields = full_field_ids.clone(); - data_file_out.column_indices = field_column_indices; - fragment_out.files.push(data_file_out); - fragment_out.physical_rows = Some(total_rows_in_current as usize); - if uses_stable_row_ids { 
- fragment_out.row_id_meta = - Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); - } - // Reset state for next output file - current_writer = None; - current_pos = 0; - current_page_table.clear(); - for v in col_pages.iter_mut() { - v.clear(); - } - for v in col_buffers.iter_mut() { - v.clear(); - } - out.push(fragment_out); - total_rows_in_current = 0; - if uses_stable_row_ids { - current_row_ids = RowIdSequence::new(); - } - } - } - } // Finished writing all fragments; any remaining data in memory will be flushed below - - if total_rows_in_current > 0 { - // Flush remaining rows as a final output file - // v2_0 compatibility: same single-page enforcement applies for the final file close - let mut final_cols: Vec<Arc<ColumnInfo>> = Vec::with_capacity(column_count); - for (i, ci) in current_page_table.iter().enumerate() { - // For v2_0 struct headers, force a single page and set num_rows to total - let mut pages_vec = std::mem::take(&mut col_pages[i]); - if version == LanceFileVersion::V2_0 - && is_non_leaf_column.get(i).copied().unwrap_or(false) - && !pages_vec.is_empty() - { - pages_vec[0].num_rows = total_rows_in_current; - pages_vec[0].priority = 0; - pages_vec.truncate(1); - } - let pages_arc = Arc::from(pages_vec.into_boxed_slice()); - let buffers_vec = std::mem::take(&mut col_buffers[i]); - final_cols.push(Arc::new(ColumnInfo::new( - ci.index, - pages_arc, - buffers_vec, - ci.encoding.clone(), - ))); - } - if current_writer.is_none() { - let filename = format!("{}.lance", generate_random_filename()); - let path = dataset.base.child(super::DATA_DIR).child(filename.as_str()); - let writer = dataset.object_store.create(&path).await?; - current_writer = Some(writer); - current_filename = Some(filename); - } - let writer = current_writer.take().unwrap(); - flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; - // Register the final file - let (maj, min) = version.to_numbers(); - let mut frag = Fragment::new(0); - let mut df = 
DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); - // v2_0 vs v2_1+ field-to-column index mapping for the final file - let is_structural = version >= LanceFileVersion::V2_1; - let mut field_column_indices: Vec<i32> = Vec::with_capacity(full_field_ids.len()); - let mut curr_col_idx: i32 = 0; - for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.children.is_empty() || !is_structural { - field_column_indices.push(curr_col_idx); - curr_col_idx += 1; - } else { - field_column_indices.push(-1); - } - } - df.fields = full_field_ids.clone(); - df.column_indices = field_column_indices; - frag.files.push(df); - frag.physical_rows = Some(total_rows_in_current as usize); - if uses_stable_row_ids { - frag.row_id_meta = Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); - } - out.push(frag); - } - Ok(out) -} - -/// Finalizes a compacted data file by writing the Lance footer via `FileWriter`. -/// -/// This function does not manually craft the footer. Instead it: -/// - Pads the current `ObjectWriter` position to a 64‑byte boundary (required for v2_1+ readers). -/// - Converts the collected per‑column info (`final_cols`) into `ColumnMetadata`. -/// - Constructs a `lance_file::writer::FileWriter` with the active `schema`, column metadata, -/// and `total_rows_in_current`. -/// - Calls `FileWriter::finish()` to emit column metadata, offset tables, global buffers -/// (schema descriptor), version, and to close the writer. -/// -/// Preconditions: -/// - All page data and column‑level buffers referenced by `final_cols` have already been written -/// to `writer`; otherwise offsets in the footer will be invalid. -/// -/// Version notes: -/// - v2_0 structural single‑page enforcement is handled when building `final_cols`; this function -/// only performs consistent finalization. 
-async fn flush_footer( - mut writer: ObjectWriter, - schema: &Schema, - final_cols: &[Arc], - total_rows_in_current: u64, - version: LanceFileVersion, -) -> Result<()> { - if version >= LanceFileVersion::V2_1 { - const ALIGN: usize = 64; - static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); - let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); - let pos = writer.tell().await? as u64; - let pad = (ALIGN as u64 - (pos % ALIGN as u64)) % ALIGN as u64; - if pad != 0 { - writer.write_all(&zero_buf[..pad as usize]).await?; - } - } - let mut col_metadatas = Vec::with_capacity(final_cols.len()); - for col in final_cols { - let pages = col - .page_infos - .iter() - .map(|page_info| { - let encoded_encoding = match &page_info.encoding { - PageEncoding::Legacy(array_encoding) => { - Any::from_msg(array_encoding)?.encode_to_vec() - } - PageEncoding::Structural(page_layout) => { - Any::from_msg(page_layout)?.encode_to_vec() - } - }; - let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = page_info - .buffer_offsets_and_sizes - .as_ref() - .iter() - .cloned() - .unzip(); - Ok(pbfile::column_metadata::Page { - buffer_offsets, - buffer_sizes, - encoding: Some(pbfile::Encoding { - location: Some(pbfile::encoding::Location::Direct( - pbfile::DirectEncoding { - encoding: encoded_encoding, - }, - )), - }), - length: page_info.num_rows, - priority: page_info.priority, - }) - }) - .collect::>>()?; - let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = - col.buffer_offsets_and_sizes.iter().cloned().unzip(); - let encoded_col_encoding = Any::from_msg(&col.encoding)?.encode_to_vec(); - let column = pbfile::ColumnMetadata { - pages, - buffer_offsets, - buffer_sizes, - encoding: Some(pbfile::Encoding { - location: Some(pbfile::encoding::Location::Direct(pbfile::DirectEncoding { - encoding: encoded_col_encoding, - })), - }), - }; - col_metadatas.push(column); - } - let mut file_writer = FileWriter::new_lazy( - writer, - FileWriterOptions { - format_version: 
Some(version), - ..Default::default() - }, - ); - file_writer.initialize_with_external_metadata( - schema.clone(), - col_metadatas, - total_rows_in_current, - ); - file_writer.finish().await?; - Ok(()) -} - /// Commit the results of file compaction. /// /// It is not required that all tasks are passed to this method. If some failed, @@ -1806,6 +1284,7 @@ pub async fn commit_compaction( #[cfg(test)] mod tests { + mod binary_copy; use self::remapping::RemappedIndex; use super::*; use crate::dataset::index::frag_reuse::cleanup_frag_reuse_index; @@ -1829,7 +1308,9 @@ mod tests { use lance_datagen::Dimension; use lance_file::version::LanceFileVersion; use lance_index::frag_reuse::FRAG_REUSE_INDEX_NAME; - use lance_index::scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams}; + use lance_index::scalar::{ + BuiltinIndexType, FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams, + }; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::pq::PQBuildParams; use lance_index::{Index, IndexType}; @@ -2305,648 +1786,6 @@ mod tests { assert_eq!(fragment_ids, vec![3, 7, 8, 9, 10]); } - #[tokio::test] - async fn test_binary_copy_merge_small_files() { - for version in LanceFileVersion::iter_non_legacy() { - do_test_binary_copy_merge_small_files(version).await; - } - } - - async fn do_test_binary_copy_merge_small_files(version: LanceFileVersion) { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - - let data = sample_data(); - let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let reader2 = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 2_500, - max_rows_per_group: 1_000, - data_storage_version: Some(version), - ..Default::default() - }; - let mut dataset = Dataset::write(reader, test_uri, Some(write_params.clone())) - .await - .unwrap(); - dataset.append(reader2, Some(write_params)).await.unwrap(); - - let before = 
dataset.scan().try_into_batch().await.unwrap(); - - let options = CompactionOptions { - target_rows_per_fragment: 100_000_000, - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - let metrics = compact_files(&mut dataset, options, None).await.unwrap(); - assert!(metrics.fragments_added >= 1); - assert_eq!( - dataset.count_rows(None).await.unwrap() as usize, - before.num_rows() - ); - let after = dataset.scan().try_into_batch().await.unwrap(); - assert_eq!(before, after); - } - - #[tokio::test] - async fn test_binary_copy_with_defer_remap() { - for version in LanceFileVersion::iter_non_legacy() { - do_test_binary_copy_with_defer_remap(version).await; - } - } - - async fn do_test_binary_copy_with_defer_remap(version: LanceFileVersion) { - use arrow_schema::{DataType, Field, Fields, TimeUnit}; - use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; - use std::sync::Arc; - - let fixed_list_dt = - DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4); - - let meta_fields = Fields::from(vec![ - Field::new("a", DataType::Utf8, true), - Field::new("b", DataType::Int32, true), - Field::new("c", fixed_list_dt.clone(), true), - ]); - - let inner_fields = Fields::from(vec![ - Field::new("x", DataType::UInt32, true), - Field::new("y", DataType::LargeUtf8, true), - ]); - let nested_fields = Fields::from(vec![ - Field::new("inner", DataType::Struct(inner_fields.clone()), true), - Field::new("fsb", DataType::FixedSizeBinary(8), true), - ]); - - let event_fields = Fields::from(vec![ - Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), - Field::new("payload", DataType::Binary, true), - ]); - - let reader = gen_batch() - .col("vec", array::rand_vec::(Dimension::from(16))) - .col("i", array::step::()) - .col("meta", array::rand_struct(meta_fields)) - .col("nested", array::rand_struct(nested_fields)) - .col( - "events", - array::rand_list_any(array::rand_struct(event_fields), 
true), - ) - .into_reader_rows(RowCount::from(6_000), BatchCount::from(1)); - - let mut dataset = Dataset::write( - reader, - "memory://test/binary_copy_nested", - Some(WriteParams { - max_rows_per_file: 1_000, - data_storage_version: Some(version), - ..Default::default() - }), - ) - .await - .unwrap(); - - let before_batch = dataset.scan().try_into_batch().await.unwrap(); - - let options = CompactionOptions { - defer_index_remap: true, - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); - - let after_batch = dataset.scan().try_into_batch().await.unwrap(); - - assert_eq!(before_batch, after_batch); - } - - #[tokio::test] - async fn test_binary_copy_with_stable_row_ids_enabled() { - for version in LanceFileVersion::iter_non_legacy() { - do_test_binary_copy_with_stable_row_ids_enabled(version).await; - } - } - - async fn do_test_binary_copy_with_stable_row_ids_enabled(version: LanceFileVersion) { - use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector}; - let mut data_gen = BatchGenerator::new() - .col(Box::new( - RandomVector::new().vec_width(8).named("vec".to_owned()), - )) - .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); - - let mut dataset = Dataset::write( - data_gen.batch(4_000), - format!("memory://test/binary_copy_stable_row_ids_{}", version).as_str(), - Some(WriteParams { - enable_stable_row_ids: true, - data_storage_version: Some(version), - max_rows_per_file: 500, - ..Default::default() - }), - ) - .await - .unwrap(); - - dataset - .create_index( - &["i"], - IndexType::Scalar, - Some("scalar".into()), - &ScalarIndexParams::default(), - false, - ) - .await - .unwrap(); - let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); - dataset - .create_index( - &["vec"], - IndexType::Vector, - Some("vector".into()), - &params, - false, - ) - .await - .unwrap(); - - async fn index_set(dataset: &Dataset) 
-> HashSet { - dataset - .load_indices() - .await - .unwrap() - .iter() - .map(|index| index.uuid) - .collect() - } - let indices = index_set(&dataset).await; - - async fn vector_query(dataset: &Dataset) -> RecordBatch { - let mut scanner = dataset.scan(); - let query = Float32Array::from(vec![0.0f32; 8]); - scanner - .nearest("vec", &query, 10) - .unwrap() - .project(&["i"]) - .unwrap(); - scanner.try_into_batch().await.unwrap() - } - - async fn scalar_query(dataset: &Dataset) -> RecordBatch { - let mut scanner = dataset.scan(); - scanner.filter("i = 100").unwrap().project(&["i"]).unwrap(); - scanner.try_into_batch().await.unwrap() - } - - let before_vec_result = vector_query(&dataset).await; - let before_scalar_result = scalar_query(&dataset).await; - - let before_batch = dataset - .scan() - .project(&["vec", "i"]) - .unwrap() - .with_row_id() - .try_into_batch() - .await - .unwrap(); - - let options = CompactionOptions { - target_rows_per_fragment: 2_000, - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); - - let current_indices = index_set(&dataset).await; - assert_eq!(indices, current_indices); - - let after_vec_result = vector_query(&dataset).await; - assert_eq!(before_vec_result, after_vec_result); - - let after_scalar_result = scalar_query(&dataset).await; - assert_eq!(before_scalar_result, after_scalar_result); - - let after_batch = dataset - .scan() - .project(&["vec", "i"]) - .unwrap() - .with_row_id() - .try_into_batch() - .await - .unwrap(); - - let before_idx = arrow_ord::sort::sort_to_indices( - before_batch.column_by_name(lance_core::ROW_ID).unwrap(), - None, - None, - ) - .unwrap(); - let after_idx = arrow_ord::sort::sort_to_indices( - after_batch.column_by_name(lance_core::ROW_ID).unwrap(), - None, - None, - ) - .unwrap(); - let before = arrow::compute::take_record_batch(&before_batch, &before_idx).unwrap(); - let after = 
arrow::compute::take_record_batch(&after_batch, &after_idx).unwrap(); - - assert_eq!(before, after); - } - - #[tokio::test] - async fn test_binary_copy_without_stable_row_ids_remap() { - for version in LanceFileVersion::iter_non_legacy() { - do_test_binary_copy_without_stable_row_ids_remap(version).await; - } - } - - async fn do_test_binary_copy_without_stable_row_ids_remap(version: LanceFileVersion) { - let mut data_gen = BatchGenerator::new() - .col(Box::new( - RandomVector::new().vec_width(8).named("vec".to_owned()), - )) - .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); - - let mut dataset = Dataset::write( - data_gen.batch(4_000), - "memory://test/binary_copy_no_stable", - Some(WriteParams { - enable_stable_row_ids: false, - data_storage_version: Some(version), - max_rows_per_file: 500, - ..Default::default() - }), - ) - .await - .unwrap(); - - dataset - .create_index( - &["i"], - IndexType::Scalar, - Some("scalar".into()), - &ScalarIndexParams::default(), - false, - ) - .await - .unwrap(); - let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); - dataset - .create_index( - &["vec"], - IndexType::Vector, - Some("vector".into()), - &params, - false, - ) - .await - .unwrap(); - - async fn vector_query(dataset: &Dataset) -> RecordBatch { - let mut scanner = dataset.scan(); - let query = Float32Array::from(vec![0.0f32; 8]); - scanner - .nearest("vec", &query, 10) - .unwrap() - .project(&["i"]) - .unwrap(); - scanner.try_into_batch().await.unwrap() - } - - async fn scalar_query(dataset: &Dataset) -> RecordBatch { - let mut scanner = dataset.scan(); - scanner.filter("i = 100").unwrap().project(&["i"]).unwrap(); - scanner.try_into_batch().await.unwrap() - } - - let before_vec_result = vector_query(&dataset).await; - let before_scalar_result = scalar_query(&dataset).await; - let before_batch = dataset - .scan() - .project(&["vec", "i"]) - .unwrap() - .try_into_batch() - .await - .unwrap(); - - let options = CompactionOptions { - 
target_rows_per_fragment: 2_000, - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); - - let after_vec_result = vector_query(&dataset).await; - assert_eq!(before_vec_result, after_vec_result); - - let after_scalar_result = scalar_query(&dataset).await; - assert_eq!(before_scalar_result, after_scalar_result); - - let after_batch = dataset - .scan() - .project(&["vec", "i"]) - .unwrap() - .try_into_batch() - .await - .unwrap(); - - assert_eq!(before_batch, after_batch); - } - - #[tokio::test] - async fn test_can_use_binary_copy_schema_consistency_ok() { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - let data = sample_data(); - let reader1 = RecordBatchIterator::new(vec![Ok(data.slice(0, 5_000))], data.schema()); - let reader2 = RecordBatchIterator::new(vec![Ok(data.slice(5_000, 5_000))], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 1_000, - ..Default::default() - }; - let mut dataset = Dataset::write(reader1, test_uri, Some(write_params.clone())) - .await - .unwrap(); - dataset.append(reader2, Some(write_params)).await.unwrap(); - - let options = CompactionOptions { - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - let frags: Vec = dataset - .get_fragments() - .into_iter() - .map(Into::into) - .collect(); - assert!(can_use_binary_copy(&dataset, &options, &frags)); - } - - #[tokio::test] - async fn test_can_use_binary_copy_schema_mismatch() { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - let data = sample_data(); - let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 1_000, - ..Default::default() - }; - let dataset = Dataset::write(reader, test_uri, Some(write_params)) - .await - .unwrap(); - - let options = CompactionOptions { - enable_binary_copy: true, - 
..Default::default() - }; - let mut frags: Vec = dataset - .get_fragments() - .into_iter() - .map(Into::into) - .collect(); - // Introduce a column index mismatch in the first data file - if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { - if let Some(first) = df.column_indices.get_mut(0) { - *first = -*first - 1; - } else { - df.column_indices.push(-1); - } - } - assert!(!can_use_binary_copy(&dataset, &options, &frags)); - - // Also introduce a version mismatch and ensure rejection - if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { - df.file_minor_version = if df.file_minor_version == 1 { 2 } else { 1 }; - } - assert!(!can_use_binary_copy(&dataset, &options, &frags)); - } - - #[tokio::test] - async fn test_can_use_binary_copy_version_mismatch() { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - let data = sample_data(); - let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 500, - data_storage_version: Some(LanceFileVersion::V2_0), - ..Default::default() - }; - let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) - .await - .unwrap(); - - // Append additional data and then mark its files as a newer format version (v2.1). - let reader_append = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - - dataset.append(reader_append, None).await.unwrap(); - - let options = CompactionOptions { - enable_binary_copy: true, - ..Default::default() - }; - let mut frags: Vec = dataset - .get_fragments() - .into_iter() - .map(Into::into) - .collect(); - assert!( - frags.len() >= 2, - "expected multiple fragments for version mismatch test" - ); - - // Simulate mixed file versions by marking the second fragment as v2.1. 
- let (v21_major, v21_minor) = LanceFileVersion::V2_1.to_numbers(); - for file in &mut frags[1].files { - file.file_major_version = v21_major; - file.file_minor_version = v21_minor; - } - - assert!(!can_use_binary_copy(&dataset, &options, &frags)); - } - - #[tokio::test] - async fn test_can_use_binary_copy_reject_deletions() { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - let data = sample_data(); - let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 1_000, - ..Default::default() - }; - let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) - .await - .unwrap(); - dataset.delete("a < 10").await.unwrap(); - - let options = CompactionOptions { - enable_binary_copy: true, - ..Default::default() - }; - let frags: Vec = dataset - .get_fragments() - .into_iter() - .map(Into::into) - .collect(); - assert!(!can_use_binary_copy(&dataset, &options, &frags)); - } - - #[tokio::test] - async fn test_binary_copy_fallback_to_common_compaction() { - let test_dir = TempStrDir::default(); - let test_uri = &test_dir; - let data = sample_data(); - let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); - let write_params = WriteParams { - max_rows_per_file: 500, - ..Default::default() - }; - let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) - .await - .unwrap(); - dataset.delete("a < 100").await.unwrap(); - - let before = dataset.scan().try_into_batch().await.unwrap(); - - let options = CompactionOptions { - target_rows_per_fragment: 100_000, - enable_binary_copy: true, - ..Default::default() - }; - - let frags: Vec = dataset - .get_fragments() - .into_iter() - .map(Into::into) - .collect(); - assert!(!can_use_binary_copy(&dataset, &options, &frags)); - - let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); - - let after = dataset.scan().try_into_batch().await.unwrap(); - assert_eq!(before, after); - } 
- - #[tokio::test] - async fn test_binary_copy_compaction_with_complex_schema() { - for version in LanceFileVersion::iter_non_legacy() { - do_test_binary_copy_compaction_with_complex_schema(version).await; - } - } - - async fn do_test_binary_copy_compaction_with_complex_schema(version: LanceFileVersion) { - use arrow_schema::{DataType, Field, Fields, TimeUnit}; - use lance_core::utils::tempfile::TempStrDir; - use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; - - let row_num = 1_000; - - let inner_fields = Fields::from(vec![ - Field::new("x", DataType::UInt32, true), - Field::new("y", DataType::LargeUtf8, true), - ]); - let nested_fields = Fields::from(vec![ - Field::new("inner", DataType::Struct(inner_fields.clone()), true), - Field::new("fsb", DataType::FixedSizeBinary(16), true), - Field::new("bin", DataType::Binary, true), - ]); - let event_fields = Fields::from(vec![ - Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), - Field::new("payload", DataType::Binary, true), - ]); - - let reader_full = gen_batch() - .col("vec1", array::rand_vec::(Dimension::from(12))) - .col("vec2", array::rand_vec::(Dimension::from(8))) - .col("i32", array::step::()) - .col("i64", array::step::()) - .col("f32", array::rand::()) - .col("f64", array::rand::()) - .col("bool", array::rand_boolean()) - .col("date32", array::rand_date32()) - .col("date64", array::rand_date64()) - .col( - "ts_ms", - array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), - ) - .col( - "utf8", - array::rand_utf8(lance_datagen::ByteCount::from(16), false), - ) - .col("large_utf8", array::random_sentence(1, 6, true)) - .col( - "bin", - array::rand_fixedbin(lance_datagen::ByteCount::from(24), false), - ) - .col( - "large_bin", - array::rand_fixedbin(lance_datagen::ByteCount::from(24), true), - ) - .col( - "varbin", - array::rand_varbin( - lance_datagen::ByteCount::from(8), - lance_datagen::ByteCount::from(32), - ), - ) - .col("fsb16", 
array::rand_fsb(16)) - .col( - "fsl4", - array::cycle_vec(array::rand::(), Dimension::from(4)), - ) - .col("struct_simple", array::rand_struct(inner_fields.clone())) - .col("struct_nested", array::rand_struct(nested_fields)) - .col( - "events", - array::rand_list_any(array::rand_struct(event_fields.clone()), true), - ) - .into_reader_rows(RowCount::from(row_num), BatchCount::from(10)); - - let full_dir = TempStrDir::default(); - let mut dataset = Dataset::write( - reader_full, - &*full_dir, - Some(WriteParams { - enable_stable_row_ids: true, - data_storage_version: Some(version), - max_rows_per_file: (row_num / 100) as usize, - ..Default::default() - }), - ) - .await - .unwrap(); - - let opt_full = CompactionOptions { - enable_binary_copy: false, - ..Default::default() - }; - let opt_binary = CompactionOptions { - enable_binary_copy: true, - enable_binary_copy_force: true, - ..Default::default() - }; - - let _ = compact_files(&mut dataset, opt_full, None).await.unwrap(); - let before = dataset.count_rows(None).await.unwrap(); - let batch_before = dataset.scan().try_into_batch().await.unwrap(); - - let mut dataset = dataset.checkout_version(1).await.unwrap(); - - // rollback and trigger another binary copy compaction - dataset.restore().await.unwrap(); - let _ = compact_files(&mut dataset, opt_binary, None).await.unwrap(); - let after = dataset.count_rows(None).await.unwrap(); - let batch_after = dataset.scan().try_into_batch().await.unwrap(); - - assert_eq!(before, after); - assert_eq!(batch_before, batch_after); - } - #[rstest] #[tokio::test] async fn test_compact_data_files( diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs new file mode 100644 index 00000000000..304db331112 --- /dev/null +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use 
crate::dataset::fragment::write::generate_random_filename; +use crate::dataset::optimize::load_row_id_sequence; +use crate::dataset::WriteParams; +use crate::dataset::DATA_DIR; +use crate::datatypes::Schema; +use crate::Dataset; +use crate::Result; +use lance_arrow::DataTypeExt; +use lance_encoding::decoder::{ColumnInfo, PageEncoding, PageInfo as DecPageInfo}; +use lance_encoding::version::LanceFileVersion; +use lance_file::format::pbfile; +use lance_file::reader::FileReader as LFReader; +use lance_file::writer::{FileWriter, FileWriterOptions}; +use lance_io::object_writer::ObjectWriter; +use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; +use lance_io::traits::Writer; +use lance_table::format::{DataFile, Fragment, RowIdMeta}; +use lance_table::rowids::{write_row_ids, RowIdSequence}; +use prost::Message; +use prost_types::Any; +use std::ops::Range; +use std::sync::Arc; +use tokio::io::AsyncWriteExt; + +/// Rewrite the files in a single task using binary copy semantics. +/// +/// Flow overview (per task): +/// fragments +/// └── data files +/// └── columns +/// └── pages (batched reads) -> aligned writes -> page metadata +/// └── column buffers -> aligned writes -> buffer metadata +/// └── flush when target rows reached -> write footer -> fragment metadata +/// └── final flush for remaining rows +/// +/// Behavior highlights: +/// - Assumes all input files share the same Lance file version; version drives column-count +/// calculation (v2.0 includes structural headers, v2.1+ only leaf columns). +/// - Preserves stable row ids by concatenating row-id sequences when enabled. +/// - Enforces 64-byte alignment for page and buffer writes to satisfy downstream readers. +/// - For v2.0, preserves single-page structural headers and normalizes their row counts/priority. +/// - Flushes an output file once `max_rows_per_file` rows are accumulated, then repeats. +/// +/// Parameters: +/// - `dataset`: target dataset (for storage/config and schema). 
+/// - `fragments`: fragments to merge via binary copy (assumed consistent versions). +/// - `params`: write parameters (uses `max_rows_per_file`). +/// - `read_batch_bytes_opt`: optional I/O batch size when coalescing page reads. +pub async fn rewrite_files_binary_copy( + dataset: &Dataset, + fragments: &[Fragment], + params: &WriteParams, + read_batch_bytes_opt: Option, +) -> Result> { + // Binary copy algorithm overview: + // - Reads page and buffer regions directly from source files in bounded batches + // - Appends them to a new output file with alignment, updating offsets + // - Recomputes page priorities by adding the cumulative row count to preserve order + // - For v2_0, enforces single-page structural header columns when closing a file + // - Writes a new footer (schema descriptor, column metadata, offset tables, version) + // - Optionally carries forward stable row ids and persists them inline in fragment metadata + // Merge small Lance files into larger ones by page-level binary copy. + let schema = dataset.schema().clone(); + let full_field_ids = schema.field_ids(); + + // The previous checks have ensured that the file versions of all files are consistent. + let version = LanceFileVersion::try_from_major_minor( + fragments[0].files[0].file_major_version, + fragments[0].files[0].file_minor_version, + ) + .unwrap() + .resolve(); + // v2_0 compatibility: column layout differs across file versions + // - v2_0 materializes BOTH leaf columns and non-leaf structural headers (e.g., Struct / List) + // which means the ColumnInfo set includes all fields in pre-order traversal. + // - v2_1+ materializes ONLY leaf columns. Non-leaf structural headers are not stored as columns. + // As a result, the ColumnInfo set contains leaf fields only. 
+ // To correctly align copy layout, we derive `column_count` by version: + // - v2_0: use total number of fields in pre-order (leaf + non-leaf headers) + // - v2_1+: use only the number of leaf fields + let leaf_count = schema.fields_pre_order().filter(|f| f.is_leaf()).count(); + let column_count = if version == LanceFileVersion::V2_0 { + schema.fields_pre_order().count() + } else { + leaf_count + }; + + // v2_0 compatibility: build a map to identify non-leaf structural header columns + // - In v2_0 these headers exist as columns and must have a single page + // - In v2_1+ these headers are not stored as columns and this map is unused + let mut is_non_leaf_column: Vec = vec![false; column_count]; + if version == LanceFileVersion::V2_0 { + for (col_idx, field) in schema.fields_pre_order().enumerate() { + // Only mark non-packed Struct fields (lists remain as leaf data carriers) + let is_non_leaf = field.data_type().is_struct() && !field.is_packed_struct(); + is_non_leaf_column[col_idx] = is_non_leaf; + } + } + + let mut out: Vec = Vec::new(); + let mut current_writer: Option = None; + let mut current_filename: Option = None; + let mut current_pos: u64 = 0; + let mut current_page_table: Vec = Vec::new(); + + // Column-list> + let mut col_pages: Vec> = std::iter::repeat_with(Vec::::new) + .take(column_count) + .collect(); + let mut col_buffers: Vec> = vec![Vec::new(); column_count]; + let mut total_rows_in_current: u64 = 0; + let max_rows_per_file = params.max_rows_per_file as u64; + let uses_stable_row_ids = dataset.manifest.uses_stable_row_ids(); + let mut current_row_ids = RowIdSequence::new(); + + // Align all writes to 64-byte boundaries to honor typical IO alignment and + // keep buffer offsets valid across concatenated pages. 
+ const ALIGN: usize = 64; + static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); + let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); + // Visit each fragment and all of its data files (a fragment may contain multiple files) + for frag in fragments.iter() { + let mut frag_row_ids_offset: u64 = 0; + let frag_row_ids = if uses_stable_row_ids { + Some(load_row_id_sequence(dataset, frag).await?) + } else { + None + }; + for df in frag.files.iter() { + let object_store = if let Some(base_id) = df.base_id { + dataset.object_store_for_base(base_id).await? + } else { + dataset.object_store.clone() + }; + let full_path = dataset.data_file_dir(df)?.child(df.path.as_str()); + let scan_scheduler = ScanScheduler::new( + object_store.clone(), + SchedulerConfig::max_bandwidth(&object_store), + ); + let file_scheduler = scan_scheduler + .open_file_with_priority(&full_path, 0, &df.file_size_bytes) + .await?; + let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; + let src_colum_infos = file_meta.column_infos.clone(); + // Initialize current_page_table + if current_page_table.is_empty() { + current_page_table = src_colum_infos + .iter() + .map(|column_index| ColumnInfo { + index: column_index.index, + buffer_offsets_and_sizes: Arc::from( + Vec::<(u64, u64)>::new().into_boxed_slice(), + ), + page_infos: Arc::from(Vec::::new().into_boxed_slice()), + encoding: column_index.encoding.clone(), + }) + .collect(); + } + + // Iterate through each column of the current data file of the current fragment + for (col_idx, src_column_info) in src_colum_infos.iter().enumerate() { + // v2_0 compatibility: special handling for non-leaf structural header columns + // - v2_0 expects structural header columns to have a SINGLE page; they carry layout + // metadata only and are not true data carriers. 
+ // - When merging multiple input files via binary copy, naively appending pages would + // yield multiple pages for the same structural header column, violating v2_0 rules. + // - To preserve v2_0 invariants, we skip pages beyond the first one for these columns. + // - During finalization we also normalize the single remaining page’s `num_rows` to the + // total number of rows in the output file and reset `priority` to 0. + // - For v2_1+ this logic does not apply because non-leaf headers are not stored as columns. + let is_non_leaf = col_idx < is_non_leaf_column.len() && is_non_leaf_column[col_idx]; + if is_non_leaf && !col_pages[col_idx].is_empty() { + continue; + } + + if current_writer.is_none() { + let filename = format!("{}.lance", generate_random_filename()); + let path = dataset.base.child(DATA_DIR).child(filename.as_str()); + let writer = dataset.object_store.create(&path).await?; + current_writer = Some(writer); + current_filename = Some(filename); + current_pos = 0; + } + + let read_batch_bytes: u64 = read_batch_bytes_opt.unwrap_or(16 * 1024 * 1024) as u64; + + let mut page_index = 0; + + // Iterate through each page of the current column in the current data file of the current fragment + while page_index < src_column_info.page_infos.len() { + let mut batch_ranges: Vec> = Vec::new(); + let mut batch_counts: Vec = Vec::new(); + let mut batch_bytes: u64 = 0; + let mut batch_pages: usize = 0; + // Build a single read batch by coalescing consecutive pages up to + // `read_batch_bytes` budget: + // - Accumulate total bytes (`batch_bytes`) and page count (`batch_pages`). + // - For each page, append its buffer ranges to `batch_ranges` and record + // the number of buffers in `batch_counts` so returned bytes can be + // mapped back to page boundaries. + // - Stop when adding the next page would exceed the byte budget, then + // issue one I/O request for the collected ranges. + // - Advance `page_index` to reflect pages scheduled in this batch. 
+ for current_page in &src_column_info.page_infos[page_index..] { + let page_bytes: u64 = current_page + .buffer_offsets_and_sizes + .iter() + .map(|(_, size)| *size) + .sum(); + let would_exceed = + batch_pages > 0 && (batch_bytes + page_bytes > read_batch_bytes); + if would_exceed { + break; + } + batch_counts.push(current_page.buffer_offsets_and_sizes.len()); + for (offset, size) in current_page.buffer_offsets_and_sizes.iter() { + batch_ranges.push((*offset)..(*offset + *size)); + } + batch_bytes += page_bytes; + batch_pages += 1; + page_index += 1; + } + + let bytes_vec = if batch_ranges.is_empty() { + Vec::new() + } else { + // read many buffers at once + file_scheduler.submit_request(batch_ranges, 0).await? + }; + let mut bytes_iter = bytes_vec.into_iter(); + + for (local_idx, buffer_count) in batch_counts.iter().enumerate() { + // Reconstruct the absolute page index within the source column: + // - `page_index` now points to the page position + // - `batch_pages` is how many pages we included in this batch + // - `local_idx` enumerates pages inside the batch [0..batch_pages) + // Therefore `page_index - batch_pages + local_idx` yields the exact + // source page we are currently materializing, allowing us to access + // its metadata (encoding, row count, buffers) for the new page entry. 
+ let page = + &src_column_info.page_infos[page_index - batch_pages + local_idx]; + let mut new_offsets = Vec::with_capacity(*buffer_count); + for _ in 0..*buffer_count { + if let Some(bytes) = bytes_iter.next() { + let writer = current_writer.as_mut().unwrap(); + let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; + if pad != 0 { + writer.write_all(&zero_buf[..pad]).await?; + current_pos += pad as u64; + } + let start = current_pos; + writer.write_all(&bytes).await?; + current_pos += bytes.len() as u64; + new_offsets.push((start, bytes.len() as u64)); + } + } + + // manual clone encoding + let encoding = if page.encoding.is_structural() { + PageEncoding::Structural(page.encoding.as_structural().clone()) + } else { + PageEncoding::Legacy(page.encoding.as_legacy().clone()) + }; + // `priority` acts as the global row offset for this page, ensuring + // downstream iterators maintain the correct logical order across + // merged inputs. + let new_page_info = DecPageInfo { + num_rows: page.num_rows, + priority: page.priority + total_rows_in_current, + encoding, + buffer_offsets_and_sizes: Arc::from(new_offsets.into_boxed_slice()), + }; + col_pages[col_idx].push(new_page_info); + } + } // finished scheduling & copying pages for this column in the current source file + + // Copy column-level buffers (outside page data) with alignment + if !src_column_info.buffer_offsets_and_sizes.is_empty() { + let ranges: Vec> = src_column_info + .buffer_offsets_and_sizes + .iter() + .map(|(offset, size)| (*offset)..(*offset + *size)) + .collect(); + let bytes_vec = file_scheduler.submit_request(ranges, 0).await?; + for bytes in bytes_vec.into_iter() { + let writer = current_writer.as_mut().unwrap(); + let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; + if pad != 0 { + writer.write_all(&zero_buf[..pad]).await?; + current_pos += pad as u64; + } + let start = current_pos; + writer.write_all(&bytes).await?; + current_pos += bytes.len() as u64; + 
col_buffers[col_idx].push((start, bytes.len() as u64)); + } + } + } // finished all columns in the current source file + + if uses_stable_row_ids { + // When stable row IDs are enabled, incorporate the fragment's row IDs + if let Some(seq) = frag_row_ids.as_ref() { + // Number of rows in the current source file + let count = file_meta.num_rows as usize; + + // Take the subsequence of row IDs corresponding to this file + let slice = seq.slice(frag_row_ids_offset as usize, count); + + // Append these row IDs to the accumulated sequence for the current output + current_row_ids.extend(slice.iter().collect()); + + // Advance the offset so the next file reads the subsequent row IDs + frag_row_ids_offset += count as u64; + } + } + + // Accumulate rows for the current output file and flush when reaching the threshold + total_rows_in_current += file_meta.num_rows; + if total_rows_in_current >= max_rows_per_file { + // v2_0 compatibility: enforce single-page structural headers before file close + // - We truncate to a single page and rewrite the page’s `num_rows` to match the output + // file’s row count so downstream decoders see a consistent header. 
+ let mut final_cols: Vec> = Vec::with_capacity(column_count); + for (i, column_info) in current_page_table.iter().enumerate() { + // For v2_0 struct headers, force a single page and set num_rows to total + let mut pages_vec = std::mem::take(&mut col_pages[i]); + if version == LanceFileVersion::V2_0 + && is_non_leaf_column.get(i).copied().unwrap_or(false) + && !pages_vec.is_empty() + { + pages_vec[0].num_rows = total_rows_in_current; + pages_vec[0].priority = 0; + pages_vec.truncate(1); + } + let pages_arc = Arc::from(pages_vec.into_boxed_slice()); + let buffers_vec = std::mem::take(&mut col_buffers[i]); + final_cols.push(Arc::new(ColumnInfo::new( + column_info.index, + pages_arc, + buffers_vec, + column_info.encoding.clone(), + ))); + } + let writer = current_writer.take().unwrap(); + flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; + + // Register the newly closed output file as a fragment data file + let (maj, min) = version.to_numbers(); + let mut fragment_out = Fragment::new(0); + let mut data_file_out = + DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); + // v2_0 vs v2_1+ field-to-column index mapping + // - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping + // - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index + let is_structural = version >= LanceFileVersion::V2_1; + let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); + let mut curr_col_idx: i32 = 0; + for field in schema.fields_pre_order() { + if field.is_packed_struct() || field.children.is_empty() || !is_structural { + field_column_indices.push(curr_col_idx); + curr_col_idx += 1; + } else { + field_column_indices.push(-1); + } + } + data_file_out.fields = full_field_ids.clone(); + data_file_out.column_indices = field_column_indices; + fragment_out.files.push(data_file_out); + fragment_out.physical_rows = Some(total_rows_in_current as usize); + if uses_stable_row_ids { 
+ fragment_out.row_id_meta = + Some(RowIdMeta::Inline(write_row_ids(¤t_row_ids))); + } + // Reset state for next output file + current_writer = None; + current_pos = 0; + current_page_table.clear(); + for v in col_pages.iter_mut() { + v.clear(); + } + for v in col_buffers.iter_mut() { + v.clear(); + } + out.push(fragment_out); + total_rows_in_current = 0; + if uses_stable_row_ids { + current_row_ids = RowIdSequence::new(); + } + } + } + } // Finished writing all fragments; any remaining data in memory will be flushed below + + if total_rows_in_current > 0 { + // Flush remaining rows as a final output file + // v2_0 compatibility: same single-page enforcement applies for the final file close + let mut final_cols: Vec> = Vec::with_capacity(column_count); + for (i, ci) in current_page_table.iter().enumerate() { + // For v2_0 struct headers, force a single page and set num_rows to total + let mut pages_vec = std::mem::take(&mut col_pages[i]); + if version == LanceFileVersion::V2_0 + && is_non_leaf_column.get(i).copied().unwrap_or(false) + && !pages_vec.is_empty() + { + pages_vec[0].num_rows = total_rows_in_current; + pages_vec[0].priority = 0; + pages_vec.truncate(1); + } + let pages_arc = Arc::from(pages_vec.into_boxed_slice()); + let buffers_vec = std::mem::take(&mut col_buffers[i]); + final_cols.push(Arc::new(ColumnInfo::new( + ci.index, + pages_arc, + buffers_vec, + ci.encoding.clone(), + ))); + } + if current_writer.is_none() { + let filename = format!("{}.lance", generate_random_filename()); + let path = dataset.base.child(DATA_DIR).child(filename.as_str()); + let writer = dataset.object_store.create(&path).await?; + current_writer = Some(writer); + current_filename = Some(filename); + } + let writer = current_writer.take().unwrap(); + flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; + // Register the final file + let (maj, min) = version.to_numbers(); + let mut frag = Fragment::new(0); + let mut df = 
DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); + // v2_0 vs v2_1+ field-to-column index mapping for the final file + let is_structural = version >= LanceFileVersion::V2_1; + let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); + let mut curr_col_idx: i32 = 0; + for field in schema.fields_pre_order() { + if field.is_packed_struct() || field.children.is_empty() || !is_structural { + field_column_indices.push(curr_col_idx); + curr_col_idx += 1; + } else { + field_column_indices.push(-1); + } + } + df.fields = full_field_ids.clone(); + df.column_indices = field_column_indices; + frag.files.push(df); + frag.physical_rows = Some(total_rows_in_current as usize); + if uses_stable_row_ids { + frag.row_id_meta = Some(RowIdMeta::Inline(write_row_ids(¤t_row_ids))); + } + out.push(frag); + } + Ok(out) +} + +/// Finalizes a compacted data file by writing the Lance footer via `FileWriter`. +/// +/// This function does not manually craft the footer. Instead it: +/// - Pads the current `ObjectWriter` position to a 64‑byte boundary (required for v2_1+ readers). +/// - Converts the collected per‑column info (`final_cols`) into `ColumnMetadata`. +/// - Constructs a `lance_file::writer::FileWriter` with the active `schema`, column metadata, +/// and `total_rows_in_current`. +/// - Calls `FileWriter::finish()` to emit column metadata, offset tables, global buffers +/// (schema descriptor), version, and to close the writer. +/// +/// Preconditions: +/// - All page data and column‑level buffers referenced by `final_cols` have already been written +/// to `writer`; otherwise offsets in the footer will be invalid. +/// +/// Version notes: +/// - v2_0 structural single‑page enforcement is handled when building `final_cols`; this function +/// only performs consistent finalization. 
+async fn flush_footer( + mut writer: ObjectWriter, + schema: &Schema, + final_cols: &[Arc], + total_rows_in_current: u64, + version: LanceFileVersion, +) -> Result<()> { + if version >= LanceFileVersion::V2_1 { + const ALIGN: usize = 64; + static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); + let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); + let pos = writer.tell().await? as u64; + let pad = (ALIGN as u64 - (pos % ALIGN as u64)) % ALIGN as u64; + if pad != 0 { + writer.write_all(&zero_buf[..pad as usize]).await?; + } + } + let mut col_metadatas = Vec::with_capacity(final_cols.len()); + for col in final_cols { + let pages = col + .page_infos + .iter() + .map(|page_info| { + let encoded_encoding = match &page_info.encoding { + PageEncoding::Legacy(array_encoding) => { + Any::from_msg(array_encoding)?.encode_to_vec() + } + PageEncoding::Structural(page_layout) => { + Any::from_msg(page_layout)?.encode_to_vec() + } + }; + let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = page_info + .buffer_offsets_and_sizes + .as_ref() + .iter() + .cloned() + .unzip(); + Ok(pbfile::column_metadata::Page { + buffer_offsets, + buffer_sizes, + encoding: Some(pbfile::Encoding { + location: Some(pbfile::encoding::Location::Direct( + pbfile::DirectEncoding { + encoding: encoded_encoding, + }, + )), + }), + length: page_info.num_rows, + priority: page_info.priority, + }) + }) + .collect::>>()?; + let (buffer_offsets, buffer_sizes): (Vec<_>, Vec<_>) = + col.buffer_offsets_and_sizes.iter().cloned().unzip(); + let encoded_col_encoding = Any::from_msg(&col.encoding)?.encode_to_vec(); + let column = pbfile::ColumnMetadata { + pages, + buffer_offsets, + buffer_sizes, + encoding: Some(pbfile::Encoding { + location: Some(pbfile::encoding::Location::Direct(pbfile::DirectEncoding { + encoding: encoded_col_encoding, + })), + }), + }; + col_metadatas.push(column); + } + let mut file_writer = FileWriter::new_lazy( + writer, + FileWriterOptions { + format_version: 
Some(version), + ..Default::default() + }, + ); + file_writer.initialize_with_external_metadata( + schema.clone(), + col_metadatas, + total_rows_in_current, + ); + file_writer.finish().await?; + Ok(()) +} diff --git a/rust/lance/src/dataset/optimize/tests/binary_copy.rs b/rust/lance/src/dataset/optimize/tests/binary_copy.rs new file mode 100644 index 00000000000..b28a2324e9e --- /dev/null +++ b/rust/lance/src/dataset/optimize/tests/binary_copy.rs @@ -0,0 +1,774 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use super::*; + +#[tokio::test] +async fn test_binary_copy_merge_small_files() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_merge_small_files(version).await; + } +} + +async fn do_test_binary_copy_merge_small_files(version: LanceFileVersion) { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let reader2 = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 2_500, + max_rows_per_group: 1_000, + data_storage_version: Some(version), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params.clone())) + .await + .unwrap(); + dataset.append(reader2, Some(write_params)).await.unwrap(); + + let before = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert!(metrics.fragments_added >= 1); + assert_eq!( + dataset.count_rows(None).await.unwrap() as usize, + before.num_rows() + ); + let after = dataset.scan().try_into_batch().await.unwrap(); + assert_eq!(before, after); +} + +#[tokio::test] +async fn 
test_binary_copy_with_defer_remap() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_with_defer_remap(version).await; + } +} + +async fn do_test_binary_copy_with_defer_remap(version: LanceFileVersion) { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; + use std::sync::Arc; + + let fixed_list_dt = + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4); + + let meta_fields = Fields::from(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Int32, true), + Field::new("c", fixed_list_dt.clone(), true), + ]); + + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(8), true), + ]); + + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", DataType::Binary, true), + ]); + + let reader = gen_batch() + .col("vec", array::rand_vec::(Dimension::from(16))) + .col("i", array::step::()) + .col("meta", array::rand_struct(meta_fields)) + .col("nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields), true), + ) + .into_reader_rows(RowCount::from(6_000), BatchCount::from(1)); + + let mut dataset = Dataset::write( + reader, + "memory://test/binary_copy_nested", + Some(WriteParams { + max_rows_per_file: 1_000, + data_storage_version: Some(version), + ..Default::default() + }), + ) + .await + .unwrap(); + + let before_batch = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + defer_index_remap: true, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = 
compact_files(&mut dataset, options, None).await.unwrap(); + + let after_batch = dataset.scan().try_into_batch().await.unwrap(); + + assert_eq!(before_batch, after_batch); +} + +#[tokio::test] +async fn test_binary_copy_preserves_stable_row_ids() { + for version in LanceFileVersion::iter_non_legacy() { + do_binary_copy_preserves_stable_row_ids(version).await; + } +} + +async fn do_binary_copy_preserves_stable_row_ids(version: LanceFileVersion) { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32, RandomVector}; + let mut data_gen = BatchGenerator::new() + .col(Box::new( + RandomVector::new().vec_width(8).named("vec".to_owned()), + )) + .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); + + let mut dataset = Dataset::write( + data_gen.batch(4_000), + format!("memory://test/binary_copy_stable_row_ids_{}", version).as_str(), + Some(WriteParams { + enable_stable_row_ids: true, + data_storage_version: Some(version), + max_rows_per_file: 500, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("scalar".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vector".into()), + ¶ms, + false, + ) + .await + .unwrap(); + + async fn index_set(dataset: &Dataset) -> HashSet { + dataset + .load_indices() + .await + .unwrap() + .iter() + .map(|index| index.uuid) + .collect() + } + let indices = index_set(&dataset).await; + + async fn vector_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + let query = Float32Array::from(vec![0.0f32; 8]); + scanner + .nearest("vec", &query, 10) + .unwrap() + .project(&["i"]) + .unwrap(); + scanner.try_into_batch().await.unwrap() + } + + async fn scalar_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + scanner.filter("i = 
100").unwrap().project(&["i"]).unwrap(); + scanner.try_into_batch().await.unwrap() + } + + let before_vec_result = vector_query(&dataset).await; + let before_scalar_result = scalar_query(&dataset).await; + + let before_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let current_indices = index_set(&dataset).await; + assert_eq!(indices, current_indices); + + let after_vec_result = vector_query(&dataset).await; + assert_eq!(before_vec_result, after_vec_result); + + let after_scalar_result = scalar_query(&dataset).await; + assert_eq!(before_scalar_result, after_scalar_result); + + let after_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap(); + + let before_idx = arrow_ord::sort::sort_to_indices( + before_batch.column_by_name(lance_core::ROW_ID).unwrap(), + None, + None, + ) + .unwrap(); + let after_idx = arrow_ord::sort::sort_to_indices( + after_batch.column_by_name(lance_core::ROW_ID).unwrap(), + None, + None, + ) + .unwrap(); + let before = arrow::compute::take_record_batch(&before_batch, &before_idx).unwrap(); + let after = arrow::compute::take_record_batch(&after_batch, &after_idx).unwrap(); + + assert_eq!(before, after); +} + +#[tokio::test] +async fn test_binary_copy_remaps_unstable_row_ids() { + for version in LanceFileVersion::iter_non_legacy() { + do_binary_copy_remaps_unstable_row_ids(version).await; + } +} + +async fn do_binary_copy_remaps_unstable_row_ids(version: LanceFileVersion) { + let mut data_gen = BatchGenerator::new() + .col(Box::new( + RandomVector::new().vec_width(8).named("vec".to_owned()), + )) + .col(Box::new(IncrementingInt32::new().named("i".to_owned()))); + + 
let mut dataset = Dataset::write( + data_gen.batch(4_000), + "memory://test/binary_copy_no_stable", + Some(WriteParams { + enable_stable_row_ids: false, + data_storage_version: Some(version), + max_rows_per_file: 500, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("scalar".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let params = VectorIndexParams::ivf_pq(1, 8, 1, MetricType::L2, 50); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vector".into()), + &params, + false, + ) + .await + .unwrap(); + + async fn vector_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + let query = Float32Array::from(vec![0.0f32; 8]); + scanner + .nearest("vec", &query, 10) + .unwrap() + .project(&["i"]) + .unwrap(); + scanner.try_into_batch().await.unwrap() + } + + async fn scalar_query(dataset: &Dataset) -> RecordBatch { + let mut scanner = dataset.scan(); + scanner.filter("i = 100").unwrap().project(&["i"]).unwrap(); + scanner.try_into_batch().await.unwrap() + } + + let before_vec_result = vector_query(&dataset).await; + let before_scalar_result = scalar_query(&dataset).await; + let before_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let after_vec_result = vector_query(&dataset).await; + assert_eq!(before_vec_result, after_vec_result); + + let after_scalar_result = scalar_query(&dataset).await; + assert_eq!(before_scalar_result, after_scalar_result); + + let after_batch = dataset + .scan() + .project(&["vec", "i"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + assert_eq!(before_batch, after_batch); +} + +#[tokio::test]
+async fn test_binary_copy_preserves_zonemap_queries() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + let mut data_gen = BatchGenerator::new() + .col(Box::new(IncrementingInt32::new().named("a".to_owned()))) + .col(Box::new(IncrementingInt32::new().named("b".to_owned()))); + + let mut dataset = Dataset::write( + data_gen.batch(5_000), + "memory://test/binary_copy_zonemap", + Some(WriteParams { + max_rows_per_file: 500, + data_storage_version: Some(LanceFileVersion::V2_1), + ..Default::default() + }), + ) + .await + .unwrap(); + + let zonemap_params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap); + dataset + .create_index( + &["a"], + IndexType::Scalar, + Some("zonemap".into()), + &zonemap_params, + false, + ) + .await + .unwrap(); + + let predicate = "a >= 2500 AND b < 4000"; + let before = dataset + .scan() + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + compact_files(&mut dataset, options, None).await.unwrap(); + + let after = dataset + .scan() + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + assert_eq!(before, after); +} + +#[tokio::test] +async fn test_binary_copy_preserves_bloom_filter_queries() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + let mut data_gen = BatchGenerator::new() + .col(Box::new(IncrementingInt32::new().named("id".to_owned()))) + .col(Box::new(IncrementingInt32::new().named("val".to_owned()))); + + let mut dataset = Dataset::write( + data_gen.batch(6_000), + "memory://test/binary_copy_bloom", + Some(WriteParams { + max_rows_per_file: 500, + data_storage_version: Some(LanceFileVersion::V2_1), + ..Default::default() + }), + ) + .await + .unwrap(); + + #[derive(serde::Serialize)] + struct BloomParams { + number_of_items: u64, + probability: f64, + } + let 
bloom_params = + ScalarIndexParams::for_builtin(BuiltinIndexType::BloomFilter).with_params(&BloomParams { + number_of_items: 500, + probability: 0.01, + }); + dataset + .create_index( + &["val"], + IndexType::Scalar, + Some("bloom".into()), + &bloom_params, + false, + ) + .await + .unwrap(); + + let predicate = "val IN (123, 124, 125, 126)"; + let before = dataset + .scan() + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000, + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + compact_files(&mut dataset, options, None).await.unwrap(); + + let after = dataset + .scan() + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + + assert_eq!(before, after); +} + +#[tokio::test] +async fn test_binary_copy_fallback_to_common_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 500, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + dataset.delete("a < 100").await.unwrap(); + + let before = dataset.scan().try_into_batch().await.unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 100_000, + enable_binary_copy: true, + ..Default::default() + }; + + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + + let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); + + let after = dataset.scan().try_into_batch().await.unwrap(); + assert_eq!(before, after); +} + +#[tokio::test] +async fn test_can_use_binary_copy_schema_consistency_ok() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = 
sample_data(); + let reader1 = RecordBatchIterator::new(vec![Ok(data.slice(0, 5_000))], data.schema()); + let reader2 = RecordBatchIterator::new(vec![Ok(data.slice(5_000, 5_000))], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let mut dataset = Dataset::write(reader1, test_uri, Some(write_params.clone())) + .await + .unwrap(); + dataset.append(reader2, Some(write_params)).await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(can_use_binary_copy(&dataset, &options, &frags)); +} + +#[tokio::test] +async fn test_can_use_binary_copy_schema_mismatch() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let mut frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + // Introduce a column index mismatch in the first data file + if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { + if let Some(first) = df.column_indices.get_mut(0) { + *first = -*first - 1; + } else { + df.column_indices.push(-1); + } + } + assert!(!can_use_binary_copy(&dataset, &options, &frags)); + + // Also introduce a version mismatch and ensure rejection + if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { + df.file_minor_version = if df.file_minor_version == 1 { 2 } else { 1 }; + } + assert!(!can_use_binary_copy(&dataset, &options, &frags)); +} + +#[tokio::test] +async fn 
test_can_use_binary_copy_version_mismatch() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 500, + data_storage_version: Some(LanceFileVersion::V2_0), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + // Append additional data and then mark its files as a newer format version (v2.1). + let reader_append = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + dataset.append(reader_append, None).await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let mut frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!( + frags.len() >= 2, + "expected multiple fragments for version mismatch test" + ); + + // Simulate mixed file versions by marking the second fragment as v2.1. 
+ let (v21_major, v21_minor) = LanceFileVersion::V2_1.to_numbers(); + for file in &mut frags[1].files { + file.file_major_version = v21_major; + file.file_minor_version = v21_minor; + } + + assert!(!can_use_binary_copy(&dataset, &options, &frags)); +} + +#[tokio::test] +async fn test_can_use_binary_copy_reject_deletions() { + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let data = sample_data(); + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 1_000, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + dataset.delete("a < 10").await.unwrap(); + + let options = CompactionOptions { + enable_binary_copy: true, + ..Default::default() + }; + let frags: Vec = dataset + .get_fragments() + .into_iter() + .map(Into::into) + .collect(); + assert!(!can_use_binary_copy(&dataset, &options, &frags)); +} + +#[tokio::test] +async fn test_binary_copy_compaction_with_complex_schema() { + for version in LanceFileVersion::iter_non_legacy() { + do_test_binary_copy_compaction_with_complex_schema(version).await; + } +} + +async fn do_test_binary_copy_compaction_with_complex_schema(version: LanceFileVersion) { + use arrow_schema::{DataType, Field, Fields, TimeUnit}; + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::{array, gen_batch, BatchCount, Dimension, RowCount}; + + let row_num = 1_000; + + let inner_fields = Fields::from(vec![ + Field::new("x", DataType::UInt32, true), + Field::new("y", DataType::LargeUtf8, true), + ]); + let nested_fields = Fields::from(vec![ + Field::new("inner", DataType::Struct(inner_fields.clone()), true), + Field::new("fsb", DataType::FixedSizeBinary(16), true), + Field::new("bin", DataType::Binary, true), + ]); + let event_fields = Fields::from(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("payload", 
DataType::Binary, true), + ]); + + let reader_full = gen_batch() + .col("vec1", array::rand_vec::(Dimension::from(12))) + .col("vec2", array::rand_vec::(Dimension::from(8))) + .col("i32", array::step::()) + .col("i64", array::step::()) + .col("f32", array::rand::()) + .col("f64", array::rand::()) + .col("bool", array::rand_boolean()) + .col("date32", array::rand_date32()) + .col("date64", array::rand_date64()) + .col( + "ts_ms", + array::rand_timestamp(&DataType::Timestamp(TimeUnit::Millisecond, None)), + ) + .col( + "utf8", + array::rand_utf8(lance_datagen::ByteCount::from(16), false), + ) + .col("large_utf8", array::random_sentence(1, 6, true)) + .col( + "bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), false), + ) + .col( + "large_bin", + array::rand_fixedbin(lance_datagen::ByteCount::from(24), true), + ) + .col( + "varbin", + array::rand_varbin( + lance_datagen::ByteCount::from(8), + lance_datagen::ByteCount::from(32), + ), + ) + .col("fsb16", array::rand_fsb(16)) + .col( + "fsl4", + array::cycle_vec(array::rand::(), Dimension::from(4)), + ) + .col("struct_simple", array::rand_struct(inner_fields.clone())) + .col("struct_nested", array::rand_struct(nested_fields)) + .col( + "events", + array::rand_list_any(array::rand_struct(event_fields.clone()), true), + ) + .into_reader_rows(RowCount::from(row_num), BatchCount::from(10)); + + let full_dir = TempStrDir::default(); + let mut dataset = Dataset::write( + reader_full, + &*full_dir, + Some(WriteParams { + enable_stable_row_ids: true, + data_storage_version: Some(version), + max_rows_per_file: (row_num / 100) as usize, + ..Default::default() + }), + ) + .await + .unwrap(); + + let opt_full = CompactionOptions { + enable_binary_copy: false, + ..Default::default() + }; + let opt_binary = CompactionOptions { + enable_binary_copy: true, + enable_binary_copy_force: true, + ..Default::default() + }; + + let _ = compact_files(&mut dataset, opt_full, None).await.unwrap(); + let before = 
dataset.count_rows(None).await.unwrap(); + let batch_before = dataset.scan().try_into_batch().await.unwrap(); + + let mut dataset = dataset.checkout_version(1).await.unwrap(); + + // rollback and trigger another binary copy compaction + dataset.restore().await.unwrap(); + let _ = compact_files(&mut dataset, opt_binary, None).await.unwrap(); + let after = dataset.count_rows(None).await.unwrap(); + let batch_after = dataset.scan().try_into_batch().await.unwrap(); + + assert_eq!(before, after); + assert_eq!(batch_before, batch_after); +} From cf46643aaf95a13877650a4de6d623ebfd918c40 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 11 Dec 2025 19:10:52 +0800 Subject: [PATCH 10/24] code review --- rust/lance/src/dataset/optimize/binary_copy.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index 304db331112..babcb154adb 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -144,10 +144,10 @@ pub async fn rewrite_files_binary_copy( .open_file_with_priority(&full_path, 0, &df.file_size_bytes) .await?; let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; - let src_colum_infos = file_meta.column_infos.clone(); + let src_column_infos = file_meta.column_infos.clone(); // Initialize current_page_table if current_page_table.is_empty() { - current_page_table = src_colum_infos + current_page_table = src_column_infos .iter() .map(|column_index| ColumnInfo { index: column_index.index, @@ -161,7 +161,7 @@ pub async fn rewrite_files_binary_copy( } // Iterate through each column of the current data file of the current fragment - for (col_idx, src_column_info) in src_colum_infos.iter().enumerate() { + for (col_idx, src_column_info) in src_column_infos.iter().enumerate() { // v2_0 compatibility: special handling for non-leaf structural header columns // - v2_0 expects structural header 
columns to have a SINGLE page; they carry layout // metadata only and are not true data carriers. From 9646e51d7324bf15640e5b09570f3794c77755f2 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 11 Dec 2025 19:29:46 +0800 Subject: [PATCH 11/24] code review --- java/lance-jni/Cargo.lock | 2 ++ rust/lance/src/dataset/optimize.rs | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 06eb7e045f9..33397d98499 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3417,6 +3417,7 @@ dependencies = [ "arrow-buffer", "arrow-cast", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "bytes", @@ -3451,6 +3452,7 @@ dependencies = [ "datafusion-sql", "deepsize", "futures", + "itertools 0.13.0", "lance-arrow", "libc", "log", diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 266334c331b..d641bf19408 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -490,13 +490,6 @@ pub async fn compact_files_with_planner( ) -> Result { let compaction_plan: CompactionPlan = planner.plan(dataset).await?; - if compaction_plan.tasks().is_empty() && options.enable_binary_copy_force { - return Err(Error::NotSupported { - source: "cannot execute binary copy compaction task".into(), - location: location!(), - }); - } - // If nothing to compact, don't make a commit. 
if compaction_plan.tasks().is_empty() { return Ok(CompactionMetrics::default()); From 582074a2ca16cea14d81f39c99b5839c0b068214 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 11 Dec 2025 19:30:38 +0800 Subject: [PATCH 12/24] code review --- java/lance-jni/Cargo.lock | 2 -- 1 file changed, 2 deletions(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 33397d98499..06eb7e045f9 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3417,7 +3417,6 @@ dependencies = [ "arrow-buffer", "arrow-cast", "arrow-data", - "arrow-ord", "arrow-schema", "arrow-select", "bytes", @@ -3452,7 +3451,6 @@ dependencies = [ "datafusion-sql", "deepsize", "futures", - "itertools 0.13.0", "lance-arrow", "libc", "log", From a352b16bb5e33d6294a438944dbeefa138ef7ed3 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 16 Dec 2025 17:01:29 +0800 Subject: [PATCH 13/24] bug fix --- .../lance/src/dataset/optimize/binary_copy.rs | 40 +------------------ 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index babcb154adb..92fd99eb249 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -2,7 +2,6 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors use crate::dataset::fragment::write::generate_random_filename; -use crate::dataset::optimize::load_row_id_sequence; use crate::dataset::WriteParams; use crate::dataset::DATA_DIR; use crate::datatypes::Schema; @@ -17,8 +16,7 @@ use lance_file::writer::{FileWriter, FileWriterOptions}; use lance_io::object_writer::ObjectWriter; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::traits::Writer; -use lance_table::format::{DataFile, Fragment, RowIdMeta}; -use lance_table::rowids::{write_row_ids, RowIdSequence}; +use lance_table::format::{DataFile, Fragment}; use prost::Message; use prost_types::Any; use std::ops::Range; @@ 
-113,8 +111,6 @@ pub async fn rewrite_files_binary_copy( let mut col_buffers: Vec> = vec![Vec::new(); column_count]; let mut total_rows_in_current: u64 = 0; let max_rows_per_file = params.max_rows_per_file as u64; - let uses_stable_row_ids = dataset.manifest.uses_stable_row_ids(); - let mut current_row_ids = RowIdSequence::new(); // Align all writes to 64-byte boundaries to honor typical IO alignment and // keep buffer offsets valid across concatenated pages. @@ -123,12 +119,6 @@ pub async fn rewrite_files_binary_copy( let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); // Visit each fragment and all of its data files (a fragment may contain multiple files) for frag in fragments.iter() { - let mut frag_row_ids_offset: u64 = 0; - let frag_row_ids = if uses_stable_row_ids { - Some(load_row_id_sequence(dataset, frag).await?) - } else { - None - }; for df in frag.files.iter() { let object_store = if let Some(base_id) = df.base_id { dataset.object_store_for_base(base_id).await? @@ -300,23 +290,6 @@ pub async fn rewrite_files_binary_copy( } } // finished all columns in the current source file - if uses_stable_row_ids { - // When stable row IDs are enabled, incorporate the fragment's row IDs - if let Some(seq) = frag_row_ids.as_ref() { - // Number of rows in the current source file - let count = file_meta.num_rows as usize; - - // Take the subsequence of row IDs corresponding to this file - let slice = seq.slice(frag_row_ids_offset as usize, count); - - // Append these row IDs to the accumulated sequence for the current output - current_row_ids.extend(slice.iter().collect()); - - // Advance the offset so the next file reads the subsequent row IDs - frag_row_ids_offset += count as u64; - } - } - // Accumulate rows for the current output file and flush when reaching the threshold total_rows_in_current += file_meta.num_rows; if total_rows_in_current >= max_rows_per_file { @@ -370,10 +343,7 @@ pub async fn rewrite_files_binary_copy( data_file_out.column_indices = 
field_column_indices; fragment_out.files.push(data_file_out); fragment_out.physical_rows = Some(total_rows_in_current as usize); - if uses_stable_row_ids { - fragment_out.row_id_meta = - Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); - } + // Reset state for next output file current_writer = None; current_pos = 0; @@ -386,9 +356,6 @@ pub async fn rewrite_files_binary_copy( } out.push(fragment_out); total_rows_in_current = 0; - if uses_stable_row_ids { - current_row_ids = RowIdSequence::new(); - } } } } // Finished writing all fragments; any remaining data in memory will be flushed below @@ -446,9 +413,6 @@ pub async fn rewrite_files_binary_copy( df.column_indices = field_column_indices; frag.files.push(df); frag.physical_rows = Some(total_rows_in_current as usize); - if uses_stable_row_ids { - frag.row_id_meta = Some(RowIdMeta::Inline(write_row_ids(&current_row_ids))); - } out.push(frag); } Ok(out) From 2faa84835b27d682900417eeaddaa0d237ef67a4 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 16 Dec 2025 17:07:57 +0800 Subject: [PATCH 14/24] bug fix --- rust/lance/src/dataset/optimize.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index d641bf19408..1c352a48572 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -113,7 +113,6 @@ use tracing::info; mod binary_copy; pub mod remapping; -use super::rowids::load_row_id_sequence; use crate::index::frag_reuse::build_new_frag_reuse_index; use crate::io::deletion::read_dataset_deletion_file; use binary_copy::rewrite_files_binary_copy; From c8255fff1e8f85b5275ccbbbc0525dd394f07160 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Wed, 17 Dec 2025 16:44:17 +0800 Subject: [PATCH 15/24] bug fix --- rust/lance-file/src/writer.rs | 3 +- .../lance/src/dataset/optimize/binary_copy.rs | 80 +++++++++++-------- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/rust/lance-file/src/writer.rs
b/rust/lance-file/src/writer.rs index a18daa57383..77cc627a540 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -492,8 +492,7 @@ impl FileWriter { /// only need to write the footer and schema metadata. The provided /// `column_metadata` must describe the buffers already persisted by the /// underlying `ObjectWriter`, and `rows_written` should reflect the total number - /// of rows in those buffers. Call this on a lazily created writer before - /// invoking [`finish`]. + /// of rows in those buffers. pub fn initialize_with_external_metadata( &mut self, schema: lance_core::datatypes::Schema, diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index 92fd99eb249..63e4f3e480f 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -23,6 +23,32 @@ use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWriteExt; +const ALIGN: usize = 64; + +/// Apply 64-byte alignment padding for V2.1+ files. +/// +/// For V2.1+, writes padding bytes to align the current position to a 64-byte boundary. +/// For V2.0 and earlier, no padding is applied as alignment is not required. +/// +/// Returns the new position after padding (if any). +async fn apply_alignment_padding( + writer: &mut ObjectWriter, + current_pos: u64, + version: LanceFileVersion, +) -> Result { + if version >= LanceFileVersion::V2_1 { + static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); + let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); + + let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; + if pad != 0 { + writer.write_all(&zero_buf[..pad]).await?; + return Ok(current_pos + pad as u64); + } + } + Ok(current_pos) +} + /// Rewrite the files in a single task using binary copy semantics. 
/// /// Flow overview (per task): @@ -38,7 +64,7 @@ use tokio::io::AsyncWriteExt; /// - Assumes all input files share the same Lance file version; version drives column-count /// calculation (v2.0 includes structural headers, v2.1+ only leaf columns). /// - Preserves stable row ids by concatenating row-id sequences when enabled. -/// - Enforces 64-byte alignment for page and buffer writes to satisfy downstream readers. +/// - Enforces 64-byte alignment for page and buffer writes in V2.1+ files (V2.0 does not require alignment). /// - For v2.0, preserves single-page structural headers and normalizes their row counts/priority. /// - Flushes an output file once `max_rows_per_file` rows are accumulated, then repeats. /// @@ -71,19 +97,22 @@ pub async fn rewrite_files_binary_copy( ) .unwrap() .resolve(); - // v2_0 compatibility: column layout differs across file versions - // - v2_0 materializes BOTH leaf columns and non-leaf structural headers (e.g., Struct / List) + // v2.0 and v2.1+ handle structural headers differently during file writing: + // - v2_0 materializes ALL fields in pre-order traversal (leaf fields + non-leaf struct headers), // which means the ColumnInfo set includes all fields in pre-order traversal. - // - v2_1+ materializes ONLY leaf columns. Non-leaf structural headers are not stored as columns. - // As a result, the ColumnInfo set contains leaf fields only. + // - v2_1+ materializes fields that are either leaf columns OR packed structs. Non-leaf structural + // headers (unpacked structs with children) are not stored as columns. + // As a result, the ColumnInfo set contains leaf fields and packed structs. 
// To correctly align copy layout, we derive `column_count` by version: // - v2_0: use total number of fields in pre-order (leaf + non-leaf headers) - // - v2_1+: use only the number of leaf fields - let leaf_count = schema.fields_pre_order().filter(|f| f.is_leaf()).count(); + // - v2_1+: use only the number of leaf fields plus packed structs let column_count = if version == LanceFileVersion::V2_0 { schema.fields_pre_order().count() } else { - leaf_count + schema + .fields_pre_order() + .filter(|f| f.is_packed_struct() || f.is_leaf()) + .count() }; // v2_0 compatibility: build a map to identify non-leaf structural header columns @@ -112,11 +141,6 @@ pub async fn rewrite_files_binary_copy( let mut total_rows_in_current: u64 = 0; let max_rows_per_file = params.max_rows_per_file as u64; - // Align all writes to 64-byte boundaries to honor typical IO alignment and - // keep buffer offsets valid across concatenated pages. - const ALIGN: usize = 64; - static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); - let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); // Visit each fragment and all of its data files (a fragment may contain multiple files) for frag in fragments.iter() { for df in frag.files.iter() { @@ -236,11 +260,8 @@ pub async fn rewrite_files_binary_copy( for _ in 0..*buffer_count { if let Some(bytes) = bytes_iter.next() { let writer = current_writer.as_mut().unwrap(); - let pad = (ALIGN - (current_pos as usize % ALIGN)) % ALIGN; - if pad != 0 { - writer.write_all(&zero_buf[..pad]).await?; - current_pos += pad as u64; - } + current_pos = + apply_alignment_padding(writer, current_pos, version).await?; let start = current_pos; writer.write_all(&bytes).await?; current_pos += bytes.len() as u64; @@ -277,11 +298,7 @@ pub async fn rewrite_files_binary_copy( let bytes_vec = file_scheduler.submit_request(ranges, 0).await?; for bytes in bytes_vec.into_iter() { let writer = current_writer.as_mut().unwrap(); - let pad = (ALIGN - (current_pos as 
usize % ALIGN)) % ALIGN; - if pad != 0 { - writer.write_all(&zero_buf[..pad]).await?; - current_pos += pad as u64; - } + current_pos = apply_alignment_padding(writer, current_pos, version).await?; let start = current_pos; writer.write_all(&bytes).await?; current_pos += bytes.len() as u64; @@ -332,7 +349,7 @@ pub async fn rewrite_files_binary_copy( let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); let mut curr_col_idx: i32 = 0; for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.children.is_empty() || !is_structural { + if field.is_packed_struct() || field.is_leaf() || !is_structural { field_column_indices.push(curr_col_idx); curr_col_idx += 1; } else { @@ -402,7 +419,7 @@ pub async fn rewrite_files_binary_copy( let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); let mut curr_col_idx: i32 = 0; for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.children.is_empty() || !is_structural { + if field.is_packed_struct() || field.is_leaf() || !is_structural { field_column_indices.push(curr_col_idx); curr_col_idx += 1; } else { @@ -442,16 +459,9 @@ async fn flush_footer( total_rows_in_current: u64, version: LanceFileVersion, ) -> Result<()> { - if version >= LanceFileVersion::V2_1 { - const ALIGN: usize = 64; - static ZERO_BUFFER: std::sync::OnceLock> = std::sync::OnceLock::new(); - let zero_buf = ZERO_BUFFER.get_or_init(|| vec![0u8; ALIGN]); - let pos = writer.tell().await? as u64; - let pad = (ALIGN as u64 - (pos % ALIGN as u64)) % ALIGN as u64; - if pad != 0 { - writer.write_all(&zero_buf[..pad as usize]).await?; - } - } + let pos = writer.tell().await? 
as u64; + let _new_pos = apply_alignment_padding(&mut writer, pos, version).await?; + let mut col_metadatas = Vec::with_capacity(final_cols.len()); for col in final_cols { let pages = col From ab7837214043d8aaed2ec92a2cdefd33e0af55e9 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 22 Dec 2025 15:38:46 +0800 Subject: [PATCH 16/24] Verify the consistency of column buffer encoding --- .../lance/src/dataset/optimize/binary_copy.rs | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index 63e4f3e480f..7f753347cdf 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -8,6 +8,7 @@ use crate::datatypes::Schema; use crate::Dataset; use crate::Result; use lance_arrow::DataTypeExt; +use lance_core::Error; use lance_encoding::decoder::{ColumnInfo, PageEncoding, PageInfo as DecPageInfo}; use lance_encoding::version::LanceFileVersion; use lance_file::format::pbfile; @@ -19,6 +20,7 @@ use lance_io::traits::Writer; use lance_table::format::{DataFile, Fragment}; use prost::Message; use prost_types::Any; +use snafu::location; use std::ops::Range; use std::sync::Arc; use tokio::io::AsyncWriteExt; @@ -79,6 +81,13 @@ pub async fn rewrite_files_binary_copy( params: &WriteParams, read_batch_bytes_opt: Option, ) -> Result> { + if fragments.is_empty() || fragments.iter().any(|fragment| fragment.files.is_empty()) { + return Err(Error::invalid_input( + "binary copy requires at least one data file", + location!(), + )); + } + // Binary copy algorithm overview: // - Reads page and buffer regions directly from source files in bounded batches // - Appends them to a new output file with alignment, updating offsets @@ -132,6 +141,9 @@ pub async fn rewrite_files_binary_copy( let mut current_filename: Option = None; let mut current_pos: u64 = 0; let mut current_page_table: Vec = Vec::new(); + // Baseline 
column encodings captured from the first source file; all subsequent + // files must match per-column to safely concatenate column-level buffers. + let mut baseline_col_encoding_bytes: Vec> = Vec::new(); // Column-list> let mut col_pages: Vec> = std::iter::repeat_with(Vec::::new) @@ -172,6 +184,10 @@ pub async fn rewrite_files_binary_copy( encoding: column_index.encoding.clone(), }) .collect(); + baseline_col_encoding_bytes = src_column_infos + .iter() + .map(|ci| Any::from_msg(&ci.encoding).unwrap().encode_to_vec()) + .collect(); } // Iterate through each column of the current data file of the current fragment @@ -288,8 +304,22 @@ pub async fn rewrite_files_binary_copy( } } // finished scheduling & copying pages for this column in the current source file - // Copy column-level buffers (outside page data) with alignment if !src_column_info.buffer_offsets_and_sizes.is_empty() { + // Validate column-level encoding compatibility before copying buffers + let src_col_encoding_bytes = Any::from_msg(&src_column_info.encoding) + .unwrap() + .encode_to_vec(); + let baseline_bytes = &baseline_col_encoding_bytes[col_idx]; + if src_col_encoding_bytes != *baseline_bytes { + return Err(Error::Execution { + message: format!( + "binary copy: The ColumnEncoding of column {} is incompatible with the first file, \ + making it impossible to safely concatenate buffers", + col_idx + ), + location: location!(), + }); + } let ranges: Vec> = src_column_info .buffer_offsets_and_sizes .iter() From 5b74fb7d442efad79c9161b824895de21edeb480 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Mon, 5 Jan 2026 20:21:59 +0800 Subject: [PATCH 17/24] code review --- java/lance-jni/Cargo.lock | 1 + rust/lance-table/src/rowids.rs | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 86319138591..156a118e70d 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3518,6 +3518,7 @@ dependencies = [ "half", "hex", 
"rand 0.9.2", + "rand_distr 0.5.1", "rand_xoshiro", "random_word", ] diff --git a/rust/lance-table/src/rowids.rs b/rust/lance-table/src/rowids.rs index 3434c06dc5d..81671e871d3 100644 --- a/rust/lance-table/src/rowids.rs +++ b/rust/lance-table/src/rowids.rs @@ -102,12 +102,6 @@ impl From<&[u64]> for RowIdSequence { } } -impl FromIterator for RowIdSequence { - fn from_iter>(iter: T) -> Self { - Self(vec![U64Segment::from_iter(iter)]) - } -} - impl RowIdSequence { pub fn new() -> Self { Self::default() From d35a13056115722a3aae1400107ea8ee79447590 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 6 Jan 2026 19:39:20 +0800 Subject: [PATCH 18/24] code review --- java/lance-jni/Cargo.lock | 1 - 1 file changed, 1 deletion(-) diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 156a118e70d..86319138591 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3518,7 +3518,6 @@ dependencies = [ "half", "hex", "rand 0.9.2", - "rand_distr 0.5.1", "rand_xoshiro", "random_word", ] From 902839dd3044b629fd094b61b1e9b53e4bf42d4e Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 13 Jan 2026 11:02:08 +0800 Subject: [PATCH 19/24] code review --- .../lance/src/dataset/optimize/binary_copy.rs | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index 7f753347cdf..4e4e05b439e 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -51,6 +51,28 @@ async fn apply_alignment_padding( Ok(current_pos) } +/// v2_0 vs v2_1+ field-to-column index mapping +/// - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping +/// - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index +fn compute_field_column_indices( + schema: &Schema, + full_field_ids_len: usize, + version: LanceFileVersion, +) -> Vec { + let is_structural = 
version >= LanceFileVersion::V2_1; + let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids_len); + let mut curr_col_idx: i32 = 0; + for field in schema.fields_pre_order() { + if field.is_packed_struct() || field.is_leaf() || !is_structural { + field_column_indices.push(curr_col_idx); + curr_col_idx += 1; + } else { + field_column_indices.push(-1); + } + } + field_column_indices +} + /// Rewrite the files in a single task using binary copy semantics. /// /// Flow overview (per task): @@ -375,19 +397,9 @@ pub async fn rewrite_files_binary_copy( // v2_0 vs v2_1+ field-to-column index mapping // - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping // - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index - let is_structural = version >= LanceFileVersion::V2_1; - let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); - let mut curr_col_idx: i32 = 0; - for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.is_leaf() || !is_structural { - field_column_indices.push(curr_col_idx); - curr_col_idx += 1; - } else { - field_column_indices.push(-1); - } - } data_file_out.fields = full_field_ids.clone(); - data_file_out.column_indices = field_column_indices; + data_file_out.column_indices = + compute_field_column_indices(&schema, full_field_ids.len(), version); fragment_out.files.push(data_file_out); fragment_out.physical_rows = Some(total_rows_in_current as usize); @@ -445,19 +457,8 @@ pub async fn rewrite_files_binary_copy( let mut frag = Fragment::new(0); let mut df = DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); // v2_0 vs v2_1+ field-to-column index mapping for the final file - let is_structural = version >= LanceFileVersion::V2_1; - let mut field_column_indices: Vec = Vec::with_capacity(full_field_ids.len()); - let mut curr_col_idx: i32 = 0; - for field in schema.fields_pre_order() { - if field.is_packed_struct() || field.is_leaf() || 
!is_structural { - field_column_indices.push(curr_col_idx); - curr_col_idx += 1; - } else { - field_column_indices.push(-1); - } - } df.fields = full_field_ids.clone(); - df.column_indices = field_column_indices; + df.column_indices = compute_field_column_indices(&schema, full_field_ids.len(), version); frag.files.push(df); frag.physical_rows = Some(total_rows_in_current as usize); out.push(frag); From 1f7c5584b1c4e56350ec0945622c537e3c688ba9 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 13 Jan 2026 12:06:08 +0800 Subject: [PATCH 20/24] code review --- rust/lance/src/dataset/optimize.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 1c352a48572..f9c27983c16 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -250,7 +250,11 @@ fn can_use_binary_copy( Ok(version) => version.resolve(), Err(_) => return false, }; + // Capture schema mapping baseline from first data file + if fragments[0].files.is_empty() { + return false; + } let ref_fields = &fragments[0].files[0].fields; let ref_cols = &fragments[0].files[0].column_indices; // Single-pass verification across fragments and their files @@ -968,12 +972,17 @@ async fn rewrite_files( let mut addrs = RoaringTreemap::new(); for frag in &fragments { let frag_id = frag.id as u32; - let count = frag.physical_rows.unwrap_or(0); - for i in 0..count { - let addr = - lance_core::utils::address::RowAddress::new_from_parts(frag_id, i as u32); - addrs.insert(u64::from(addr)); - } + let count = u64::try_from(frag.physical_rows.unwrap_or(0)).map_err(|_| { + Error::Internal { + message: format!( + "Fragment {} has too many physical rows to represent as row addresses", + frag.id + ), + location: location!(), + } + })?; + let start = u64::from(lance_core::utils::address::RowAddress::first_row(frag_id)); + addrs.insert_range(start..start + count); } let captured = 
CapturedRowIds::AddressStyle(addrs); let _ = tx.send(captured); From 7492c4d43494884d4f2986b13126b0eb6882c24a Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 13 Jan 2026 15:23:53 +0800 Subject: [PATCH 21/24] code review --- .../lance/src/dataset/optimize/binary_copy.rs | 188 ++++++++++-------- 1 file changed, 102 insertions(+), 86 deletions(-) diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index 4e4e05b439e..b38b3699276 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -51,6 +51,22 @@ async fn apply_alignment_padding( Ok(current_pos) } +async fn init_writer_if_necessary( + dataset: &Dataset, + current_writer: &mut Option, + current_filename: &mut Option, +) -> Result { + if current_writer.is_none() { + let filename = format!("{}.lance", generate_random_filename()); + let path = dataset.base.child(DATA_DIR).child(filename.as_str()); + let writer = dataset.object_store.create(&path).await?; + *current_writer = Some(writer); + *current_filename = Some(filename); + return Ok(true); + } + Ok(false) +} + /// v2_0 vs v2_1+ field-to-column index mapping /// - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping /// - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index @@ -73,6 +89,62 @@ fn compute_field_column_indices( field_column_indices } +/// Finalize the current output file and return it as a single [Fragment]. +/// - Ensures an output writer / filename is present (creates a new file if needed). +/// - Converts the in-memory `col_pages` / `col_buffers` into `ColumnInfo` metadata, draining them. +/// - Applies v2_0 structural header rules (single page, normalized `num_rows` and `priority`). +/// - Writes the Lance footer via [flush_footer] and registers the resulting [DataFile] in a [Fragment]. 
+/// +/// PAY ATTENTION current function will: +/// - Takes (`Option::take`) the current writer and filename. +/// - Drains `col_pages` and `col_buffers` for all columns. +async fn finalize_current_output_file( + schema: &Schema, + full_field_ids: &[i32], + current_writer: &mut Option, + current_filename: &mut Option, + current_page_table: &[ColumnInfo], + col_pages: &mut [Vec], + col_buffers: &mut [Vec<(u64, u64)>], + is_non_leaf_column: &[bool], + total_rows_in_current: u64, + version: LanceFileVersion, +) -> Result { + let mut final_cols: Vec> = Vec::with_capacity(current_page_table.len()); + for (i, columnInfo) in current_page_table.iter().enumerate() { + let mut pages_vec = std::mem::take(&mut col_pages[i]); + // For v2_0 struct headers, force a single page and set num_rows to total + if version == LanceFileVersion::V2_0 + && is_non_leaf_column.get(i).copied().unwrap_or(false) + && !pages_vec.is_empty() + { + pages_vec[0].num_rows = total_rows_in_current; + pages_vec[0].priority = 0; + pages_vec.truncate(1); + } + let pages_arc = Arc::from(pages_vec.into_boxed_slice()); + let buffers_vec = std::mem::take(&mut col_buffers[i]); + final_cols.push(Arc::new(ColumnInfo::new( + columnInfo.index, + pages_arc, + buffers_vec, + columnInfo.encoding.clone(), + ))); + } + let writer = current_writer.take().unwrap(); + flush_footer(writer, schema, &final_cols, total_rows_in_current, version).await?; + + // Register the newly closed output file as a fragment data file + let (maj, min) = version.to_numbers(); + let mut fragment = Fragment::new(0); + let mut data_file = DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); + data_file.fields = full_field_ids.to_vec(); + data_file.column_indices = compute_field_column_indices(schema, full_field_ids.len(), version); + fragment.files.push(data_file); + fragment.physical_rows = Some(total_rows_in_current as usize); + Ok(fragment) +} + /// Rewrite the files in a single task using binary copy semantics. 
/// /// Flow overview (per task): @@ -228,12 +300,9 @@ pub async fn rewrite_files_binary_copy( continue; } - if current_writer.is_none() { - let filename = format!("{}.lance", generate_random_filename()); - let path = dataset.base.child(DATA_DIR).child(filename.as_str()); - let writer = dataset.object_store.create(&path).await?; - current_writer = Some(writer); - current_filename = Some(filename); + if init_writer_if_necessary(dataset, &mut current_writer, &mut current_filename) + .await? + { current_pos = 0; } @@ -362,46 +431,19 @@ pub async fn rewrite_files_binary_copy( // Accumulate rows for the current output file and flush when reaching the threshold total_rows_in_current += file_meta.num_rows; if total_rows_in_current >= max_rows_per_file { - // v2_0 compatibility: enforce single-page structural headers before file close - // - We truncate to a single page and rewrite the page’s `num_rows` to match the output - // file’s row count so downstream decoders see a consistent header. - let mut final_cols: Vec> = Vec::with_capacity(column_count); - for (i, column_info) in current_page_table.iter().enumerate() { - // For v2_0 struct headers, force a single page and set num_rows to total - let mut pages_vec = std::mem::take(&mut col_pages[i]); - if version == LanceFileVersion::V2_0 - && is_non_leaf_column.get(i).copied().unwrap_or(false) - && !pages_vec.is_empty() - { - pages_vec[0].num_rows = total_rows_in_current; - pages_vec[0].priority = 0; - pages_vec.truncate(1); - } - let pages_arc = Arc::from(pages_vec.into_boxed_slice()); - let buffers_vec = std::mem::take(&mut col_buffers[i]); - final_cols.push(Arc::new(ColumnInfo::new( - column_info.index, - pages_arc, - buffers_vec, - column_info.encoding.clone(), - ))); - } - let writer = current_writer.take().unwrap(); - flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; - - // Register the newly closed output file as a fragment data file - let (maj, min) = version.to_numbers(); - let mut 
fragment_out = Fragment::new(0); - let mut data_file_out = - DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); - // v2_0 vs v2_1+ field-to-column index mapping - // - v2_1+ stores only leaf columns; non-leaf fields get `-1` in the mapping - // - v2_0 includes structural headers as columns; non-leaf fields map to a concrete index - data_file_out.fields = full_field_ids.clone(); - data_file_out.column_indices = - compute_field_column_indices(&schema, full_field_ids.len(), version); - fragment_out.files.push(data_file_out); - fragment_out.physical_rows = Some(total_rows_in_current as usize); + let fragment_out = finalize_current_output_file( + &schema, + &full_field_ids, + &mut current_writer, + &mut current_filename, + ¤t_page_table, + &mut col_pages, + &mut col_buffers, + &is_non_leaf_column, + total_rows_in_current, + version, + ) + .await?; // Reset state for next output file current_writer = None; @@ -421,46 +463,20 @@ pub async fn rewrite_files_binary_copy( if total_rows_in_current > 0 { // Flush remaining rows as a final output file - // v2_0 compatibility: same single-page enforcement applies for the final file close - let mut final_cols: Vec> = Vec::with_capacity(column_count); - for (i, ci) in current_page_table.iter().enumerate() { - // For v2_0 struct headers, force a single page and set num_rows to total - let mut pages_vec = std::mem::take(&mut col_pages[i]); - if version == LanceFileVersion::V2_0 - && is_non_leaf_column.get(i).copied().unwrap_or(false) - && !pages_vec.is_empty() - { - pages_vec[0].num_rows = total_rows_in_current; - pages_vec[0].priority = 0; - pages_vec.truncate(1); - } - let pages_arc = Arc::from(pages_vec.into_boxed_slice()); - let buffers_vec = std::mem::take(&mut col_buffers[i]); - final_cols.push(Arc::new(ColumnInfo::new( - ci.index, - pages_arc, - buffers_vec, - ci.encoding.clone(), - ))); - } - if current_writer.is_none() { - let filename = format!("{}.lance", generate_random_filename()); - let path = 
dataset.base.child(DATA_DIR).child(filename.as_str()); - let writer = dataset.object_store.create(&path).await?; - current_writer = Some(writer); - current_filename = Some(filename); - } - let writer = current_writer.take().unwrap(); - flush_footer(writer, &schema, &final_cols, total_rows_in_current, version).await?; - // Register the final file - let (maj, min) = version.to_numbers(); - let mut frag = Fragment::new(0); - let mut df = DataFile::new_unstarted(current_filename.take().unwrap(), maj, min); - // v2_0 vs v2_1+ field-to-column index mapping for the final file - df.fields = full_field_ids.clone(); - df.column_indices = compute_field_column_indices(&schema, full_field_ids.len(), version); - frag.files.push(df); - frag.physical_rows = Some(total_rows_in_current as usize); + init_writer_if_necessary(dataset, &mut current_writer, &mut current_filename).await?; + let frag = finalize_current_output_file( + &schema, + &full_field_ids, + &mut current_writer, + &mut current_filename, + ¤t_page_table, + &mut col_pages, + &mut col_buffers, + &is_non_leaf_column, + total_rows_in_current, + version, + ) + .await?; out.push(frag); } Ok(out) From 9f0ac0fefd6da34528f32982d716c49eeb22b765 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Tue, 13 Jan 2026 16:14:24 +0800 Subject: [PATCH 22/24] code review --- rust/lance/src/dataset/optimize.rs | 83 +++++++++++++------ .../lance/src/dataset/optimize/binary_copy.rs | 7 +- .../src/dataset/optimize/tests/binary_copy.rs | 12 +-- 3 files changed, 69 insertions(+), 33 deletions(-) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index f9c27983c16..c8e76b0c0c6 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -210,27 +210,41 @@ impl CompactionOptions { /// - No fragment has a deletion file /// TODO: Need to support schema evolution case like add column and drop column /// - All data files share identical schema mappings (`fields`, `column_indices`) -fn 
can_use_binary_copy( +/// - Input data files must not contain extra global buffers (beyond schema / file descriptor) +async fn can_use_binary_copy( dataset: &Dataset, options: &CompactionOptions, fragments: &[Fragment], ) -> bool { + can_use_binary_copy_impl(dataset, options, fragments) + .await + .unwrap_or_else(|err| { + log::warn!("Binary copy disabled due to error: {}", err); + false + }) +} + +async fn can_use_binary_copy_impl( + dataset: &Dataset, + options: &CompactionOptions, + fragments: &[Fragment], +) -> Result { + use lance_file::reader::FileReader as LFReader; use lance_file::version::LanceFileVersion; + use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; + if !options.enable_binary_copy { - return false; + return Ok(false); } - // not support blob column for now let has_blob_columns = dataset .schema() .fields_pre_order() .any(|field| field.is_blob()); if has_blob_columns { - return false; + return Ok(false); } - // Check dataset storage version is supported - // Binary copy is not supported for legacy Lance file format let storage_ok = dataset .manifest .data_storage_format @@ -238,35 +252,31 @@ fn can_use_binary_copy( .map(|v| !matches!(v.resolve(), LanceFileVersion::Legacy)) .unwrap_or(false); if !storage_ok { - return false; + return Ok(false); } if fragments.is_empty() { - return false; + return Ok(false); } - // Establish version baseline from the dataset manifest - let storage_file_version = match dataset.manifest.data_storage_format.lance_file_version() { - Ok(version) => version.resolve(), - Err(_) => return false, - }; + let storage_file_version = dataset + .manifest + .data_storage_format + .lance_file_version()? 
+ .resolve(); - // Capture schema mapping baseline from first data file if fragments[0].files.is_empty() { - return false; + return Ok(false); } let ref_fields = &fragments[0].files[0].fields; let ref_cols = &fragments[0].files[0].column_indices; - // Single-pass verification across fragments and their files let mut is_same_version = true; for fragment in fragments { - // Reject fragments with deletions (binary copy does not materialize deletions) if fragment.deletion_file.is_some() { - return false; + return Ok(false); } - // Check version and schema mapping equality for each data file for data_file in &fragment.files { let version_ok = LanceFileVersion::try_from_major_minor( data_file.file_major_version, @@ -278,17 +288,42 @@ fn can_use_binary_copy( if !version_ok { is_same_version = false; } - // Schema mapping must match exactly across all files if data_file.fields != *ref_fields || data_file.column_indices != *ref_cols { - return false; + return Ok(false); + } + + // check file global buffer + let object_store = match data_file.base_id { + Some(base_id) => dataset.object_store_for_base(base_id).await?, + None => dataset.object_store.clone(), + }; + let full_path = dataset + .data_file_dir(data_file)? + .child(data_file.path.as_str()); + let scan_scheduler = ScanScheduler::new( + object_store.clone(), + SchedulerConfig::max_bandwidth(&object_store), + ); + let file_scheduler = scan_scheduler + .open_file_with_priority(&full_path, 0, &data_file.file_size_bytes) + .await?; + let file_meta = LFReader::read_all_metadata(&file_scheduler).await?; + // Binary copy only preserves page and column-buffer bytes. The output file's footer + // (including global buffers) is re-generated, not copied from inputs. + // + // Therefore, we reject input files that contain any additional global buffers beyond + // the required schema / file descriptor global buffer (global buffer index 0). 
+ if file_meta.file_buffers.len() > 1 { + return Ok(false); } } } if !is_same_version { - return false; + return Ok(false); } - true + + Ok(true) } /// Metrics returned by [compact_files]. @@ -899,7 +934,7 @@ async fn rewrite_files( num_rows, fragments.len() ); - let can_binary_copy = can_use_binary_copy(dataset.as_ref(), options, &fragments); + let can_binary_copy = can_use_binary_copy(dataset.as_ref(), options, &fragments).await; if !can_binary_copy && options.enable_binary_copy_force { return Err(Error::NotSupported { source: format!("compaction task {}: binary copy is not supported", task_id).into(), diff --git a/rust/lance/src/dataset/optimize/binary_copy.rs b/rust/lance/src/dataset/optimize/binary_copy.rs index b38b3699276..2a51e8aca9b 100644 --- a/rust/lance/src/dataset/optimize/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/binary_copy.rs @@ -98,6 +98,7 @@ fn compute_field_column_indices( /// PAY ATTENTION current function will: /// - Takes (`Option::take`) the current writer and filename. /// - Drains `col_pages` and `col_buffers` for all columns. 
+#[allow(clippy::too_many_arguments)] async fn finalize_current_output_file( schema: &Schema, full_field_ids: &[i32], @@ -111,7 +112,7 @@ async fn finalize_current_output_file( version: LanceFileVersion, ) -> Result { let mut final_cols: Vec> = Vec::with_capacity(current_page_table.len()); - for (i, columnInfo) in current_page_table.iter().enumerate() { + for (i, column_info) in current_page_table.iter().enumerate() { let mut pages_vec = std::mem::take(&mut col_pages[i]); // For v2_0 struct headers, force a single page and set num_rows to total if version == LanceFileVersion::V2_0 @@ -125,10 +126,10 @@ async fn finalize_current_output_file( let pages_arc = Arc::from(pages_vec.into_boxed_slice()); let buffers_vec = std::mem::take(&mut col_buffers[i]); final_cols.push(Arc::new(ColumnInfo::new( - columnInfo.index, + column_info.index, pages_arc, buffers_vec, - columnInfo.encoding.clone(), + column_info.encoding.clone(), ))); } let writer = current_writer.take().unwrap(); diff --git a/rust/lance/src/dataset/optimize/tests/binary_copy.rs b/rust/lance/src/dataset/optimize/tests/binary_copy.rs index b28a2324e9e..749bda685df 100644 --- a/rust/lance/src/dataset/optimize/tests/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/tests/binary_copy.rs @@ -511,7 +511,7 @@ async fn test_binary_copy_fallback_to_common_compaction() { .into_iter() .map(Into::into) .collect(); - assert!(!can_use_binary_copy(&dataset, &options, &frags)); + assert!(!can_use_binary_copy(&dataset, &options, &frags).await); let _metrics = compact_files(&mut dataset, options, None).await.unwrap(); @@ -545,7 +545,7 @@ async fn test_can_use_binary_copy_schema_consistency_ok() { .into_iter() .map(Into::into) .collect(); - assert!(can_use_binary_copy(&dataset, &options, &frags)); + assert!(can_use_binary_copy(&dataset, &options, &frags).await); } #[tokio::test] @@ -579,13 +579,13 @@ async fn test_can_use_binary_copy_schema_mismatch() { df.column_indices.push(-1); } } - assert!(!can_use_binary_copy(&dataset, 
&options, &frags)); + assert!(!can_use_binary_copy(&dataset, &options, &frags).await); // Also introduce a version mismatch and ensure rejection if let Some(df) = frags.get_mut(0).and_then(|f| f.files.get_mut(0)) { df.file_minor_version = if df.file_minor_version == 1 { 2 } else { 1 }; } - assert!(!can_use_binary_copy(&dataset, &options, &frags)); + assert!(!can_use_binary_copy(&dataset, &options, &frags).await); } #[tokio::test] @@ -628,7 +628,7 @@ async fn test_can_use_binary_copy_version_mismatch() { file.file_minor_version = v21_minor; } - assert!(!can_use_binary_copy(&dataset, &options, &frags)); + assert!(!can_use_binary_copy(&dataset, &options, &frags).await); } #[tokio::test] @@ -655,7 +655,7 @@ async fn test_can_use_binary_copy_reject_deletions() { .into_iter() .map(Into::into) .collect(); - assert!(!can_use_binary_copy(&dataset, &options, &frags)); + assert!(!can_use_binary_copy(&dataset, &options, &frags).await); } #[tokio::test] From 7c396ab2240ef39d829e146ef427f37f04f1fc83 Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 29 Jan 2026 10:50:03 +0800 Subject: [PATCH 23/24] code review --- rust/lance/src/dataset/optimize.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index c8e76b0c0c6..321fa4dfa27 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -160,6 +160,12 @@ pub struct CompactionOptions { /// is updated and will be used to perform remapping later. pub defer_index_remap: bool, /// Whether to enable binary copy optimization when eligible. + /// + /// This skips re-encoding the data and can lead to faster compaction + /// times. However, it cannot merge pages together and should not be + /// used when compacting small files together because the pages in the + /// compacted file will be too small and this could lead to poor I/O patterns. + /// /// Defaults to false. 
pub enable_binary_copy: bool, /// Whether to force binary copy optimization. If true, compaction will fail @@ -234,6 +240,7 @@ async fn can_use_binary_copy_impl( use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; if !options.enable_binary_copy { + log::debug!("Binary copy disabled: enable_binary_copy config is false"); return Ok(false); } @@ -242,6 +249,7 @@ async fn can_use_binary_copy_impl( .fields_pre_order() .any(|field| field.is_blob()); if has_blob_columns { + log::debug!("Binary copy disabled: dataset contains blob columns"); return Ok(false); } @@ -252,10 +260,12 @@ async fn can_use_binary_copy_impl( .map(|v| !matches!(v.resolve(), LanceFileVersion::Legacy)) .unwrap_or(false); if !storage_ok { + log::debug!("Binary copy disabled: dataset uses legacy storage format"); return Ok(false); } if fragments.is_empty() { + log::debug!("Binary copy disabled: no fragments to compact"); return Ok(false); } @@ -266,6 +276,10 @@ async fn can_use_binary_copy_impl( .resolve(); if fragments[0].files.is_empty() { + log::debug!( + "Binary copy disabled: fragment {} has no data files", + fragments[0].id + ); return Ok(false); } let ref_fields = &fragments[0].files[0].fields; @@ -274,6 +288,10 @@ async fn can_use_binary_copy_impl( for fragment in fragments { if fragment.deletion_file.is_some() { + log::debug!( + "Binary copy disabled: fragment {} has a deletion file", + fragment.id + ); return Ok(false); } @@ -314,12 +332,17 @@ async fn can_use_binary_copy_impl( // Therefore, we reject input files that contain any additional global buffers beyond // the required schema / file descriptor global buffer (global buffer index 0). 
if file_meta.file_buffers.len() > 1 { + log::debug!( + "Binary copy disabled: data file has extra global buffers (len={})", + file_meta.file_buffers.len() + ); return Ok(false); } } } if !is_same_version { + log::debug!("Binary copy disabled: data files use different file versions"); return Ok(false); } From 65befe647bd6831a4b1fcd4a11dcdcbe07a9367d Mon Sep 17 00:00:00 2001 From: YueZhang Date: Thu, 29 Jan 2026 11:56:02 +0800 Subject: [PATCH 24/24] code review --- rust/lance/src/dataset/optimize/tests/binary_copy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/dataset/optimize/tests/binary_copy.rs b/rust/lance/src/dataset/optimize/tests/binary_copy.rs index 749bda685df..6418b34455f 100644 --- a/rust/lance/src/dataset/optimize/tests/binary_copy.rs +++ b/rust/lance/src/dataset/optimize/tests/binary_copy.rs @@ -693,7 +693,7 @@ async fn do_test_binary_copy_compaction_with_complex_schema(version: LanceFileVe .col("i64", array::step::()) .col("f32", array::rand::()) .col("f64", array::rand::()) - .col("bool", array::rand_boolean()) + .col("bool", array::cycle_bool(vec![false, true])) .col("date32", array::rand_date32()) .col("date64", array::rand_date64()) .col(