From e7fac774b2372b12146de1e1282b1173e7faab7d Mon Sep 17 00:00:00 2001 From: majin1102 Date: Mon, 25 Aug 2025 18:04:25 +0800 Subject: [PATCH 01/11] init --- rust/lance-table/src/format/manifest.rs | 172 +++++++++++++++++++++++- rust/lance/src/dataset.rs | 2 +- 2 files changed, 171 insertions(+), 3 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index bd71eb8c3b7..e18ec964b4b 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::ops::Range; use std::sync::Arc; @@ -437,6 +437,66 @@ impl Manifest { pub fn should_use_legacy_format(&self) -> bool { self.data_storage_format.version == LEGACY_FORMAT_VERSION } + + /// Get the summary information of a manifest. + /// + /// This function calculates various statistics about the manifest, including: + /// - total-records: Total number of records in the dataset + /// - total-file-sizes: Total size of all data files in bytes + /// - total-fragments: Number of fragments in the dataset + /// - total-data-files: Total number of data files across all fragments + /// - total-deletions: Total number of deleted records + /// - total-deletion-files: Number of fragments with deletion files + /// + /// # Returns + /// A BTreeMap containing the calculated metadata + pub fn summary(&self) -> BTreeMap { + let mut metadata = BTreeMap::new(); + + // Calculate total fragments + let total_fragments = self.fragments.len(); + metadata.insert("total-fragments".to_string(), total_fragments.to_string()); + + // Calculate total data files + let total_data_files: usize = self.fragments.iter().map(|f| f.files.len()).sum(); + metadata.insert("total-data-files".to_string(), total_data_files.to_string()); + + // Calculate total records + let total_records: usize = self.fragments.iter().filter_map(|f| f.num_rows()).sum(); + metadata.insert("total-records".to_string(), total_records.to_string()); + + // Calculate total file sizes + let total_file_sizes: u64 = self + .fragments + .iter() + .flat_map(|f| &f.files) + .filter_map(|df| df.file_size_bytes.get()) + .map(|size| size.get()) + .sum(); + metadata.insert("total-file-sizes".to_string(), total_file_sizes.to_string()); + + // Calculate total deletion files + let total_deletion_files = self + .fragments + .iter() + .filter(|f| f.deletion_file.is_some()) + .count(); + metadata.insert( + "total-deletion-files".to_string(), + total_deletion_files.to_string(), + ); + + // Calculate total deletions + let total_deletions: usize = self + .fragments + .iter() + .filter_map(|f| f.deletion_file.as_ref()) + .filter_map(|df| df.num_deleted_rows) + .sum(); + metadata.insert("total-deletions".to_string(), total_deletions.to_string()); + + metadata + } } #[derive(Debug, Clone, PartialEq, DeepSizeOf)] @@ -786,12 +846,13 @@ impl SelfDescribingFileReader for FileReader { #[cfg(test)] mod tests { - use crate::format::DataFile; + use crate::format::{DataFile, DeletionFile, DeletionFileType}; use super::*; use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; use lance_core::datatypes::Field; + use pretty_assertions::assert_eq; #[test] fn test_writer_version() { @@ -943,4 +1004,111 @@ mod tests { manifest.delete_config_keys(&["other-key"]); assert_eq!(manifest.config, config); } + + #[test] + fn test_manifest_summary() { + // Step 1: test empty manifest summary + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("id", arrow_schema::DataType::Int64, false), + ArrowField::new("name", arrow_schema::DataType::Utf8, true), + ]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + let empty_manifest = Manifest::new( + schema.clone(), + Arc::new(vec![]), + DataStorageFormat::default(), + None, + HashMap::new(), + ); + + let empty_summary = empty_manifest.summary(); + assert_eq!(empty_summary.get("total-records").unwrap(), "0"); + assert_eq!(empty_summary.get("total-file-sizes").unwrap(), "0"); + assert_eq!(empty_summary.get("total-fragments").unwrap(), "0"); + assert_eq!(empty_summary.get("total-data-files").unwrap(), "0"); + assert_eq!(empty_summary.get("total-deletions").unwrap(), "0"); + assert_eq!(empty_summary.get("total-deletion-files").unwrap(), "0"); + assert_eq!(empty_summary.len(), 6); + + // Step 2: write empty files and verify summary + let empty_fragments = vec![ + Fragment::with_file_legacy(0, "empty_file1.lance", &schema, Some(0)), + Fragment::with_file_legacy(1, "empty_file2.lance", &schema, Some(0)), + ]; + + let empty_files_manifest = Manifest::new( + schema.clone(), + Arc::new(empty_fragments), + DataStorageFormat::default(), + None, + HashMap::new(), + ); + + let empty_files_summary = empty_files_manifest.summary(); + assert_eq!(empty_files_summary.get("total-records").unwrap(), "0"); + assert_eq!(empty_files_summary.get("total-file-sizes").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(empty_files_summary.get("total-fragments").unwrap(), "2"); + assert_eq!(empty_files_summary.get("total-data-files").unwrap(), "2"); + assert_eq!(empty_files_summary.get("total-deletions").unwrap(), "0"); + assert_eq!( + empty_files_summary.get("total-deletion-files").unwrap(), + "0" + ); + assert_eq!(empty_files_summary.len(), 6); + + // Step 3: write real data and verify summary + let real_fragments = vec![ + Fragment::with_file_legacy(0, "data_file1.lance", &schema, Some(100)), + Fragment::with_file_legacy(1, "data_file2.lance", &schema, Some(250)), + Fragment::with_file_legacy(2, "data_file3.lance", &schema, Some(75)), + ]; + + let real_data_manifest = Manifest::new( + schema.clone(), + Arc::new(real_fragments), + DataStorageFormat::default(), + None, + HashMap::new(), + ); + + let real_data_summary = real_data_manifest.summary(); + assert_eq!(real_data_summary.get("total-records").unwrap(), "425"); // 100 + 250 + 75 + assert_eq!(real_data_summary.get("total-file-sizes").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(real_data_summary.get("total-fragments").unwrap(), "3"); + assert_eq!(real_data_summary.get("total-data-files").unwrap(), "3"); + assert_eq!(real_data_summary.get("total-deletions").unwrap(), "0"); + assert_eq!(real_data_summary.get("total-deletion-files").unwrap(), "0"); + assert_eq!(real_data_summary.len(), 6); + + // Step 4: write deletion files and verify summary + let mut fragment_with_deletion = + Fragment::with_file_legacy(0, "data_with_deletion.lance", &schema, Some(50)); + fragment_with_deletion.deletion_file = Some(DeletionFile { + read_version: 123, + id: 456, + file_type: DeletionFileType::Array, + num_deleted_rows: Some(10), + base_id: None, + }); + + let manifest_with_deletion = Manifest::new( + schema, + Arc::new(vec![fragment_with_deletion]), + DataStorageFormat::default(), + None, + HashMap::new(), + ); + + let deletion_summary = manifest_with_deletion.summary(); + assert_eq!(deletion_summary.get("total-records").unwrap(), "40"); // 50 - 10 (删除的行数) + assert_eq!(deletion_summary.get("total-file-sizes").unwrap(), "0"); + assert_eq!(deletion_summary.get("total-fragments").unwrap(), "1"); + assert_eq!(deletion_summary.get("total-data-files").unwrap(), "1"); + assert_eq!(deletion_summary.get("total-deletions").unwrap(), "10"); + assert_eq!(deletion_summary.get("total-deletion-files").unwrap(), "1"); + + // 验证 BTreeMap 只包含这6个字段 + assert_eq!(deletion_summary.len(), 6); + } } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index c3e157b8e01..3747214276a 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -186,7 +186,7 @@ impl From<&Manifest> for Version { Self { version: m.version, timestamp: m.timestamp(), - metadata: BTreeMap::default(), + metadata: m.summary(), } } } From 2e3322cb931761f89856dfc5467283eb094e101a Mon Sep 17 00:00:00 2001 From: majin1102 Date: Wed, 17 Sep 2025 18:16:16 +0800 Subject: [PATCH 02/11] optimize summary keys --- rust/lance-table/src/format/manifest.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index e18ec964b4b..24df15e741d 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -442,14 +442,11 @@ impl Manifest { /// /// This function calculates various statistics about the manifest, including: /// - total-records: Total number of records in the dataset - /// - total-file-sizes: Total size of all data files in bytes + /// - total-files-size: Total size of all data files in bytes /// - total-fragments: Number of fragments in the dataset /// - total-data-files: Total number of data files across all fragments /// - total-deletions: Total number of deleted records /// - total-deletion-files: Number of fragments with deletion files - /// - /// # Returns - /// A BTreeMap containing the calculated metadata pub fn summary(&self) -> BTreeMap { let mut metadata = BTreeMap::new(); @@ -473,7 +470,7 @@ impl Manifest { .filter_map(|df| df.file_size_bytes.get()) .map(|size| size.get()) .sum(); - metadata.insert("total-file-sizes".to_string(), total_file_sizes.to_string()); + metadata.insert("total-files-size".to_string(), total_file_sizes.to_string()); // Calculate total deletion files let total_deletion_files = self @@ -1024,7 +1021,7 @@ mod tests { let empty_summary = empty_manifest.summary(); assert_eq!(empty_summary.get("total-records").unwrap(), "0"); - assert_eq!(empty_summary.get("total-file-sizes").unwrap(), "0"); + assert_eq!(empty_summary.get("total-files-size").unwrap(), "0"); assert_eq!(empty_summary.get("total-fragments").unwrap(), "0"); assert_eq!(empty_summary.get("total-data-files").unwrap(), "0"); assert_eq!(empty_summary.get("total-deletions").unwrap(), "0"); @@ -1047,7 +1044,7 @@ mod tests { let empty_files_summary = empty_files_manifest.summary(); assert_eq!(empty_files_summary.get("total-records").unwrap(), "0"); - assert_eq!(empty_files_summary.get("total-file-sizes").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(empty_files_summary.get("total-files-size").unwrap(), "0"); // 文件大小未知时为0 assert_eq!(empty_files_summary.get("total-fragments").unwrap(), "2"); assert_eq!(empty_files_summary.get("total-data-files").unwrap(), "2"); assert_eq!(empty_files_summary.get("total-deletions").unwrap(), "0"); @@ -1074,7 +1071,7 @@ mod tests { let real_data_summary = real_data_manifest.summary(); assert_eq!(real_data_summary.get("total-records").unwrap(), "425"); // 100 + 250 + 75 - assert_eq!(real_data_summary.get("total-file-sizes").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(real_data_summary.get("total-files-size").unwrap(), "0"); // 文件大小未知时为0 assert_eq!(real_data_summary.get("total-fragments").unwrap(), "3"); assert_eq!(real_data_summary.get("total-data-files").unwrap(), "3"); assert_eq!(real_data_summary.get("total-deletions").unwrap(), "0"); @@ -1102,7 +1099,7 @@ mod tests { let deletion_summary = manifest_with_deletion.summary(); assert_eq!(deletion_summary.get("total-records").unwrap(), "40"); // 50 - 10 (删除的行数) - assert_eq!(deletion_summary.get("total-file-sizes").unwrap(), "0"); + assert_eq!(deletion_summary.get("total-files-size").unwrap(), "0"); assert_eq!(deletion_summary.get("total-fragments").unwrap(), "1"); assert_eq!(deletion_summary.get("total-data-files").unwrap(), "1"); assert_eq!(deletion_summary.get("total-deletions").unwrap(), "10"); From 39d0f184bde2d987c5f0b7f4d0291ae858b809ca Mon Sep 17 00:00:00 2001 From: majin1102 Date: Wed, 17 Sep 2025 22:28:05 +0800 Subject: [PATCH 03/11] optimize iteration --- rust/lance-table/src/format/manifest.rs | 105 +++++++++++++++--------- 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 24df15e741d..1d27dc8c457 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -117,6 +117,15 @@ fn compute_fragment_offsets(fragments: &[Fragment]) -> Vec { .collect() } +#[derive(Default)] +struct ManifestStats { + total_data_files: usize, + total_records: usize, + total_file_sizes: u64, + total_deletion_files: usize, + total_deletions: usize, +} + impl Manifest { pub fn new( schema: Schema, @@ -454,43 +463,53 @@ impl Manifest { let total_fragments = self.fragments.len(); metadata.insert("total-fragments".to_string(), total_fragments.to_string()); - // Calculate total data files - let total_data_files: usize = self.fragments.iter().map(|f| f.files.len()).sum(); - metadata.insert("total-data-files".to_string(), total_data_files.to_string()); - - // Calculate total records - let total_records: usize = self.fragments.iter().filter_map(|f| f.num_rows()).sum(); - metadata.insert("total-records".to_string(), total_records.to_string()); - - // Calculate total file sizes - let total_file_sizes: u64 = self - .fragments - .iter() - .flat_map(|f| &f.files) - .filter_map(|df| df.file_size_bytes.get()) - .map(|size| size.get()) - .sum(); - metadata.insert("total-files-size".to_string(), total_file_sizes.to_string()); - - // Calculate total deletion files - let total_deletion_files = self + let stats = self .fragments .iter() - .filter(|f| f.deletion_file.is_some()) - .count(); + .fold(ManifestStats::default(), |mut stats, f| { + // Count data files in the current fragment + stats.total_data_files += f.files.len(); + // Sum the number of rows for the current fragment (if available) + if let Some(num_rows) = f.num_rows() { + stats.total_records += num_rows; + } + // Sum file sizes for all data files in the current fragment (if available) + for data_file in &f.files { + if let Some(size_bytes) = data_file.file_size_bytes.get() { + stats.total_file_sizes += size_bytes.get(); + } + } + // Check and count if the current fragment has a deletion file + if f.deletion_file.is_some() { + stats.total_deletion_files += 1; + } + // Sum the number of deleted rows from the deletion file (if available) + if let Some(deletion_file) = &f.deletion_file { + if let Some(num_deleted) = deletion_file.num_deleted_rows { + stats.total_deletions += num_deleted; + } + } + stats + }); + + // Insert all statistical results into metadata + metadata.insert( + "total-data-files".to_string(), + stats.total_data_files.to_string(), + ); + metadata.insert("total-records".to_string(), stats.total_records.to_string()); + metadata.insert( + "total-files-size".to_string(), + stats.total_file_sizes.to_string(), + ); metadata.insert( "total-deletion-files".to_string(), - total_deletion_files.to_string(), + stats.total_deletion_files.to_string(), + ); + metadata.insert( + "total-deletions".to_string(), + stats.total_deletions.to_string(), ); - - // Calculate total deletions - let total_deletions: usize = self - .fragments - .iter() - .filter_map(|f| f.deletion_file.as_ref()) - .filter_map(|df| df.num_deleted_rows) - .sum(); - metadata.insert("total-deletions".to_string(), total_deletions.to_string()); metadata } @@ -844,6 +863,7 @@ impl SelfDescribingFileReader for FileReader { #[cfg(test)] mod tests { use crate::format::{DataFile, DeletionFile, DeletionFileType}; + use std::num::NonZero; use super::*; @@ -1044,7 +1064,7 @@ mod tests { let empty_files_summary = empty_files_manifest.summary(); assert_eq!(empty_files_summary.get("total-records").unwrap(), "0"); - assert_eq!(empty_files_summary.get("total-files-size").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(empty_files_summary.get("total-files-size").unwrap(), "0"); assert_eq!(empty_files_summary.get("total-fragments").unwrap(), "2"); assert_eq!(empty_files_summary.get("total-data-files").unwrap(), "2"); assert_eq!(empty_files_summary.get("total-deletions").unwrap(), "0"); @@ -1071,16 +1091,24 @@ mod tests { let real_data_summary = real_data_manifest.summary(); assert_eq!(real_data_summary.get("total-records").unwrap(), "425"); // 100 + 250 + 75 - assert_eq!(real_data_summary.get("total-files-size").unwrap(), "0"); // 文件大小未知时为0 + assert_eq!(real_data_summary.get("total-files-size").unwrap(), "0"); // Zero for unknown assert_eq!(real_data_summary.get("total-fragments").unwrap(), "3"); assert_eq!(real_data_summary.get("total-data-files").unwrap(), "3"); assert_eq!(real_data_summary.get("total-deletions").unwrap(), "0"); assert_eq!(real_data_summary.get("total-deletion-files").unwrap(), "0"); assert_eq!(real_data_summary.len(), 6); + let file_version = LanceFileVersion::default(); // Step 4: write deletion files and verify summary - let mut fragment_with_deletion = - Fragment::with_file_legacy(0, "data_with_deletion.lance", &schema, Some(50)); + let mut fragment_with_deletion = Fragment::new(0) + .with_file( + "data_with_deletion.lance", + vec![0, 1], + vec![0, 1], + &file_version, + NonZero::new(1000), + ) + .with_physical_rows(50); fragment_with_deletion.deletion_file = Some(DeletionFile { read_version: 123, id: 456, @@ -1098,14 +1126,13 @@ mod tests { ); let deletion_summary = manifest_with_deletion.summary(); - assert_eq!(deletion_summary.get("total-records").unwrap(), "40"); // 50 - 10 (删除的行数) - assert_eq!(deletion_summary.get("total-files-size").unwrap(), "0"); + assert_eq!(deletion_summary.get("total-records").unwrap(), "40"); // 50 - 10 + assert_eq!(deletion_summary.get("total-files-size").unwrap(), "1000"); assert_eq!(deletion_summary.get("total-fragments").unwrap(), "1"); assert_eq!(deletion_summary.get("total-data-files").unwrap(), "1"); assert_eq!(deletion_summary.get("total-deletions").unwrap(), "10"); assert_eq!(deletion_summary.get("total-deletion-files").unwrap(), "1"); - // 验证 BTreeMap 只包含这6个字段 assert_eq!(deletion_summary.len(), 6); } } From 77899727a669d58eec5cd5408da8de65a7fd9daa Mon Sep 17 00:00:00 2001 From: majin1102 Date: Wed, 17 Sep 2025 23:58:43 +0800 Subject: [PATCH 04/11] expose ManifestStats instead --- rust/lance-table/src/format/manifest.rs | 138 ++++++++++++------------ rust/lance/src/dataset.rs | 2 +- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 1d27dc8c457..8a920c644a1 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -118,12 +118,43 @@ fn compute_fragment_offsets(fragments: &[Fragment]) -> Vec { } #[derive(Default)] -struct ManifestStats { - total_data_files: usize, - total_records: usize, - total_file_sizes: u64, - total_deletion_files: usize, - total_deletions: usize, +pub struct ManifestStats { + total_fragments: u64, + total_data_files: u64, + total_records: u64, + total_files_size: u64, + total_deletion_files: u64, + total_deletions: u64, +} + +impl Into> for ManifestStats { + fn into(self) -> BTreeMap { + let mut stats_map = BTreeMap::new(); + + stats_map.insert( + "total-fragments".to_string(), + self.total_fragments.to_string(), + ); + stats_map.insert( + "total-data-files".to_string(), + self.total_data_files.to_string(), + ); + stats_map.insert("total-records".to_string(), self.total_records.to_string()); + stats_map.insert( + "total-files-size".to_string(), + self.total_files_size.to_string(), + ); + stats_map.insert( + "total-deletion-files".to_string(), + self.total_deletion_files.to_string(), + ); + stats_map.insert( + "total-deletions".to_string(), + self.total_deletions.to_string(), + ); + + stats_map + } } impl Manifest { @@ -456,27 +487,22 @@ impl Manifest { /// - total-data-files: Total number of data files across all fragments /// - total-deletions: Total number of deleted records /// - total-deletion-files: Number of fragments with deletion files - pub fn summary(&self) -> BTreeMap { - let mut metadata = BTreeMap::new(); - + pub fn summary(&self) -> ManifestStats { // Calculate total fragments - let total_fragments = self.fragments.len(); - metadata.insert("total-fragments".to_string(), total_fragments.to_string()); - - let stats = self + let mut stats = self .fragments .iter() .fold(ManifestStats::default(), |mut stats, f| { // Count data files in the current fragment - stats.total_data_files += f.files.len(); + stats.total_data_files += f.files.len() as u64; // Sum the number of rows for the current fragment (if available) if let Some(num_rows) = f.num_rows() { - stats.total_records += num_rows; + stats.total_records += num_rows as u64; } // Sum file sizes for all data files in the current fragment (if available) for data_file in &f.files { if let Some(size_bytes) = data_file.file_size_bytes.get() { - stats.total_file_sizes += size_bytes.get(); + stats.total_files_size += size_bytes.get(); } } // Check and count if the current fragment has a deletion file @@ -486,32 +512,14 @@ impl Manifest { // Sum the number of deleted rows from the deletion file (if available) if let Some(deletion_file) = &f.deletion_file { if let Some(num_deleted) = deletion_file.num_deleted_rows { - stats.total_deletions += num_deleted; + stats.total_deletions += num_deleted as u64; } } stats }); + stats.total_fragments = self.fragments.len() as u64; - // Insert all statistical results into metadata - metadata.insert( - "total-data-files".to_string(), - stats.total_data_files.to_string(), - ); - metadata.insert("total-records".to_string(), stats.total_records.to_string()); - metadata.insert( - "total-files-size".to_string(), - stats.total_file_sizes.to_string(), - ); - metadata.insert( - "total-deletion-files".to_string(), - stats.total_deletion_files.to_string(), - ); - metadata.insert( - "total-deletions".to_string(), - stats.total_deletions.to_string(), - ); - - metadata + stats } } @@ -1040,13 +1048,12 @@ mod tests { ); let empty_summary = empty_manifest.summary(); - assert_eq!(empty_summary.get("total-records").unwrap(), "0"); - assert_eq!(empty_summary.get("total-files-size").unwrap(), "0"); - assert_eq!(empty_summary.get("total-fragments").unwrap(), "0"); - assert_eq!(empty_summary.get("total-data-files").unwrap(), "0"); - assert_eq!(empty_summary.get("total-deletions").unwrap(), "0"); - assert_eq!(empty_summary.get("total-deletion-files").unwrap(), "0"); - assert_eq!(empty_summary.len(), 6); + assert_eq!(empty_summary.total_records, 0); + assert_eq!(empty_summary.total_files_size, 0); + assert_eq!(empty_summary.total_fragments, 0); + assert_eq!(empty_summary.total_data_files, 0); + assert_eq!(empty_summary.total_deletions, 0); + assert_eq!(empty_summary.total_deletion_files, 0); // Step 2: write empty files and verify summary let empty_fragments = vec![ @@ -1063,16 +1070,12 @@ mod tests { ); let empty_files_summary = empty_files_manifest.summary(); - assert_eq!(empty_files_summary.get("total-records").unwrap(), "0"); - assert_eq!(empty_files_summary.get("total-files-size").unwrap(), "0"); - assert_eq!(empty_files_summary.get("total-fragments").unwrap(), "2"); - assert_eq!(empty_files_summary.get("total-data-files").unwrap(), "2"); - assert_eq!(empty_files_summary.get("total-deletions").unwrap(), "0"); - assert_eq!( - empty_files_summary.get("total-deletion-files").unwrap(), - "0" - ); - assert_eq!(empty_files_summary.len(), 6); + assert_eq!(empty_files_summary.total_records, 0); + assert_eq!(empty_files_summary.total_files_size, 0); + assert_eq!(empty_files_summary.total_fragments, 2); + assert_eq!(empty_files_summary.total_data_files, 2); + assert_eq!(empty_files_summary.total_deletions, 0); + assert_eq!(empty_files_summary.total_deletion_files, 0); // Step 3: write real data and verify summary let real_fragments = vec![ @@ -1090,13 +1093,12 @@ mod tests { ); let real_data_summary = real_data_manifest.summary(); - assert_eq!(real_data_summary.get("total-records").unwrap(), "425"); // 100 + 250 + 75 - assert_eq!(real_data_summary.get("total-files-size").unwrap(), "0"); // Zero for unknown - assert_eq!(real_data_summary.get("total-fragments").unwrap(), "3"); - assert_eq!(real_data_summary.get("total-data-files").unwrap(), "3"); - assert_eq!(real_data_summary.get("total-deletions").unwrap(), "0"); - assert_eq!(real_data_summary.get("total-deletion-files").unwrap(), "0"); - assert_eq!(real_data_summary.len(), 6); + assert_eq!(real_data_summary.total_records, 425); // 100 + 250 + 75 + assert_eq!(real_data_summary.total_files_size, 0); // Zero for unknown + assert_eq!(real_data_summary.total_fragments, 3); + assert_eq!(real_data_summary.total_data_files, 3); + assert_eq!(real_data_summary.total_deletions, 0); + assert_eq!(real_data_summary.total_deletion_files, 0); let file_version = LanceFileVersion::default(); // Step 4: write deletion files and verify summary @@ -1126,13 +1128,11 @@ mod tests { ); let deletion_summary = manifest_with_deletion.summary(); - assert_eq!(deletion_summary.get("total-records").unwrap(), "40"); // 50 - 10 - assert_eq!(deletion_summary.get("total-files-size").unwrap(), "1000"); - assert_eq!(deletion_summary.get("total-fragments").unwrap(), "1"); - assert_eq!(deletion_summary.get("total-data-files").unwrap(), "1"); - assert_eq!(deletion_summary.get("total-deletions").unwrap(), "10"); - assert_eq!(deletion_summary.get("total-deletion-files").unwrap(), "1"); - - assert_eq!(deletion_summary.len(), 6); + assert_eq!(deletion_summary.total_records, 40); // 50 - 10 + assert_eq!(deletion_summary.total_files_size, 1000); + assert_eq!(deletion_summary.total_fragments, 1); + assert_eq!(deletion_summary.total_data_files, 1); + assert_eq!(deletion_summary.total_deletions, 10); + assert_eq!(deletion_summary.total_deletion_files, 1); } } diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 3747214276a..ae0f55ae206 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -186,7 +186,7 @@ impl From<&Manifest> for Version { Self { version: m.version, timestamp: m.timestamp(), - metadata: m.summary(), + metadata: m.summary().into(), } } } From 0da0e510c5125a2d0e327c6a09db0ff04100ff8a Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 00:03:05 +0800 Subject: [PATCH 05/11] remove unnecessary use --- rust/lance-table/src/format/manifest.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 8a920c644a1..cc6a4d88c50 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -877,7 +877,6 @@ mod tests { use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; use lance_core::datatypes::Field; - use pretty_assertions::assert_eq; #[test] fn test_writer_version() { From 6db7dbfd5591120487acc0984111b2d93ab0c764 Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 01:05:36 +0800 Subject: [PATCH 06/11] fix clippy --- rust/lance-table/src/format/manifest.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index cc6a4d88c50..27454c9f816 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -127,32 +127,30 @@ pub struct ManifestStats { total_deletions: u64, } -impl Into> for ManifestStats { - fn into(self) -> BTreeMap { - let mut stats_map = BTreeMap::new(); - +impl From for BTreeMap { + fn from(val: ManifestStats) -> Self { + let mut stats_map = Self::new(); stats_map.insert( "total-fragments".to_string(), - self.total_fragments.to_string(), + val.total_fragments.to_string(), ); stats_map.insert( "total-data-files".to_string(), - self.total_data_files.to_string(), + val.total_data_files.to_string(), ); - stats_map.insert("total-records".to_string(), self.total_records.to_string()); + stats_map.insert("total-records".to_string(), val.total_records.to_string()); stats_map.insert( "total-files-size".to_string(), - self.total_files_size.to_string(), + val.total_files_size.to_string(), ); stats_map.insert( "total-deletion-files".to_string(), - self.total_deletion_files.to_string(), + val.total_deletion_files.to_string(), ); stats_map.insert( "total-deletions".to_string(), - self.total_deletions.to_string(), + val.total_deletions.to_string(), ); - stats_map } } @@ -1133,5 +1131,9 @@ mod tests { assert_eq!(deletion_summary.total_data_files, 1); assert_eq!(deletion_summary.total_deletions, 10); assert_eq!(deletion_summary.total_deletion_files, 1); + + //Just verify the transformation is OK + let stats_map: BTreeMap = deletion_summary.into(); + assert_eq!(stats_map.len(), 6) } } From 00df9892f1107140d1dca68d00a1048675084943 Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 11:02:33 +0800 Subject: [PATCH 07/11] use snake_case for stats map --- rust/lance-table/src/format/manifest.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 27454c9f816..48e1d120aa9 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -131,24 +131,24 @@ impl From for BTreeMap { fn from(val: ManifestStats) -> Self { let mut stats_map = Self::new(); stats_map.insert( - "total-fragments".to_string(), + "total_fragments".to_string(), val.total_fragments.to_string(), ); stats_map.insert( - "total-data-files".to_string(), + "total_data_files".to_string(), val.total_data_files.to_string(), ); - stats_map.insert("total-records".to_string(), val.total_records.to_string()); + stats_map.insert("total_records".to_string(), val.total_records.to_string()); stats_map.insert( - "total-files-size".to_string(), + "total_files_size".to_string(), val.total_files_size.to_string(), ); stats_map.insert( - "total-deletion-files".to_string(), + "total_deletion_files".to_string(), val.total_deletion_files.to_string(), ); stats_map.insert( - "total-deletions".to_string(), + "total_deletions".to_string(), val.total_deletions.to_string(), ); stats_map From 8e95500e0bd221d7db643fcda122c940b369cce8 Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 13:05:34 +0800 Subject: [PATCH 08/11] modify ManifestStats to ManifestSummary to match summary() func --- rust/lance-table/src/format/manifest.rs | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 48e1d120aa9..cb1996bcfc7 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -118,7 +118,7 @@ fn compute_fragment_offsets(fragments: &[Fragment]) -> Vec { } #[derive(Default)] -pub struct ManifestStats { +pub struct ManifestSummary { total_fragments: u64, total_data_files: u64, total_records: u64, @@ -127,8 +127,8 @@ pub struct ManifestStats { total_deletions: u64, } -impl From for BTreeMap { - fn from(val: ManifestStats) -> Self { +impl From for BTreeMap { + fn from(val: ManifestSummary) -> Self { let mut stats_map = Self::new(); stats_map.insert( "total_fragments".to_string(), @@ -485,39 +485,39 @@ impl Manifest { /// - total-data-files: Total number of data files across all fragments /// - total-deletions: Total number of deleted records /// - total-deletion-files: Number of fragments with deletion files - pub fn summary(&self) -> ManifestStats { + pub fn summary(&self) -> ManifestSummary { // Calculate total fragments - let mut stats = self + let mut summary = self .fragments .iter() - .fold(ManifestStats::default(), |mut stats, f| { + .fold(ManifestSummary::default(), |mut summary, f| { // Count data files in the current fragment - stats.total_data_files += f.files.len() as u64; + summary.total_data_files += f.files.len() as u64; // Sum the number of rows for the current fragment (if available) if let Some(num_rows) = f.num_rows() { - stats.total_records += num_rows as u64; + summary.total_records += num_rows as u64; } // Sum file sizes for all data files in the current fragment (if available) for data_file in &f.files { if let Some(size_bytes) = data_file.file_size_bytes.get() { - stats.total_files_size += size_bytes.get(); + summary.total_files_size += size_bytes.get(); } } // Check and count if the current fragment has a deletion file if f.deletion_file.is_some() { - stats.total_deletion_files += 1; + summary.total_deletion_files += 1; } // Sum the number of deleted rows from the deletion file (if available) if let Some(deletion_file) = &f.deletion_file { if let Some(num_deleted) = deletion_file.num_deleted_rows { - stats.total_deletions += num_deleted as u64; + summary.total_deletions += num_deleted as u64; } } - stats + summary }); - stats.total_fragments = self.fragments.len() as u64; + summary.total_fragments = self.fragments.len() as u64; - stats + summary } } From 7e72d79ff6acaab036e0a75f64b9ca9ad62505ce Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 16:52:06 +0800 Subject: [PATCH 09/11] optimize stats --- rust/lance-table/src/format/manifest.rs | 100 +++++++++++++----------- 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index cb1996bcfc7..831fa9ab2da 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -121,36 +121,41 @@ fn compute_fragment_offsets(fragments: &[Fragment]) -> Vec { pub struct ManifestSummary { total_fragments: u64, total_data_files: u64, - total_records: u64, total_files_size: u64, total_deletion_files: u64, - total_deletions: u64, + total_data_file_rows: u64, + total_deletion_file_rows: u64, + total_rows: u64, } impl From for BTreeMap { - fn from(val: ManifestSummary) -> Self { + fn from(summary: ManifestSummary) -> Self { let mut stats_map = Self::new(); stats_map.insert( "total_fragments".to_string(), - val.total_fragments.to_string(), + summary.total_fragments.to_string(), ); stats_map.insert( "total_data_files".to_string(), - val.total_data_files.to_string(), + summary.total_data_files.to_string(), ); - stats_map.insert("total_records".to_string(), val.total_records.to_string()); stats_map.insert( "total_files_size".to_string(), - val.total_files_size.to_string(), + summary.total_files_size.to_string(), ); stats_map.insert( "total_deletion_files".to_string(), - val.total_deletion_files.to_string(), + summary.total_deletion_files.to_string(), ); stats_map.insert( - "total_deletions".to_string(), - val.total_deletions.to_string(), + "total_data_file_rows".to_string(), + summary.total_data_file_rows.to_string(), ); + stats_map.insert( + "total_deletion_file_rows".to_string(), + summary.total_deletion_file_rows.to_string(), + ); + stats_map.insert("total_rows".to_string(), summary.total_rows.to_string()); stats_map } } @@ -487,35 +492,36 @@ impl Manifest { /// - total-deletion-files: Number of fragments with deletion files pub fn summary(&self) -> ManifestSummary { // Calculate total fragments - let mut summary = self - .fragments - .iter() - .fold(ManifestSummary::default(), |mut summary, f| { - // Count data files in the current fragment - summary.total_data_files += f.files.len() as u64; - // Sum the number of rows for the current fragment (if available) - if let Some(num_rows) = f.num_rows() { - summary.total_records += num_rows as u64; - } - // Sum file sizes for all data files in the current fragment (if available) - for data_file in &f.files { - if let Some(size_bytes) = data_file.file_size_bytes.get() { - summary.total_files_size += size_bytes.get(); + let mut summary = + self.fragments + .iter() + .fold(ManifestSummary::default(), |mut summary, f| { + // Count data files in the current fragment + summary.total_data_files += f.files.len() as u64; + // Sum the number of rows for the current fragment (if available) + if let Some(num_rows) = f.num_rows() { + summary.total_rows += num_rows as u64; } - } - // Check and count if the current fragment has a deletion file - if f.deletion_file.is_some() { - summary.total_deletion_files += 1; - } - // Sum the number of deleted rows from the deletion file (if available) - if let Some(deletion_file) = &f.deletion_file { - if let Some(num_deleted) = deletion_file.num_deleted_rows { - summary.total_deletions += num_deleted as u64; + // Sum file sizes for all data files in the current fragment (if available) + for data_file in &f.files { + if let Some(size_bytes) = data_file.file_size_bytes.get() { + summary.total_files_size += size_bytes.get(); + } } - } - summary - }); + // Check and count if the current fragment has a deletion file + if f.deletion_file.is_some() { + summary.total_deletion_files += 1; + } + // Sum the number of deleted rows from the deletion file (if available) + if let Some(deletion_file) = &f.deletion_file { + if let Some(num_deleted) = deletion_file.num_deleted_rows { + summary.total_deletion_file_rows += num_deleted as u64; + } + } + summary + }); summary.total_fragments = self.fragments.len() as u64; + summary.total_data_file_rows = summary.total_rows + summary.total_deletion_file_rows; summary } @@ -1045,11 +1051,12 @@ mod tests { ); let empty_summary = empty_manifest.summary(); - assert_eq!(empty_summary.total_records, 0); + assert_eq!(empty_summary.total_rows, 0); assert_eq!(empty_summary.total_files_size, 0); assert_eq!(empty_summary.total_fragments, 0); assert_eq!(empty_summary.total_data_files, 0); - assert_eq!(empty_summary.total_deletions, 0); + assert_eq!(empty_summary.total_deletion_file_rows, 0); + assert_eq!(empty_summary.total_data_file_rows, 0); assert_eq!(empty_summary.total_deletion_files, 0); // Step 2: write empty files and verify summary @@ -1067,11 +1074,12 @@ mod tests { ); let empty_files_summary = empty_files_manifest.summary(); - assert_eq!(empty_files_summary.total_records, 0); + assert_eq!(empty_files_summary.total_rows, 0); assert_eq!(empty_files_summary.total_files_size, 0); assert_eq!(empty_files_summary.total_fragments, 2); assert_eq!(empty_files_summary.total_data_files, 2); - assert_eq!(empty_files_summary.total_deletions, 0); + assert_eq!(empty_files_summary.total_deletion_file_rows, 0); + assert_eq!(empty_files_summary.total_data_file_rows, 0); assert_eq!(empty_files_summary.total_deletion_files, 0); // Step 3: write real data and verify summary @@ -1090,11 +1098,12 @@ mod tests { ); let real_data_summary = real_data_manifest.summary(); - assert_eq!(real_data_summary.total_records, 425); // 100 + 250 + 75 + assert_eq!(real_data_summary.total_rows, 425); // 100 + 250 + 75 assert_eq!(real_data_summary.total_files_size, 0); // Zero for unknown assert_eq!(real_data_summary.total_fragments, 3); assert_eq!(real_data_summary.total_data_files, 3); - assert_eq!(real_data_summary.total_deletions, 0); + assert_eq!(real_data_summary.total_deletion_file_rows, 0); + assert_eq!(real_data_summary.total_data_file_rows, 425); assert_eq!(real_data_summary.total_deletion_files, 0); let file_version = LanceFileVersion::default(); @@ -1125,15 +1134,16 @@ mod tests { ); let deletion_summary = manifest_with_deletion.summary(); - assert_eq!(deletion_summary.total_records, 40); // 50 - 10 + assert_eq!(deletion_summary.total_rows, 40); // 50 - 10 assert_eq!(deletion_summary.total_files_size, 1000); assert_eq!(deletion_summary.total_fragments, 1); assert_eq!(deletion_summary.total_data_files, 1); - assert_eq!(deletion_summary.total_deletions, 10); + assert_eq!(deletion_summary.total_deletion_file_rows, 10); + assert_eq!(deletion_summary.total_data_file_rows, 50); assert_eq!(deletion_summary.total_deletion_files, 1); //Just verify the transformation is OK let stats_map: BTreeMap = deletion_summary.into(); - assert_eq!(stats_map.len(), 6) + assert_eq!(stats_map.len(), 7) } } From 7a5fc659f2a3c9355d46250744fc76eb36bd89c1 Mon Sep 17 00:00:00 2001 From: majin1102 Date: Thu, 18 Sep 2025 22:38:08 +0800 Subject: [PATCH 10/11] make summary fields pub --- rust/lance-table/src/format/manifest.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 831fa9ab2da..9f0dba796f9 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -119,13 +119,13 @@ fn compute_fragment_offsets(fragments: &[Fragment]) -> Vec { #[derive(Default)] pub struct ManifestSummary { - total_fragments: u64, - total_data_files: u64, - total_files_size: u64, - total_deletion_files: u64, - total_data_file_rows: u64, - total_deletion_file_rows: u64, - total_rows: u64, + pub total_fragments: u64, + pub total_data_files: u64, + pub total_files_size: u64, + pub total_deletion_files: u64, + pub total_data_file_rows: u64, + pub total_deletion_file_rows: u64, + pub total_rows: u64, } impl From for BTreeMap { From be4d34c89762fb9fe1d4823aa0d09d2fd32d28ba Mon Sep 17 00:00:00 2001 From: majin1102 Date: Sat, 20 Sep 2025 00:48:03 +0800 Subject: [PATCH 11/11] fix comment --- rust/lance-table/src/format/manifest.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 9f0dba796f9..83e6f3be929 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -484,12 +484,13 @@ impl Manifest { /// Get the summary information of a manifest. /// /// This function calculates various statistics about the manifest, including: - /// - total-records: Total number of records in the dataset - /// - total-files-size: Total size of all data files in bytes - /// - total-fragments: Number of fragments in the dataset - /// - total-data-files: Total number of data files across all fragments - /// - total-deletions: Total number of deleted records - /// - total-deletion-files: Number of fragments with deletion files + /// - total_files_size: Total size of all data files in bytes + /// - total_fragments: Total number of fragments in the dataset + /// - total_data_files: Total number of data files across all fragments + /// - total_deletion_files: Total number of deletion files + /// - total_data_file_rows: Total number of rows in data files + /// - total_deletion_file_rows: Total number of deleted rows in deletion files + /// - total_rows: Total number of rows in the dataset pub fn summary(&self) -> ManifestSummary { // Calculate total fragments let mut summary =