From 879e0cb146f275f04dc6bb4ab265f80f4e646ae4 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 4 Feb 2025 23:16:07 +0100 Subject: [PATCH 01/69] Fixed outdated documentation --- crates/modelardb_server/src/storage/compressed_data_buffer.rs | 4 ++-- .../modelardb_server/src/storage/compressed_data_manager.rs | 4 ++-- .../modelardb_server/src/storage/uncompressed_data_manager.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_server/src/storage/compressed_data_buffer.rs b/crates/modelardb_server/src/storage/compressed_data_buffer.rs index 46a1c5c47..68402ff93 100644 --- a/crates/modelardb_server/src/storage/compressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/compressed_data_buffer.rs @@ -23,8 +23,8 @@ use modelardb_types::schemas::COMPRESSED_SCHEMA; use crate::error::{ModelarDbServerError, Result}; -/// Compressed segments representing data points from a column in a model table as one -/// [`RecordBatch`]. +/// Batch of compressed segments that were compressed together and are ready to be inserted into a +/// [`CompressedDataBuffer`] for a model table. #[derive(Clone, Debug)] pub(super) struct CompressedSegmentBatch { /// Metadata of the model table to insert the data points into. diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 41254f034..861986431 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -129,8 +129,8 @@ impl CompressedDataManager { Ok(()) } - /// Insert the `compressed_segments` into the in-memory compressed data buffer for the model table - /// with `table_name`. If `compressed_segments` is saved successfully, return [`Ok`], otherwise + /// Insert `compressed_segment_batch` into the in-memory [`CompressedDataBuffer`] for the model + /// table. 
If `compressed_segment_batch` is inserted successfully, return [`Ok`], otherwise /// return [`ModelarDbServerError`](crate::error::ModelarDbServerError). async fn insert_compressed_segments( &self, diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 0b4800077..b22556452 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -199,7 +199,7 @@ impl UncompressedDataManager { .collect(); // For each data point, compute a hash from the tags and pass the fields to the storage - // engine so they can be added to the appropriate [`UncompressedDataBuffer`]. + // engine so they can be added to the appropriate UncompressedDataBuffer. for (index, timestamp) in timestamp_column_array.iter().enumerate() { let tag_values: Vec = tag_column_arrays .iter() From cc891942ef01ccee95506faf0eca2d3c74f225d3 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 11 Feb 2025 22:19:24 +0100 Subject: [PATCH 02/69] Add table name to file path for spilled buffers --- .../src/storage/uncompressed_data_buffer.rs | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 06ffa95e2..14329ae01 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -256,10 +256,11 @@ impl UncompressedOnDiskDataBuffer { ) -> Result { // Create a path that uses the first timestamp as the filename. 
let timestamps = modelardb_types::array!(data_points, 0, TimestampArray); - let file_path = Path::from(format!( - "{UNCOMPRESSED_DATA_FOLDER}/{tag_hash}/{}.parquet", - timestamps.value(0) - )); + let file_path = spilled_buffer_file_path( + &model_table_metadata.name, + tag_hash, + &format!("{}.parquet", timestamps.value(0)), + ); modelardb_storage::write_record_batch_to_apache_parquet_file( &file_path, @@ -288,7 +289,7 @@ impl UncompressedOnDiskDataBuffer { local_data_folder: Arc, file_name: &str, ) -> Result { - let file_path = Path::from(format!("{UNCOMPRESSED_DATA_FOLDER}/{tag_hash}/{file_name}")); + let file_path = spilled_buffer_file_path(&model_table_metadata.name, tag_hash, file_name); Ok(Self { tag_hash, @@ -377,6 +378,14 @@ impl Debug for UncompressedOnDiskDataBuffer { } } +/// Return the [`Path`] for a spilled buffer for the time series with `tag_hash` in the table with +/// `table_name`. +fn spilled_buffer_file_path(table_name: &str, tag_hash: u64, file_name: &str) -> Path { + Path::from(format!( + "{UNCOMPRESSED_DATA_FOLDER}/{table_name}/{tag_hash}/{file_name}", + )) +} + #[cfg(test)] mod tests { use super::*; @@ -555,9 +564,10 @@ mod tests { .await .unwrap(); - let uncompressed_path = temp_dir - .path() - .join(format!("{UNCOMPRESSED_DATA_FOLDER}/1")); + let uncompressed_path = temp_dir.path().join(format!( + "{UNCOMPRESSED_DATA_FOLDER}/{}/1", + test::MODEL_TABLE_NAME + )); assert_eq!(uncompressed_path.read_dir().unwrap().count(), 1) } @@ -579,9 +589,10 @@ mod tests { .await .unwrap(); - let uncompressed_path = temp_dir - .path() - .join(format!("{UNCOMPRESSED_DATA_FOLDER}/1")); + let uncompressed_path = temp_dir.path().join(format!( + "{UNCOMPRESSED_DATA_FOLDER}/{}/1", + test::MODEL_TABLE_NAME + )); assert_eq!(uncompressed_path.read_dir().unwrap().count(), 1) } @@ -594,6 +605,7 @@ mod tests { let spilled_buffer_path = temp_dir .path() .join(UNCOMPRESSED_DATA_FOLDER) + .join(test::MODEL_TABLE_NAME) .join("1") .join("1234567890123.parquet"); 
assert!(spilled_buffer_path.exists()); @@ -603,11 +615,6 @@ mod tests { assert_eq!(data.num_columns(), 3); assert_eq!(data.num_rows(), *UNCOMPRESSED_DATA_BUFFER_CAPACITY); - let spilled_buffer_path = temp_dir - .path() - .join(UNCOMPRESSED_DATA_FOLDER) - .join("1") - .join("1234567890123.parquet"); assert!(!spilled_buffer_path.exists()); } From b00ebd60549602b18c323322510fe7e724a9665c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 11 Feb 2025 22:27:38 +0100 Subject: [PATCH 03/69] Use the table name in the spilled buffer file path when initializing --- .../src/storage/uncompressed_data_manager.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index b22556452..d57214a31 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -99,17 +99,14 @@ impl UncompressedDataManager { let spilled_buffer = maybe_spilled_buffer?; let path_parts: Vec = spilled_buffer.location.parts().collect(); + // unwrap() is safe since all spilled buffers are partitioned by their table name. + let table_name = path_parts.get(1).unwrap().as_ref(); + // unwrap() is safe since all spilled buffers are partitioned by their tag hash. - let tag_hash = path_parts.get(1).unwrap().as_ref().parse::().unwrap(); + let tag_hash = path_parts.get(2).unwrap().as_ref().parse::().unwrap(); // unwrap() is safe since all spilled buffers have a name generated by the system. - let file_name = path_parts.get(2).unwrap().as_ref(); - - let table_name = self - .local_data_folder - .table_metadata_manager - .tag_hash_to_model_table_name(tag_hash) - .await?; + let file_name = path_parts.get(3).unwrap().as_ref(); // unwrap() is safe as data cannot be ingested into a model table that does not exist. 
let model_table_metadata = context From a78eae8c668118dfb56caa24b029b9286a1108c5 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 11 Feb 2025 22:49:31 +0100 Subject: [PATCH 04/69] Remove model_table_hash_table_name from metadata Delta Lake --- .../src/metadata/table_metadata_manager.rs | 191 ++---------------- 1 file changed, 13 insertions(+), 178 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 2f1903608..f3e2dbc60 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -161,8 +161,6 @@ impl TableMetadataManager { /// and model table metadata and register them with the Apache DataFusion session context. /// * The `normal_table_metadata` table contains the metadata for normal tables. /// * The `model_table_metadata` table contains the main metadata for model tables. - /// * The `model_table_hash_table_name` contains a mapping from each tag hash to the name of the - /// model table that contains the time series with that tag hash. /// * The `model_table_field_columns` table contains the name, index, error bound value, whether /// error bound is relative, and generation expression of the field columns in each model table. /// @@ -194,24 +192,6 @@ impl TableMetadataManager { register_metadata_table(&self.session_context, "model_table_metadata", delta_table)?; - // Create and register the model_table_hash_table_name table if it does not exist. 
- let delta_table = self - .delta_lake - .create_metadata_table( - "model_table_hash_table_name", - &Schema::new(vec![ - Field::new("hash", DataType::Int64, false), - Field::new("table_name", DataType::Utf8, false), - ]), - ) - .await?; - - register_metadata_table( - &self.session_context, - "model_table_hash_table_name", - delta_table, - )?; - // Create and register the model_table_field_columns table if it does not exist. Note that // column_index will only use a maximum of 10 bits. generated_column_expr is NULL if the // fields are stored as segments. @@ -454,8 +434,8 @@ impl TableMetadataManager { /// Drop the metadata for the model table with `table_name` from the metadata Delta Lake. /// This includes dropping the tags table for the model table, deleting a row from the /// `model_table_metadata` table, deleting a row from the `model_table_field_columns` table for - /// each field column, and deleting the tag metadata from the `model_table_hash_table_name` table - /// and the tag cache. If the metadata could not be dropped, [`ModelarDbStorageError`] is returned. + /// each field column, and deleting the tag metadata from the tag cache. If the metadata could + /// not be dropped, [`ModelarDbStorageError`] is returned. async fn drop_model_table_metadata(&self, table_name: &str) -> Result<()> { // Drop and deregister the model_table_name_tags table. let tags_table_name = format!("{table_name}_tags"); @@ -506,8 +486,8 @@ impl TableMetadataManager { /// Truncate the metadata for the model table with `table_name` from the metadata Delta Lake. /// This includes truncating the tags table for the model table and deleting the tag metadata - /// from the `model_table_hash_table_name` table and the tag cache. If the metadata could not - /// be truncated, [`ModelarDbStorageError`] is returned. + /// from the tag cache. If the metadata could not be truncated, [`ModelarDbStorageError`] is + /// returned. 
async fn truncate_model_table_metadata(&self, table_name: &str) -> Result<()> { // Truncate the model_table_name_tags table. self.delta_lake @@ -522,18 +502,9 @@ impl TableMetadataManager { Ok(()) } - /// Delete the tag hash metadata for the model table with `table_name` from the - /// `model_table_hash_table_name` table and the tag cache. If the metadata could not be deleted, - /// [`ModelarDbStorageError`] is returned. + /// Delete the tag hash metadata for the model table with `table_name` from the tag cache. If + /// the metadata could not be deleted, [`ModelarDbStorageError`] is returned. async fn delete_tag_hash_metadata(&self, table_name: &str) -> Result<()> { - // Delete the tag metadata from the model_table_hash_table_name table. - self.delta_lake - .metadata_delta_ops("model_table_hash_table_name") - .await? - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - // Delete the tag metadata from the tag cache. The table name is always the last part of // the cache key. self.tag_value_hashes @@ -696,11 +667,10 @@ impl TableMetadataManager { /// Return the tag hash for the given list of tag values either by retrieving it from a cache /// or, if the combination of tag values is not in the cache, by computing a new hash. If the - /// hash is not in the cache, it is saved to the cache, persisted to the `model_table_tags` - /// table if it does not already contain it, and persisted to the `model_table_hash_table_name` + /// hash is not in the cache, it is saved to the cache and persisted to the `model_table_tags` /// table if it does not already contain it. If the hash was saved to the metadata Delta Lake, - /// also return [`true`]. If the `model_table_tags` or the `model_table_hash_table_name` table - /// cannot be accessed, [`ModelarDbStorageError`] is returned. + /// also return [`true`]. If the `model_table_tags` table cannot be accessed, + /// [`ModelarDbStorageError`] is returned. 
pub async fn lookup_or_compute_tag_hash( &self, model_table_metadata: &ModelTableMetadata, @@ -743,10 +713,9 @@ impl TableMetadataManager { } /// Save the given tag hash metadata to the `model_table_tags` table if it does not already - /// contain it, and to the `model_table_hash_table_name` table if it does not already contain it. - /// If the tables did not contain the tag hash, meaning it is a new tag combination, return - /// [`true`]. If the metadata cannot be inserted into either `model_table_tags` or - /// `model_table_hash_table_name`, [`ModelarDbStorageError`] is returned. + /// contain it. If the table did not contain the tag hash, meaning it is a new tag combination, + /// return [`true`]. If the metadata cannot be inserted into `model_table_tags`, + /// [`ModelarDbStorageError`] is returned. pub async fn save_tag_hash_metadata( &self, model_table_metadata: &ModelTableMetadata, @@ -798,39 +767,7 @@ impl TableMetadataManager { })? .await?; - // Save the tag hash metadata in the model_table_hash_table_name table if it does not - // already contain it. - let source = self - .metadata_table_data_frame( - "model_table_hash_table_name", - vec![ - Arc::new(Int64Array::from(vec![signed_tag_hash])), - Arc::new(StringArray::from(vec![table_name])), - ], - ) - .await?; - - let delta_ops = self - .delta_lake - .metadata_delta_ops("model_table_hash_table_name") - .await?; - - // Merge the tag hash metadata in the source DataFrame into the model_table_hash_table_name - // table. For each hash, if the hash is not already in the target table, insert the hash and - // the table name from the source DataFrame. - let (_table, insert_into_hash_table_name_metrics) = delta_ops - .merge(source, col("target.hash").eq(col("source.hash"))) - .with_source_alias("source") - .with_target_alias("target") - .when_not_matched_insert(|insert| { - insert - .set("hash", col("source.hash")) - .set("table_name", col("source.table_name")) - })? 
- .await?; - - Ok(insert_into_tags_metrics.num_target_rows_inserted > 0 - || insert_into_hash_table_name_metrics.num_target_rows_inserted > 0) + Ok(insert_into_tags_metrics.num_target_rows_inserted > 0) } /// Return a [`DataFrame`] with the given `rows` for the metadata table with the given @@ -850,30 +787,6 @@ impl TableMetadataManager { Ok(self.session_context.read_batch(batch)?) } - /// Return the name of the model table that contains the time series with `tag_hash`. Returns a - /// [`ModelarDbStorageError`] if the necessary data cannot be retrieved from the metadata Delta - /// Lake. - pub async fn tag_hash_to_model_table_name(&self, tag_hash: u64) -> Result { - let signed_tag_hash = i64::from_ne_bytes(tag_hash.to_ne_bytes()); - - let sql = format!( - "SELECT table_name - FROM model_table_hash_table_name - WHERE hash = '{signed_tag_hash}' - LIMIT 1" - ); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let table_names = modelardb_types::array!(batch, 0, StringArray); - if table_names.is_empty() { - Err(ModelarDbStorageError::InvalidArgument(format!( - "No model table contains a time series with tag hash '{tag_hash}'." - ))) - } else { - Ok(table_names.value(0).to_owned()) - } - } - /// Return a mapping from tag hashes to the tags in the columns with the names in /// `tag_column_names` for the time series in the model table with the name `model_table_name`. /// Returns a [`ModelarDbStorageError`] if the necessary data cannot be retrieved from the @@ -955,12 +868,6 @@ mod tests { .await .is_ok()); - assert!(metadata_manager - .session_context - .sql("SELECT hash, table_name FROM model_table_hash_table_name") - .await - .is_ok()); - assert!(metadata_manager .session_context .sql("SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ @@ -1199,14 +1106,6 @@ mod tests { assert_eq!(batch.num_rows(), 0); - // Verify that the tag metadata was deleted from the model_table_hash_table_name table. 
- let sql = "SELECT table_name FROM model_table_hash_table_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!(batch.num_rows(), 0); - // Verify that the tag cache was cleared. assert!(metadata_manager.tag_value_hashes.is_empty()); } @@ -1265,14 +1164,6 @@ mod tests { assert_eq!(batch.num_rows(), 0); - // Verify that the tag metadata was deleted from the model_table_hash_table_name table. - let sql = "SELECT table_name FROM model_table_hash_table_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!(batch.num_rows(), 0); - // Verify that the tag cache was cleared. assert!(metadata_manager.tag_value_hashes.is_empty()); } @@ -1465,24 +1356,6 @@ mod tests { ]) ); assert_eq!(**batch.column(1), StringArray::from(vec!["tag2", "tag1"])); - - // The tag hashes should be saved in the model_table_hash_table_name table. - let sql = "SELECT hash, table_name FROM model_table_hash_table_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - Int64Array::from(vec![ - i64::from_ne_bytes(tag_hash_2.to_ne_bytes()), - i64::from_ne_bytes(tag_hash_1.to_ne_bytes()), - ]) - ); - assert_eq!( - **batch.column(1), - StringArray::from(vec![test::MODEL_TABLE_NAME, test::MODEL_TABLE_NAME]) - ); } #[tokio::test] @@ -1538,44 +1411,6 @@ mod tests { .unwrap(); assert!(batch.column(0).is_empty()); - - let sql = "SELECT hash FROM model_table_hash_table_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert!(batch.column(0).is_empty()); - } - - #[tokio::test] - async fn test_tag_hash_to_model_table_name() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let model_table_metadata = test::model_table_metadata(); - let (tag_hash, _tag_hash_is_saved) = metadata_manager - 
.lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - - let table_name = metadata_manager - .tag_hash_to_model_table_name(tag_hash) - .await - .unwrap(); - - assert_eq!(table_name, test::MODEL_TABLE_NAME); - } - - #[tokio::test] - async fn test_invalid_tag_hash_to_model_table_name() { - let temp_dir = tempfile::tempdir().unwrap(); - let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path(), None) - .await - .unwrap(); - - assert!(metadata_manager - .tag_hash_to_model_table_name(0) - .await - .is_err()); } #[tokio::test] From c5baba5828d84192ff6cf2a1655d6f01dafb7dfc Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:32:19 +0100 Subject: [PATCH 05/69] Remove limitation of 1024 on number of field columns --- .../src/metadata/model_table_metadata.rs | 9 --------- .../src/metadata/table_metadata_manager.rs | 1 - 2 files changed, 10 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index a2eff4fa1..5100bfb21 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -60,7 +60,6 @@ impl ModelTableMetadata { /// * The number of error bounds does not match the number of columns. /// * The number of potentially generated columns does not match the number of columns. /// * A generated column includes another generated column in its expression. - /// * There are more than 1024 columns. /// * The `query_schema` does not include a single timestamp column. /// * The `query_schema` does not include at least one stored field column. pub fn try_new( @@ -95,14 +94,6 @@ impl ModelTableMetadata { } } - // If there are more than 1024 columns, return an error. 
This limitation is necessary since - // 10 bits are used to identify the column index of the data in the 64-bit univariate id. - if query_schema.fields.len() > 1024 { - return Err(ModelarDbStorageError::InvalidArgument( - "There cannot be more than 1024 columns in the model table.".to_owned(), - )); - } - // Remove the generated field columns from the query schema and the error bounds as these // columns should never be provided when inserting data points into the model table. let mut fields_without_generated = Vec::with_capacity(query_schema.fields().len()); diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index f3e2dbc60..385afdfa3 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -379,7 +379,6 @@ impl TableMetadataManager { (0.0, false) }; - // query_schema_index is simply cast as a model table contains at most 1024 columns. self.delta_lake .write_columns_to_metadata_table( "model_table_field_columns", From 999576dbe5d1b0535b2207aa02ba62a2bd65c2fc Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 20:56:26 +0100 Subject: [PATCH 06/69] No longer save tag metadata when inserting data points --- crates/modelardb_server/src/storage/mod.rs | 1 - .../src/storage/uncompressed_data_manager.rs | 40 +++++-------------- 2 files changed, 9 insertions(+), 32 deletions(-) diff --git a/crates/modelardb_server/src/storage/mod.rs b/crates/modelardb_server/src/storage/mod.rs index 05de92c0e..14a7f1ba9 100644 --- a/crates/modelardb_server/src/storage/mod.rs +++ b/crates/modelardb_server/src/storage/mod.rs @@ -109,7 +109,6 @@ impl StorageEngine { // Create the uncompressed data manager. 
let uncompressed_data_manager = Arc::new(UncompressedDataManager::new( data_folders.local_data_folder.clone(), - data_folders.maybe_remote_data_folder.clone(), memory_pool.clone(), channels.clone(), )); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index d57214a31..99ff43c1f 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -48,8 +48,6 @@ use crate::storage::UNCOMPRESSED_DATA_FOLDER; pub(super) struct UncompressedDataManager { /// Folder for storing metadata and data in Apache Parquet files on the local file system. pub local_data_folder: DataFolder, - /// Folder for storing metadata and data in Apache Parquet files in a remote object store. - pub maybe_remote_data_folder: Option, /// Counter incremented for each [`RecordBatch`](datafusion::arrow::array::RecordBatch) of data /// points ingested. The value is assigned to buffers that are created or updated and is used to /// flush buffers that are no longer used. @@ -73,13 +71,11 @@ pub(super) struct UncompressedDataManager { impl UncompressedDataManager { pub(super) fn new( local_data_folder: DataFolder, - maybe_remote_data_folder: Option, memory_pool: Arc, channels: Arc, ) -> Self { Self { local_data_folder, - maybe_remote_data_folder, current_batch_index: AtomicU64::new(0), uncompressed_in_memory_data_buffers: DashMap::new(), uncompressed_on_disk_data_buffers: DashMap::new(), @@ -203,30 +199,12 @@ impl UncompressedDataManager { .map(|array| array.value(index).to_string()) .collect(); - let (tag_hash, tag_hash_is_saved) = self - .local_data_folder - .table_metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &tag_values) - .await?; - - // If the server was started with a manager, transfer the tag hash metadata if it was - // saved to the server metadata Delta Lake. 
We purposely transfer tag metadata before the - // associated files for convenience. This does not cause problems when querying. - if let Some(remote_data_folder) = &self.maybe_remote_data_folder { - if tag_hash_is_saved { - remote_data_folder - .table_metadata_manager - .save_tag_hash_metadata(&model_table_metadata, tag_hash, &tag_values) - .await?; - } - } - let mut values = field_column_arrays.iter().map(|array| array.value(index)); // unwrap() is safe to use since the timestamps array cannot contain null values. buffers_are_spilled |= self .insert_data_point( - tag_hash, + tag_values, timestamp.unwrap(), &mut values, model_table_metadata.clone(), @@ -252,15 +230,15 @@ impl UncompressedDataManager { Ok(()) } - /// Insert a single data point into the in-memory buffer for `tag_hash` if one exists. If the - /// buffer has been spilled, read it back into memory. If no buffer exists for `tag_hash`, - /// allocate a new buffer that will be compressed within the error bound in - /// `model_table_metadata`. Returns [`true`] if a buffer was spilled, [`false`] if not, and - /// [`ModelarDbServerError`](crate::error::ModelarDbServerError) if the error bound cannot be - /// retrieved from the metadata Delta Lake. + /// Insert a single data point into the in-memory buffer with the tag hash that corresponds to + /// `tag_values` if one exists. If the buffer has been spilled, read it back into memory. If no + /// buffer exists for the tag hash, allocate a new buffer that will be compressed within the + /// error bound in `model_table_metadata`. Returns [`true`] if a buffer was spilled, [`false`] + /// if not, and [`ModelarDbServerError`](crate::error::ModelarDbServerError) if the error bound + /// cannot be retrieved from the metadata Delta Lake. 
async fn insert_data_point( &self, - tag_hash: u64, + tag_values: Vec, timestamp: Timestamp, values: &mut dyn Iterator, model_table_metadata: Arc, @@ -1323,7 +1301,7 @@ mod tests { let channels = Arc::new(Channels::new()); let uncompressed_data_manager = - UncompressedDataManager::new(local_data_folder, None, memory_pool, channels); + UncompressedDataManager::new(local_data_folder, memory_pool, channels); (uncompressed_data_manager, Arc::new(model_table_metadata)) } From 6794ffcc6d80aa39075cd35f4cb211e127110831 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 21:12:17 +0100 Subject: [PATCH 07/69] Use a new function to calculate tag hash outside table metadata manager --- .../src/storage/uncompressed_data_manager.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 99ff43c1f..7093c8d0a 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -16,6 +16,7 @@ //! Support for managing all uncompressed data that is ingested into the //! [`StorageEngine`](crate::storage::StorageEngine). +use std::hash::{DefaultHasher, Hasher}; use std::io::{Error as IOError, ErrorKind as IOErrorKind}; use std::mem; use std::sync::atomic::{AtomicU64, Ordering}; @@ -244,6 +245,8 @@ impl UncompressedDataManager { model_table_metadata: Arc, current_batch_index: u64, ) -> Result { + let tag_hash = calculate_tag_hash(&model_table_metadata.name, &tag_values); + debug!("Add data point at {timestamp} to uncompressed data buffer for {tag_hash}."); // Track if any buffers are spilled during ingestion so this information can be returned to @@ -647,6 +650,18 @@ impl UncompressedDataManager { } } +/// Calculate a unique hash for a specific combination of `table_name` and `tag_values`. 
The hash +/// can be used to identify a specific multivariate time series during ingestion. +fn calculate_tag_hash(table_name: &str, tag_values: &[String]) -> u64 { + let mut hash_data = tag_values.to_vec(); + hash_data.push(table_name.to_string()); + + let mut hasher = DefaultHasher::new(); + hasher.write(hash_data.join(";").as_bytes()); + + hasher.finish() +} + #[cfg(test)] mod tests { use super::*; From cd5e9bcf7b22951c5be971922ab8595ee167042c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 21:20:25 +0100 Subject: [PATCH 08/69] Remove methods to lookup and save tag hash metadata --- .../src/storage/uncompressed_data_manager.rs | 6 - .../src/metadata/table_metadata_manager.rs | 267 +----------------- 2 files changed, 1 insertion(+), 272 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 7093c8d0a..b8b57c779 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -1301,12 +1301,6 @@ mod tests { .await .unwrap(); - local_data_folder - .table_metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag".to_owned()]) - .await - .unwrap(); - let memory_pool = Arc::new(MemoryPool::new( INGESTED_RESERVED_MEMORY_IN_BYTES, UNCOMPRESSED_RESERVED_MEMORY_IN_BYTES, diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 385afdfa3..7939f64e8 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -18,19 +18,16 @@ //! through this metadata manager, while it only supports a subset of the manager metadata Delta Lake. 
use std::collections::HashMap; -use std::hash::{DefaultHasher, Hasher}; use std::path::Path as StdPath; use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Int16Array, Int64Array, RecordBatch, + Array, BinaryArray, BooleanArray, Float32Array, Int16Array, Int64Array, StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; use dashmap::DashMap; -use datafusion::catalog::TableProvider; use datafusion::common::{DFSchema, ToDFSchema}; -use datafusion::dataframe::DataFrame; use datafusion::logical_expr::lit; use datafusion::prelude::{col, SessionContext}; use modelardb_common::test::ERROR_BOUND_ZERO; @@ -664,128 +661,6 @@ impl TableMetadataManager { Ok(generated_columns) } - /// Return the tag hash for the given list of tag values either by retrieving it from a cache - /// or, if the combination of tag values is not in the cache, by computing a new hash. If the - /// hash is not in the cache, it is saved to the cache and persisted to the `model_table_tags` - /// table if it does not already contain it. If the hash was saved to the metadata Delta Lake, - /// also return [`true`]. If the `model_table_tags` table cannot be accessed, - /// [`ModelarDbStorageError`] is returned. - pub async fn lookup_or_compute_tag_hash( - &self, - model_table_metadata: &ModelTableMetadata, - tag_values: &[String], - ) -> Result<(u64, bool)> { - let cache_key = { - let mut cache_key_list = tag_values.to_vec(); - cache_key_list.push(model_table_metadata.name.clone()); - - cache_key_list.join(";") - }; - - // Check if the tag hash is in the cache. If it is, retrieve it. If it is not, create a new - // one and save it both in the cache and in the metadata Delta Lake. There is a minor - // race condition because the check if a tag hash is in the cache and the addition of the - // hash is done without taking a lock on the tag_value_hashes. 
However, by allowing a hash - // to possibly be computed more than once, the cache can be used without an explicit lock. - if let Some(tag_hash) = self.tag_value_hashes.get(&cache_key) { - Ok((*tag_hash, false)) - } else { - // Generate the 54-bit tag hash based on the tag values of the record batch and model - // table name. - let tag_hash = { - let mut hasher = DefaultHasher::new(); - hasher.write(cache_key.as_bytes()); - - // The 64-bit hash is shifted to make the 10 least significant bits 0. - hasher.finish() << 10 - }; - - // Save the tag hash in the metadata Delta Lake and in the cache. - let tag_hash_is_saved = self - .save_tag_hash_metadata(model_table_metadata, tag_hash, tag_values) - .await?; - - self.tag_value_hashes.insert(cache_key, tag_hash); - - Ok((tag_hash, tag_hash_is_saved)) - } - } - - /// Save the given tag hash metadata to the `model_table_tags` table if it does not already - /// contain it. If the table did not contain the tag hash, meaning it is a new tag combination, - /// return [`true`]. If the metadata cannot be inserted into `model_table_tags`, - /// [`ModelarDbStorageError`] is returned. - pub async fn save_tag_hash_metadata( - &self, - model_table_metadata: &ModelTableMetadata, - tag_hash: u64, - tag_values: &[String], - ) -> Result { - let table_name = model_table_metadata.name.as_str(); - let tag_columns = &model_table_metadata - .tag_column_indices - .iter() - .map(|index| model_table_metadata.schema.field(*index).name().clone()) - .collect::>(); - - let signed_tag_hash = i64::from_ne_bytes(tag_hash.to_ne_bytes()); - - // Save the tag hash metadata in the model_table_tags table if it does not already contain it. 
- let mut table_name_tags_columns: Vec = - vec![Arc::new(Int64Array::from(vec![signed_tag_hash]))]; - - table_name_tags_columns.append( - &mut tag_values - .iter() - .map(|tag_value| Arc::new(StringArray::from(vec![tag_value.clone()])) as ArrayRef) - .collect::>(), - ); - - let source = self - .metadata_table_data_frame(&format!("{table_name}_tags"), table_name_tags_columns) - .await?; - - let delta_ops = self - .delta_lake - .metadata_delta_ops(&format!("{table_name}_tags")) - .await?; - - // Merge the tag hash metadata in the source DataFrame into the model_table_tags table. - // For each hash, if the hash is not already in the target table, insert the hash and the - // tag values from the source DataFrame. - let (_table, insert_into_tags_metrics) = delta_ops - .merge(source, col("target.hash").eq(col("source.hash"))) - .with_source_alias("source") - .with_target_alias("target") - .when_not_matched_insert(|mut insert| { - for tag_column in tag_columns { - insert = insert.set(tag_column, col(format!("source.{tag_column}"))) - } - - insert.set("hash", col("source.hash")) - })? - .await?; - - Ok(insert_into_tags_metrics.num_target_rows_inserted > 0) - } - - /// Return a [`DataFrame`] with the given `rows` for the metadata table with the given - /// `table_name`. If the table does not exist or the [`DataFrame`] cannot be created, return - /// [`ModelarDbStorageError`]. - async fn metadata_table_data_frame( - &self, - table_name: &str, - rows: Vec, - ) -> Result { - let table = self.delta_lake.metadata_delta_table(table_name).await?; - - // TableProvider::schema(&table) is used instead of table.schema() because table.schema() - // returns the Delta Lake schema instead of the Apache Arrow DataFusion schema. - let batch = RecordBatch::try_new(TableProvider::schema(&table), rows)?; - - Ok(self.session_context.read_batch(batch)?) 
- } - /// Return a mapping from tag hashes to the tags in the columns with the names in /// `tag_column_names` for the time series in the model table with the name `model_table_name`. /// Returns a [`ModelarDbStorageError`] if the necessary data cannot be retrieved from the @@ -1070,12 +945,6 @@ mod tests { async fn test_drop_model_table_metadata() { let (temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - let model_table_metadata = test::model_table_metadata(); - metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - metadata_manager .drop_table_metadata(test::MODEL_TABLE_NAME) .await @@ -1144,12 +1013,6 @@ mod tests { async fn test_truncate_model_table_metadata() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - let model_table_metadata = test::model_table_metadata(); - metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - metadata_manager .truncate_table_metadata(test::MODEL_TABLE_NAME) .await @@ -1321,128 +1184,6 @@ mod tests { ); } - #[tokio::test] - async fn test_compute_new_tag_hash() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let model_table_metadata = test::model_table_metadata(); - let (tag_hash_1, tag_hash_1_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - - let (tag_hash_2, tag_hash_2_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag2".to_owned()]) - .await - .unwrap(); - - assert!(tag_hash_1_is_saved && tag_hash_2_is_saved); - - // The tag hashes should be saved in the cache. - assert_eq!(metadata_manager.tag_value_hashes.len(), 2); - - // The tag hashes should be saved in the model_table_tags table. 
- let sql = format!("SELECT hash, tag FROM {}_tags", test::MODEL_TABLE_NAME); - let batch = sql_and_concat(&metadata_manager.session_context, &sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - Int64Array::from(vec![ - i64::from_ne_bytes(tag_hash_2.to_ne_bytes()), - i64::from_ne_bytes(tag_hash_1.to_ne_bytes()), - ]) - ); - assert_eq!(**batch.column(1), StringArray::from(vec!["tag2", "tag1"])); - } - - #[tokio::test] - async fn test_lookup_existing_tag_hash() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let model_table_metadata = test::model_table_metadata(); - let (tag_hash_compute, tag_hash_compute_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - - assert!(tag_hash_compute_is_saved); - assert_eq!(metadata_manager.tag_value_hashes.len(), 1); - - // When getting the same tag hash again, it should be retrieved from the cache. - let (tag_hash_lookup, tag_hash_lookup_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - - assert!(!tag_hash_lookup_is_saved); - assert_eq!(metadata_manager.tag_value_hashes.len(), 1); - - assert_eq!(tag_hash_compute, tag_hash_lookup); - } - - #[tokio::test] - async fn test_compute_tag_hash_with_invalid_tag_values() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let model_table_metadata = test::model_table_metadata(); - let zero_tags_result = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &[]) - .await; - - let two_tags_result = metadata_manager - .lookup_or_compute_tag_hash( - &model_table_metadata, - &["tag1".to_owned(), "tag2".to_owned()], - ) - .await; - - assert!(zero_tags_result.is_err()); - assert!(two_tags_result.is_err()); - - // The tag hashes should not be saved in either the cache or the metadata Delta Lake. 
- assert_eq!(metadata_manager.tag_value_hashes.len(), 0); - - let sql = format!("SELECT hash FROM {}_tags", test::MODEL_TABLE_NAME); - let batch = sql_and_concat(&metadata_manager.session_context, &sql) - .await - .unwrap(); - - assert!(batch.column(0).is_empty()); - } - - #[tokio::test] - async fn test_mapping_from_hash_to_tags() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let model_table_metadata = test::model_table_metadata(); - let (tag_hash_1, _tag_hash_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - - let (tag_hash_2, _tag_hash_is_saved) = metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag2".to_owned()]) - .await - .unwrap(); - - let mapping_from_hash_to_tags = metadata_manager - .mapping_from_hash_to_tags(test::MODEL_TABLE_NAME, &["tag"]) - .await - .unwrap(); - - assert_eq!(mapping_from_hash_to_tags.len(), 2); - assert_eq!( - mapping_from_hash_to_tags.get(&tag_hash_1).unwrap(), - &vec!["tag1".to_owned()] - ); - assert_eq!( - mapping_from_hash_to_tags.get(&tag_hash_2).unwrap(), - &vec!["tag2".to_owned()] - ); - } - #[tokio::test] async fn test_mapping_from_hash_to_tags_with_missing_model_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; @@ -1458,12 +1199,6 @@ mod tests { async fn test_mapping_from_hash_to_tags_with_invalid_tag_column() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - let model_table_metadata = test::model_table_metadata(); - metadata_manager - .lookup_or_compute_tag_hash(&model_table_metadata, &["tag1".to_owned()]) - .await - .unwrap(); - let result = metadata_manager .mapping_from_hash_to_tags(test::MODEL_TABLE_NAME, &["invalid_tag"]) .await; From c4d0c9086dbe3828d8c7650690c8393d5213c856 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 
2025 21:25:26 +0100 Subject: [PATCH 09/69] Remove tag cache from table metadata manager --- .../src/metadata/table_metadata_manager.rs | 44 +++---------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 7939f64e8..103c27b0a 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -22,11 +22,9 @@ use std::path::Path as StdPath; use std::sync::Arc; use arrow::array::{ - Array, BinaryArray, BooleanArray, Float32Array, Int16Array, Int64Array, - StringArray, + Array, BinaryArray, BooleanArray, Float32Array, Int16Array, Int64Array, StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; -use dashmap::DashMap; use datafusion::common::{DFSchema, ToDFSchema}; use datafusion::logical_expr::lit; use datafusion::prelude::{col, SessionContext}; @@ -52,8 +50,6 @@ enum TableType { pub struct TableMetadataManager { /// Delta Lake with functionality to read and write to and from the metadata tables. delta_lake: DeltaLake, - /// Cache of tag value hashes used to signify when to persist new unsaved tag combinations. - tag_value_hashes: DashMap, /// Session context used to query the metadata Delta Lake tables using Apache DataFusion. 
session_context: Arc, } @@ -68,7 +64,6 @@ impl TableMetadataManager { ) -> Result { let table_metadata_manager = Self { delta_lake: DeltaLake::try_from_local_path(folder_path)?, - tag_value_hashes: DashMap::new(), session_context: maybe_session_context .unwrap_or_else(|| Arc::new(SessionContext::new())), }; @@ -90,7 +85,6 @@ impl TableMetadataManager { ) -> Result { let table_metadata_manager = Self { delta_lake: DeltaLake::try_remote_from_connection_info(connection_info)?, - tag_value_hashes: DashMap::new(), session_context: maybe_session_context .unwrap_or_else(|| Arc::new(SessionContext::new())), }; @@ -118,7 +112,6 @@ impl TableMetadataManager { access_key_id, secret_access_key, )?, - tag_value_hashes: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; @@ -143,7 +136,6 @@ impl TableMetadataManager { access_key, container_name, )?, - tag_value_hashes: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; @@ -429,9 +421,9 @@ impl TableMetadataManager { /// Drop the metadata for the model table with `table_name` from the metadata Delta Lake. /// This includes dropping the tags table for the model table, deleting a row from the - /// `model_table_metadata` table, deleting a row from the `model_table_field_columns` table for - /// each field column, and deleting the tag metadata from the tag cache. If the metadata could - /// not be dropped, [`ModelarDbStorageError`] is returned. + /// `model_table_metadata` table, and deleting a row from the `model_table_field_columns` table + /// for each field column. If the metadata could not be dropped, [`ModelarDbStorageError`] is + /// returned. async fn drop_model_table_metadata(&self, table_name: &str) -> Result<()> { // Drop and deregister the model_table_name_tags table. 
let tags_table_name = format!("{table_name}_tags"); @@ -457,9 +449,6 @@ impl TableMetadataManager { .with_predicate(col("table_name").eq(lit(table_name))) .await?; - // Delete the tag hash metadata from the metadata Delta Lake and the tag cache. - self.delete_tag_hash_metadata(table_name).await?; - Ok(()) } @@ -481,9 +470,8 @@ impl TableMetadataManager { } /// Truncate the metadata for the model table with `table_name` from the metadata Delta Lake. - /// This includes truncating the tags table for the model table and deleting the tag metadata - /// from the tag cache. If the metadata could not be truncated, [`ModelarDbStorageError`] is - /// returned. + /// This includes truncating the tags table for the model table. If the metadata could not be + /// truncated, [`ModelarDbStorageError`] is returned. async fn truncate_model_table_metadata(&self, table_name: &str) -> Result<()> { // Truncate the model_table_name_tags table. self.delta_lake @@ -492,20 +480,6 @@ impl TableMetadataManager { .delete() .await?; - // Delete the tag hash metadata from the metadata Delta Lake and the tag cache. - self.delete_tag_hash_metadata(table_name).await?; - - Ok(()) - } - - /// Delete the tag hash metadata for the model table with `table_name` from the tag cache. If - /// the metadata could not be deleted, [`ModelarDbStorageError`] is returned. - async fn delete_tag_hash_metadata(&self, table_name: &str) -> Result<()> { - // Delete the tag metadata from the tag cache. The table name is always the last part of - // the cache key. - self.tag_value_hashes - .retain(|key, _| key.split(';').last() != Some(table_name)); - Ok(()) } @@ -973,9 +947,6 @@ mod tests { .unwrap(); assert_eq!(batch.num_rows(), 0); - - // Verify that the tag cache was cleared. - assert!(metadata_manager.tag_value_hashes.is_empty()); } #[tokio::test] @@ -1025,9 +996,6 @@ mod tests { .unwrap(); assert_eq!(batch.num_rows(), 0); - - // Verify that the tag cache was cleared. 
- assert!(metadata_manager.tag_value_hashes.is_empty()); } #[tokio::test] From 92831e310887c4ff2473439761bb9d8b22762766 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 21:40:18 +0100 Subject: [PATCH 10/69] Remove mapping_from_hash_to_tags() --- .../src/metadata/table_metadata_manager.rs | 73 +------------------ 1 file changed, 1 insertion(+), 72 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 103c27b0a..bff1942a1 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -17,13 +17,10 @@ //! and the manager metadata Delta Lake. Note that the entire server metadata Delta Lake can be accessed //! through this metadata manager, while it only supports a subset of the manager metadata Delta Lake. -use std::collections::HashMap; use std::path::Path as StdPath; use std::sync::Arc; -use arrow::array::{ - Array, BinaryArray, BooleanArray, Float32Array, Int16Array, Int64Array, StringArray, -}; +use arrow::array::{Array, BinaryArray, BooleanArray, Float32Array, Int16Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::{DFSchema, ToDFSchema}; use datafusion::logical_expr::lit; @@ -634,52 +631,6 @@ impl TableMetadataManager { Ok(generated_columns) } - - /// Return a mapping from tag hashes to the tags in the columns with the names in - /// `tag_column_names` for the time series in the model table with the name `model_table_name`. - /// Returns a [`ModelarDbStorageError`] if the necessary data cannot be retrieved from the - /// metadata Delta Lake. - pub async fn mapping_from_hash_to_tags( - &self, - model_table_name: &str, - tag_column_names: &[&str], - ) -> Result>> { - // Return an empty HashMap if no tag column names are passed to keep the signature simple. 
- if tag_column_names.is_empty() { - return Ok(HashMap::new()); - } - - let sql = format!( - "SELECT hash, {} FROM {model_table_name}_tags", - tag_column_names.join(","), - ); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let hash_array = modelardb_types::array!(batch, 0, Int64Array); - - // For each tag column, get the corresponding column array. - let tag_arrays: Vec<&StringArray> = tag_column_names - .iter() - .enumerate() - .map(|(index, _tag_column)| modelardb_types::array!(batch, index + 1, StringArray)) - .collect(); - - let mut hash_to_tags = HashMap::new(); - for row_index in 0..batch.num_rows() { - let signed_tag_hash = hash_array.value(row_index); - let tag_hash = u64::from_ne_bytes(signed_tag_hash.to_ne_bytes()); - - // For each tag array, add the row index value to the tags for this tag hash. - let tags: Vec = tag_arrays - .iter() - .map(|tag_array| tag_array.value(row_index).to_owned()) - .collect(); - - hash_to_tags.insert(tag_hash, tags); - } - - Ok(hash_to_tags) - } } #[cfg(test)] @@ -1152,28 +1103,6 @@ mod tests { ); } - #[tokio::test] - async fn test_mapping_from_hash_to_tags_with_missing_model_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let result = metadata_manager - .mapping_from_hash_to_tags("missing_table", &["tag"]) - .await; - - assert!(result.is_err()); - } - - #[tokio::test] - async fn test_mapping_from_hash_to_tags_with_invalid_tag_column() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let result = metadata_manager - .mapping_from_hash_to_tags(test::MODEL_TABLE_NAME, &["invalid_tag"]) - .await; - - assert!(result.is_err()); - } - async fn create_metadata_manager_and_save_model_table() -> (TempDir, TableMetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path(), None) From 13e3ad35cface3cd19e9fc53b015f0c59f5e9273 Mon Sep 17 
00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 21:48:04 +0100 Subject: [PATCH 11/69] Remove model_table_tags table from metadata Delta Lake --- crates/modelardb_server/src/context.rs | 7 -- .../src/metadata/table_metadata_manager.rs | 114 +----------------- 2 files changed, 6 insertions(+), 115 deletions(-) diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 9d2c354b0..88a7c35d9 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -184,13 +184,6 @@ impl Context { .save_model_table_metadata(model_table_metadata) .await?; - // Register the metadata table needed for querying the model table if it is not already - // registered. The tags table is already registered if the query data folder and local data - // folder is the same. - query_folder_table_metadata_manager - .register_tags_table(&model_table_metadata.name) - .await?; - info!("Created model table '{}'.", model_table_metadata.name); Ok(()) diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index bff1942a1..55247b1f8 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -202,29 +202,6 @@ impl TableMetadataManager { delta_table, )?; - // Register the model_table_name_tags table for each model table. - for model_table_name in self.model_table_names().await? { - self.register_tags_table(&model_table_name).await?; - } - - Ok(()) - } - - /// Register the tags table for the model table with `model_table_name` if it is not already - /// registered. The tags table is required to be registered to allow querying a model table. - /// If the tags table could not be registered, [`ModelarDbStorageError`] is returned. 
- pub async fn register_tags_table(&self, model_table_name: &str) -> Result<()> { - let tags_table_name = format!("{}_tags", model_table_name); - - let delta_table = self - .delta_lake - .metadata_delta_table(&tags_table_name) - .await?; - - if !self.session_context.table_exist(&tags_table_name)? { - register_metadata_table(&self.session_context, &tags_table_name, delta_table)?; - } - Ok(()) } @@ -299,33 +276,13 @@ impl TableMetadataManager { Ok(()) } - /// Save the created model table to the metadata Delta Lake. This includes creating a tags table - /// for the model table, adding a row to the `model_table_metadata` table, and adding a row to - /// the `model_table_field_columns` table for each field column. + /// Save the created model table to the metadata Delta Lake. This includes adding a row to the + /// `model_table_metadata` table and adding a row to the `model_table_field_columns` table for + /// each field column. pub async fn save_model_table_metadata( &self, model_table_metadata: &ModelTableMetadata, ) -> Result<()> { - // Create and register a table_name_tags table to save the 54-bit tag hashes when ingesting data. - let mut table_name_tags_columns = vec![Field::new("hash", DataType::Int64, false)]; - - // Add a column definition for each tag column in the query schema. - table_name_tags_columns.append( - &mut model_table_metadata - .tag_column_indices - .iter() - .map(|index| model_table_metadata.query_schema.field(*index).clone()) - .collect::>(), - ); - - let tags_table_name = format!("{}_tags", model_table_metadata.name); - let delta_table = self - .delta_lake - .create_metadata_table(&tags_table_name, &Schema::new(table_name_tags_columns)) - .await?; - - register_metadata_table(&self.session_context, &tags_table_name, delta_table)?; - // Convert the query schema to bytes, so it can be saved in the metadata Delta Lake. 
let query_schema_bytes = try_convert_schema_to_bytes(&model_table_metadata.query_schema)?; @@ -417,19 +374,10 @@ impl TableMetadataManager { } /// Drop the metadata for the model table with `table_name` from the metadata Delta Lake. - /// This includes dropping the tags table for the model table, deleting a row from the - /// `model_table_metadata` table, and deleting a row from the `model_table_field_columns` table - /// for each field column. If the metadata could not be dropped, [`ModelarDbStorageError`] is - /// returned. + /// This includes deleting a row from the `model_table_metadata` table and deleting a row from + /// the `model_table_field_columns` table for each field column. If the metadata could not be + /// dropped, [`ModelarDbStorageError`] is returned. async fn drop_model_table_metadata(&self, table_name: &str) -> Result<()> { - // Drop and deregister the model_table_name_tags table. - let tags_table_name = format!("{table_name}_tags"); - self.delta_lake - .drop_metadata_table(&tags_table_name) - .await?; - - self.session_context.deregister_table(&tags_table_name)?; - // Delete the table metadata from the model_table_metadata table. self.delta_lake .metadata_delta_ops("model_table_metadata") @@ -675,44 +623,6 @@ mod tests { .is_ok()); } - #[tokio::test] - async fn test_register_tags_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - let session_context = &metadata_manager.session_context; - - let tags_table_name = format!("{}_tags", test::MODEL_TABLE_NAME); - session_context.deregister_table(&tags_table_name).unwrap(); - assert!(!session_context.table_exist(&tags_table_name).unwrap()); - - metadata_manager - .register_tags_table(test::MODEL_TABLE_NAME) - .await - .unwrap(); - - assert!(session_context.table_exist(&tags_table_name).unwrap()); - - // If the table is already registered, it should not be registered again. 
- let result = metadata_manager - .register_tags_table(test::MODEL_TABLE_NAME) - .await; - - assert!(result.is_ok()); - assert!(session_context.table_exist(&tags_table_name).unwrap()); - } - - #[tokio::test] - async fn test_register_missing_model_table_tags_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - let result = metadata_manager.register_tags_table("missing_table").await; - - assert!(result.is_err()); - assert!(!metadata_manager - .session_context - .table_exist("missing_table_tags") - .unwrap()); - } - #[tokio::test] async fn test_normal_table_is_normal_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; @@ -802,10 +712,6 @@ mod tests { async fn test_save_model_table_metadata() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - // Verify that the table was created and has the expected columns. - let sql = format!("SELECT hash, tag FROM {}_tags", test::MODEL_TABLE_NAME); - assert!(metadata_manager.session_context.sql(&sql).await.is_ok()); - // Check that a row has been added to the model_table_metadata table. let sql = "SELECT table_name, query_schema FROM model_table_metadata"; let batch = sql_and_concat(&metadata_manager.session_context, sql) @@ -875,14 +781,6 @@ mod tests { .await .unwrap(); - // Verify that the tags table was deleted from the Delta Lake. - let tags_table_name = format!("{}_tags", test::MODEL_TABLE_NAME); - assert!(!temp_dir - .path() - .join("metadata") - .join(tags_table_name) - .exists()); - // Verify that the model table was deleted from the model_table_metadata table. 
let sql = "SELECT table_name FROM model_table_metadata"; let batch = sql_and_concat(&metadata_manager.session_context, sql) From ec961f6923709bf55a0dd02a8deab78d2d956778 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 17 Feb 2025 22:02:08 +0100 Subject: [PATCH 12/69] Remove method to truncate table metadata --- crates/modelardb_manager/src/remote.rs | 20 ++--- crates/modelardb_server/src/context.rs | 11 +-- .../src/metadata/table_metadata_manager.rs | 80 ------------------- 3 files changed, 10 insertions(+), 101 deletions(-) diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index b52746353..ca0ef8e0b 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -282,19 +282,15 @@ impl FlightServiceHandler { Ok(()) } - /// Truncate the table in the metadata Delta Lake, the data Delta Lake, and in each node - /// controlled by the manager. If the table does not exist or the table cannot be truncated in - /// the remote data folder and in each node, return [`Status`]. + /// Truncate the table in the data Delta Lake and in each node controlled by the manager. If the + /// table does not exist or the table cannot be truncated in the remote data folder and in each + /// node, return [`Status`]. async fn truncate_cluster_table(&self, table_name: &str) -> StdResult<(), Status> { - // Truncate the table in the remote data folder metadata Delta Lake. This will return an - // error if the table does not exist. - self.context - .remote_data_folder - .metadata_manager - .table_metadata_manager - .truncate_table_metadata(table_name) - .await - .map_err(error_to_status_internal)?; + if self.check_if_table_exists(table_name).await.is_ok() { + return Err(Status::invalid_argument(format!( + "Table with name '{table_name}' does not exist.", + ))); + } // Truncate the table in the remote data folder data Delta lake. 
self.context diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 88a7c35d9..f5417b942 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -329,8 +329,8 @@ impl Context { } /// Delete all data from the table with `table_name` if it exists. The table data is deleted - /// from the storage engine, metadata Delta Lake, and data Delta Lake. If the table does not - /// exist or if it could not be truncated, [`ModelarDbServerError`] is returned. + /// from the storage engine and data Delta Lake. If the table does not exist or if it could not + /// be truncated, [`ModelarDbServerError`] is returned. pub async fn truncate_table(&self, table_name: &str) -> Result<()> { // Deleting the table from the storage engine does not require the table to exist, so the // table is checked first. @@ -342,13 +342,6 @@ impl Context { self.drop_table_from_storage_engine(table_name).await?; - // Delete the table metadata from the metadata Delta Lake. - self.data_folders - .local_data_folder - .table_metadata_manager - .truncate_table_metadata(table_name) - .await?; - // Delete the table data from the data Delta Lake. self.data_folders .local_data_folder diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 55247b1f8..b73c923d8 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -397,37 +397,6 @@ impl TableMetadataManager { Ok(()) } - /// Depending on the type of the table with `table_name`, truncate either the normal table - /// metadata or the model table metadata from the metadata Delta Lake. Note that if truncating - /// the metadata of a normal table, the metadata Delta Lake is unaffected, but it is allowed to - /// keep the interface consistent. 
If the table does not exist or the metadata could not be - /// truncated, [`ModelarDbStorageError`] is returned. - pub async fn truncate_table_metadata(&self, table_name: &str) -> Result<()> { - if self.is_normal_table(table_name).await? { - Ok(()) - } else if self.is_model_table(table_name).await? { - self.truncate_model_table_metadata(table_name).await - } else { - Err(ModelarDbStorageError::InvalidArgument(format!( - "Table with name '{table_name}' does not exist." - ))) - } - } - - /// Truncate the metadata for the model table with `table_name` from the metadata Delta Lake. - /// This includes truncating the tags table for the model table. If the metadata could not be - /// truncated, [`ModelarDbStorageError`] is returned. - async fn truncate_model_table_metadata(&self, table_name: &str) -> Result<()> { - // Truncate the model_table_name_tags table. - self.delta_lake - .metadata_delta_ops(&format!("{table_name}_tags")) - .await? - .delete() - .await?; - - Ok(()) - } - /// Return the [`ModelTableMetadata`] of each model table currently in the metadata Delta Lake. /// If the [`ModelTableMetadata`] cannot be retrieved, [`ModelarDbStorageError`] is returned. pub async fn model_table_metadata(&self) -> Result>> { @@ -808,55 +777,6 @@ mod tests { .is_err()); } - #[tokio::test] - async fn test_truncate_normal_table_metadata() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - metadata_manager - .truncate_table_metadata("normal_table_1") - .await - .unwrap(); - - // Verify that the metadata Delta Lake was left unchanged. 
- let sql = "SELECT table_name FROM normal_table_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - StringArray::from(vec!["normal_table_2", "normal_table_1"]) - ); - } - - #[tokio::test] - async fn test_truncate_model_table_metadata() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - - metadata_manager - .truncate_table_metadata(test::MODEL_TABLE_NAME) - .await - .unwrap(); - - // Verify that the tags table was truncated. - let sql = format!("SELECT hash FROM {}_tags", test::MODEL_TABLE_NAME); - let batch = sql_and_concat(&metadata_manager.session_context, &sql) - .await - .unwrap(); - - assert_eq!(batch.num_rows(), 0); - } - - #[tokio::test] - async fn test_truncate_table_metadata_for_missing_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - assert!(metadata_manager - .truncate_table_metadata("missing_table") - .await - .is_err()); - } - async fn create_metadata_manager_and_save_normal_tables() -> (TempDir, TableMetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path(), None) From e7e1aed192478fea5b1bf3a9e8e87daa255f67fa Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 18 Feb 2025 23:44:02 +0100 Subject: [PATCH 13/69] Remove separate schema for uncompressed data --- .../src/metadata/model_table_metadata.rs | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 5100bfb21..8ec3f36a8 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -43,8 +43,6 @@ pub struct ModelTableMetadata { pub tag_column_indices: Vec, 
/// Error bounds of the columns in `schema`. It can only be non-zero for field columns. pub error_bounds: Vec, - /// Schema of the data that can be compressed in the model table. - pub uncompressed_schema: Arc, /// Schema of the data that can be read from the model table. pub query_schema: Arc, /// Projection that changes `query_schema` to `schema`. @@ -114,14 +112,6 @@ impl ModelTableMetadata { query_schema.clone() }; - // Schema containing timestamps and stored field columns for use by uncompressed buffers. - let uncompressed_schema = Arc::new(schema_without_generated.project( - &compute_indices_of_columns_without_data_type( - &schema_without_generated, - DataType::Utf8, - ), - )?); - // A model table must only contain one stored timestamp column, one or more stored field // columns, zero or more generated field columns, and zero or more stored tag columns. let timestamp_column_indices = compute_indices_of_columns_with_data_type( @@ -156,7 +146,6 @@ impl ModelTableMetadata { field_column_indices, tag_column_indices, error_bounds: error_bounds_without_generated, - uncompressed_schema, query_schema, query_schema_to_schema: field_indices_without_generated, generated_columns, @@ -187,17 +176,6 @@ fn compute_indices_of_columns_with_data_type(schema: &Schema, data_type: DataTyp .collect() } -/// Compute the indices of all columns in `schema` without `data_type`. -fn compute_indices_of_columns_without_data_type( - schema: &Schema, - data_type: DataType, -) -> Vec { - let fields = schema.fields(); - (0..fields.len()) - .filter(|index| *fields[*index].data_type() != data_type) - .collect() -} - /// Column that is generated by a [`Expr`] using zero or more stored columns as input. 
#[derive(Clone, Debug, PartialEq)] pub struct GeneratedColumn { From be0103567a3e46b403a4d2db8d8fe9a22f85ce5d Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 18 Feb 2025 23:44:25 +0100 Subject: [PATCH 14/69] Include tag values in uncompressed data buffer data --- .../src/storage/uncompressed_data_buffer.rs | 69 +++++++++++++++---- .../src/storage/uncompressed_data_manager.rs | 11 +-- 2 files changed, 61 insertions(+), 19 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 14329ae01..c3aa6597b 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -22,7 +22,7 @@ use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::mem; use std::sync::Arc; -use datafusion::arrow::array::{Array, ArrayBuilder}; +use datafusion::arrow::array::{Array, ArrayBuilder, StringArray}; use datafusion::arrow::compute; use datafusion::arrow::record_batch::RecordBatch; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; @@ -82,11 +82,14 @@ pub(super) struct UncompressedInMemoryDataBuffer { timestamps: TimestampBuilder, /// Builders for each stored field that float values are appended to. values: Vec, + /// The tag values for the time series the buffer stores data points for. + tag_values: Vec, } impl UncompressedInMemoryDataBuffer { pub(super) fn new( tag_hash: u64, + tag_values: Vec, model_table_metadata: Arc, current_batch_index: u64, ) -> Self { @@ -101,6 +104,7 @@ impl UncompressedInMemoryDataBuffer { updated_by_batch_index: current_batch_index, timestamps, values, + tag_values, } } @@ -153,23 +157,40 @@ impl UncompressedInMemoryDataBuffer { /// Finish the array builders and return the data in a [`RecordBatch`] sorted by time. 
pub(super) async fn record_batch(&mut self) -> Result { + let buffer_length = self.len(); let timestamps = self.timestamps.finish(); // lexsort() is not used as it is unclear in what order it sorts multiple arrays, instead a // combination of sort_to_indices() and take(), like how lexsort() is implemented, is used. let sorted_indices = compute::sort_to_indices(×tamps, None, None)?; - let mut columns = Vec::with_capacity(1 + self.values.len()); - columns.push(compute::take(×tamps, &sorted_indices, None)?); - for value in &mut self.values { - columns.push(compute::take(&value.finish(), &sorted_indices, None)?); + let mut field_column_index = 0; + let mut tag_column_index = 0; + let mut columns = Vec::with_capacity(self.model_table_metadata.schema.fields().len()); + + // Iterate over the column indices in the schema and add the sorted data to the columns. + for column_index in 0..self.model_table_metadata.schema.fields().len() { + if self.model_table_metadata.is_timestamp(column_index) { + columns.push(compute::take(×tamps, &sorted_indices, None)?); + } else if self.model_table_metadata.is_tag(column_index) { + // The tag value is the same for each data point so it is not sorted. + let tag_value = self.tag_values[tag_column_index].clone(); + let tag_array: StringArray = std::iter::repeat(Some(tag_value)) + .take(buffer_length) + .collect(); + columns.push(Arc::new(tag_array)); + + tag_column_index += 1; + } else { + let values = &self.values[field_column_index].finish(); + columns.push(compute::take(&values, &sorted_indices, None)?); + + field_column_index += 1; + } } - RecordBatch::try_new( - self.model_table_metadata.uncompressed_schema.clone(), - columns, - ) - .map_err(|error| error.into()) + RecordBatch::try_new(self.model_table_metadata.schema.clone(), columns) + .map_err(|error| error.into()) } /// Return the tag hash that identifies the time series the buffer stores data points from. 
@@ -255,7 +276,8 @@ impl UncompressedOnDiskDataBuffer { data_points: RecordBatch, ) -> Result { // Create a path that uses the first timestamp as the filename. - let timestamps = modelardb_types::array!(data_points, 0, TimestampArray); + let timestamp_index = model_table_metadata.timestamp_column_index; + let timestamps = modelardb_types::array!(data_points, timestamp_index, TimestampArray); let file_path = spilled_buffer_file_path( &model_table_metadata.name, tag_hash, @@ -343,13 +365,32 @@ impl UncompressedOnDiskDataBuffer { ) -> Result { let data_points = self.record_batch().await?; - let timestamp_column_array = modelardb_types::array!(data_points, 0, TimestampArray); - let field_column_arrays: Vec<_> = (1..data_points.num_columns()) - .map(|index| modelardb_types::array!(data_points, index, ValueArray)) + let timestamp_index = self.model_table_metadata.timestamp_column_index; + let timestamp_column_array = + modelardb_types::array!(data_points, timestamp_index, TimestampArray); + + let field_column_arrays: Vec<_> = self + .model_table_metadata + .field_column_indices + .iter() + .map(|index| modelardb_types::array!(data_points, *index, ValueArray)) + .collect(); + + let tag_column_arrays: Vec<_> = self + .model_table_metadata + .tag_column_indices + .iter() + .map(|index| modelardb_types::array!(data_points, *index, StringArray)) + .collect(); + + let tag_values: Vec = tag_column_arrays + .iter() + .map(|array| array.value(0).to_string()) .collect(); let mut in_memory_buffer = UncompressedInMemoryDataBuffer::new( self.tag_hash, + tag_values, self.model_table_metadata.clone(), current_batch_index, ); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index b8b57c779..ff353a718 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -313,6 +313,7 @@ impl 
UncompressedDataManager { let mut uncompressed_in_memory_data_buffer = UncompressedInMemoryDataBuffer::new( tag_hash, + tag_values, model_table_metadata, current_batch_index, ); @@ -591,16 +592,16 @@ impl UncompressedDataManager { }; let data_points = maybe_data_points?; - let uncompressed_timestamps = modelardb_types::array!(data_points, 0, TimestampArray); + let timestamp_index = model_table_metadata.timestamp_column_index; + let uncompressed_timestamps = + modelardb_types::array!(data_points, timestamp_index, TimestampArray); let compressed_segments = model_table_metadata .field_column_indices .iter() - .enumerate() - .map(|(value_index, field_column_index)| { - // One is added to value_index as the first array contains the timestamps. + .map(|field_column_index| { let uncompressed_values = - modelardb_types::array!(data_points, value_index + 1, ValueArray); + modelardb_types::array!(data_points, *field_column_index, ValueArray); let univariate_id = tag_hash | (*field_column_index as u64); let error_bound = model_table_metadata.error_bounds[*field_column_index]; From 9ada71e0027d74a2240fc30d1d0bbb97090876c7 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 00:27:38 +0100 Subject: [PATCH 15/69] Add a test method to get uncompressed data for a model table --- .../src/storage/uncompressed_data_manager.rs | 29 +++---------------- crates/modelardb_storage/src/test.rs | 27 +++++++++++++++-- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index ff353a718..4f622d18d 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -670,7 +670,6 @@ mod tests { use std::sync::Arc; use datafusion::arrow::array::StringBuilder; - use datafusion::arrow::datatypes::SchemaRef; use 
datafusion::arrow::record_batch::RecordBatch; use modelardb_common::test::{ COMPRESSED_RESERVED_MEMORY_IN_BYTES, INGESTED_RESERVED_MEMORY_IN_BYTES, @@ -714,7 +713,7 @@ mod tests { // Ingest a single data point and sleep to allow the ingestion thread to finish. let mut storage_engine = context.storage_engine.write().await; - let data = uncompressed_data(1, model_table_metadata.schema.clone()); + let data = test::uncompressed_model_table_record_batch(1); storage_engine .insert_data_points(model_table_metadata, data) @@ -759,7 +758,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (data_manager, model_table_metadata) = create_managers(&temp_dir).await; - let data = uncompressed_data(1, model_table_metadata.schema.clone()); + let data = test::uncompressed_model_table_record_batch(1); let ingested_data_buffer = IngestedDataBuffer::new(model_table_metadata, data); data_manager @@ -776,7 +775,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (data_manager, model_table_metadata) = create_managers(&temp_dir).await; - let data = uncompressed_data(2, model_table_metadata.schema.clone()); + let data = test::uncompressed_model_table_record_batch(2); let ingested_data_buffer = IngestedDataBuffer::new(model_table_metadata, data); data_manager @@ -793,7 +792,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (data_manager, model_table_metadata) = create_managers(&temp_dir).await; - let data = uncompressed_data(2, model_table_metadata.schema.clone()); + let data = test::uncompressed_model_table_record_batch(2); let data_size = data.get_array_memory_size(); // Simulate StorageEngine decrementing ingested memory when receiving ingested data. @@ -816,26 +815,6 @@ mod tests { ); } - /// Create a [`RecordBatch`] with data that resembles uncompressed data with a single tag and two - /// field columns. The returned data has `row_count` rows, with a different tag for each row. 
- /// Also create model table metadata for a model table that matches the created data. - fn uncompressed_data(row_count: usize, schema: SchemaRef) -> RecordBatch { - let tags: Vec = (0..row_count).map(|tag| tag.to_string()).collect(); - let timestamps: Vec = (0..row_count).map(|ts| ts as Timestamp).collect(); - let values: Vec = (0..row_count).map(|value| value as Value).collect(); - - RecordBatch::try_new( - schema, - vec![ - Arc::new(TimestampArray::from(timestamps)), - Arc::new(ValueArray::from(values.clone())), - Arc::new(ValueArray::from(values)), - Arc::new(StringArray::from(tags)), - ], - ) - .unwrap() - } - #[tokio::test] async fn test_can_insert_data_point_into_new_uncompressed_data_buffer() { let temp_dir = tempfile::tempdir().unwrap(); diff --git a/crates/modelardb_storage/src/test.rs b/crates/modelardb_storage/src/test.rs index 48641e437..68dd658f5 100644 --- a/crates/modelardb_storage/src/test.rs +++ b/crates/modelardb_storage/src/test.rs @@ -17,13 +17,17 @@ use std::sync::Arc; -use arrow::array::{BinaryArray, Float32Array, RecordBatch, UInt16Array, UInt64Array, UInt8Array}; +use arrow::array::{ + BinaryArray, Float32Array, RecordBatch, StringArray, UInt16Array, UInt64Array, UInt8Array, +}; use arrow::compute::concat_batches; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ONE, ERROR_BOUND_ZERO}; use modelardb_types::functions; use modelardb_types::schemas::{COMPRESSED_SCHEMA, TABLE_METADATA_SCHEMA}; -use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound, TimestampArray, ValueArray}; +use modelardb_types::types::{ + ArrowTimestamp, ArrowValue, ErrorBound, Timestamp, TimestampArray, Value, ValueArray, +}; use crate::metadata::model_table_metadata::ModelTableMetadata; use crate::{model_table_metadata_to_record_batch, normal_table_metadata_to_record_batch}; @@ -114,6 +118,25 @@ pub fn model_table_metadata_arc() -> Arc { Arc::new(model_table_metadata()) } +/// 
Create a [`RecordBatch`] with data that resembles uncompressed data with a single tag and two +/// field columns. The returned data has `row_count` rows, with a different tag for each row. +pub fn uncompressed_model_table_record_batch(row_count: usize) -> RecordBatch { + let tags: Vec = (0..row_count).map(|tag| tag.to_string()).collect(); + let timestamps: Vec = (0..row_count).map(|ts| ts as Timestamp).collect(); + let values: Vec = (0..row_count).map(|value| value as Value).collect(); + + RecordBatch::try_new( + model_table_metadata().schema.clone(), + vec![ + Arc::new(TimestampArray::from(timestamps)), + Arc::new(ValueArray::from(values.clone())), + Arc::new(ValueArray::from(values)), + Arc::new(StringArray::from(tags)), + ], + ) + .unwrap() +} + /// Return a [`RecordBatch`] containing three compressed segments. pub fn compressed_segments_record_batch() -> RecordBatch { compressed_segments_record_batch_with_time(1, 0, 0.0) From 91723dc505d19c33a6efc7b82006b4f446ccf563 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 00:40:54 +0100 Subject: [PATCH 16/69] Add method to get column arrays from model table metadata --- .../src/metadata/model_table_metadata.rs | 81 ++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 8ec3f36a8..42d9a5feb 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -19,11 +19,13 @@ use std::result::Result as StdResult; use std::sync::Arc; +use arrow::array::StringArray; +use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Schema}; use datafusion::common::DFSchema; use datafusion::error::DataFusionError; use datafusion::logical_expr::expr::Expr; -use 
modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound}; +use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound, TimestampArray, ValueArray}; use crate::error::{ModelarDbStorageError, Result}; use crate::parser::tokenize_and_parse_sql_expression; @@ -166,6 +168,44 @@ impl ModelTableMetadata { pub fn is_tag(&self, index: usize) -> bool { self.tag_column_indices.contains(&index) } + + /// Return the column arrays for the timestamp, field, and tag columns in `record_batch`. If + /// `record_batch` does not contain the required columns, return [`ModelarDbStorageError`]. + pub fn column_arrays<'a>( + &self, + record_batch: &'a RecordBatch, + ) -> Result<( + &'a TimestampArray, + Vec<&'a ValueArray>, + Vec<&'a StringArray>, + )> { + if record_batch.schema() != self.schema { + return Err(ModelarDbStorageError::InvalidArgument( + "The record batch does not match the schema of the model table.".to_owned(), + )); + } + + let timestamp_column_array = + modelardb_types::array!(record_batch, self.timestamp_column_index, TimestampArray); + + let field_column_arrays: Vec<_> = self + .field_column_indices + .iter() + .map(|index| modelardb_types::array!(record_batch, *index, ValueArray)) + .collect(); + + let tag_column_arrays: Vec<_> = self + .tag_column_indices + .iter() + .map(|index| modelardb_types::array!(record_batch, *index, StringArray)) + .collect(); + + Ok(( + timestamp_column_array, + field_column_arrays, + tag_column_arrays, + )) + } } /// Compute the indices of all columns in `schema` with `data_type`. 
@@ -442,6 +482,45 @@ mod test { assert!(model_table_metadata.is_tag(3)); } + #[test] + fn test_column_arrays() { + let model_table_metadata = test::model_table_metadata(); + let record_batch = test::uncompressed_model_table_record_batch(1); + + let (timestamp_column_array, field_column_arrays, tag_column_arrays) = + model_table_metadata.column_arrays(&record_batch).unwrap(); + + assert_eq!( + modelardb_types::array!(record_batch, 0, TimestampArray), + timestamp_column_array + ); + assert_eq!( + modelardb_types::array!(record_batch, 1, ValueArray), + field_column_arrays[0] + ); + assert_eq!( + modelardb_types::array!(record_batch, 2, ValueArray), + field_column_arrays[1] + ); + assert_eq!( + modelardb_types::array!(record_batch, 3, StringArray), + tag_column_arrays[0] + ); + } + + #[test] + fn test_column_arrays_with_invalid_schema() { + let model_table_metadata = test::model_table_metadata(); + let record_batch = test::normal_table_record_batch(); + + let result = model_table_metadata.column_arrays(&record_batch); + + assert_eq!( + result.unwrap_err().to_string(), + "Invalid argument: The record batch does not match the schema of the model table." + ); + } + // Tests for GeneratedColumn. 
#[test] fn test_can_create_generated_column() { From af7b4408dbccd62b4fb386c81348d90ea21346ec Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 00:42:18 +0100 Subject: [PATCH 17/69] Use method to get column arrays instead of doing it manually --- .../src/storage/uncompressed_data_buffer.rs | 21 ++------------ .../src/storage/uncompressed_data_manager.rs | 29 ++++--------------- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index c3aa6597b..0dfd4700f 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -27,7 +27,7 @@ use datafusion::arrow::compute; use datafusion::arrow::record_batch::RecordBatch; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; use modelardb_types::types::{ - Timestamp, TimestampArray, TimestampBuilder, Value, ValueArray, ValueBuilder, + Timestamp, TimestampArray, TimestampBuilder, Value, ValueBuilder, }; use object_store::path::Path; use object_store::ObjectStore; @@ -365,23 +365,8 @@ impl UncompressedOnDiskDataBuffer { ) -> Result { let data_points = self.record_batch().await?; - let timestamp_index = self.model_table_metadata.timestamp_column_index; - let timestamp_column_array = - modelardb_types::array!(data_points, timestamp_index, TimestampArray); - - let field_column_arrays: Vec<_> = self - .model_table_metadata - .field_column_indices - .iter() - .map(|index| modelardb_types::array!(data_points, *index, ValueArray)) - .collect(); - - let tag_column_arrays: Vec<_> = self - .model_table_metadata - .tag_column_indices - .iter() - .map(|index| modelardb_types::array!(data_points, *index, StringArray)) - .collect(); + let (timestamp_column_array, field_column_arrays, tag_column_arrays) = + 
self.model_table_metadata.column_arrays(&data_points)?; let tag_values: Vec = tag_column_arrays .iter() diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 4f622d18d..dd536610f 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -23,10 +23,9 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; use dashmap::DashMap; -use datafusion::arrow::array::StringArray; use futures::StreamExt; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; -use modelardb_types::types::{Timestamp, TimestampArray, Value, ValueArray}; +use modelardb_types::types::{Timestamp, Value, ValueArray}; use object_store::path::{Path, PathPart}; use tokio::runtime::Runtime; use tracing::{debug, error, warn}; @@ -173,24 +172,9 @@ impl UncompressedDataManager { // Read the current batch index as it may be updated in parallel. let current_batch_index = self.current_batch_index.load(Ordering::Relaxed); - // Prepare the timestamp column for iteration. - let timestamp_index = model_table_metadata.timestamp_column_index; - let timestamp_column_array = - modelardb_types::array!(data_points, timestamp_index, TimestampArray); - - // Prepare the tag columns for iteration. - let tag_column_arrays: Vec<_> = model_table_metadata - .tag_column_indices - .iter() - .map(|index| modelardb_types::array!(data_points, *index, StringArray)) - .collect(); - - // Prepare the field columns for iteration. - let field_column_arrays: Vec<_> = model_table_metadata - .field_column_indices - .iter() - .map(|index| modelardb_types::array!(data_points, *index, ValueArray)) - .collect(); + // Prepare the columns for iteration. 
+ let (timestamp_column_array, field_column_arrays, tag_column_arrays) = + model_table_metadata.column_arrays(&data_points)?; // For each data point, compute a hash from the tags and pass the fields to the storage // engine so they can be added to the appropriate UncompressedDataBuffer. @@ -592,9 +576,8 @@ impl UncompressedDataManager { }; let data_points = maybe_data_points?; - let timestamp_index = model_table_metadata.timestamp_column_index; - let uncompressed_timestamps = - modelardb_types::array!(data_points, timestamp_index, TimestampArray); + let (uncompressed_timestamps, _field_column_arrays, _tag_column_arrays) = + model_table_metadata.column_arrays(&data_points)?; let compressed_segments = model_table_metadata .field_column_indices From 84c08a18a893239169968da27b50ac5b0e1c771d Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 18:25:26 +0100 Subject: [PATCH 18/69] Fix tests after changes to uncompressed data buffers --- .../src/storage/uncompressed_data_buffer.rs | 47 ++++++++++++------ .../src/storage/uncompressed_data_manager.rs | 49 +++++++++++-------- .../src/metadata/table_metadata_manager.rs | 2 +- 3 files changed, 61 insertions(+), 37 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 0dfd4700f..f4c9daf4f 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -26,9 +26,7 @@ use datafusion::arrow::array::{Array, ArrayBuilder, StringArray}; use datafusion::arrow::compute; use datafusion::arrow::record_batch::RecordBatch; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; -use modelardb_types::types::{ - Timestamp, TimestampArray, TimestampBuilder, Value, ValueBuilder, -}; +use modelardb_types::types::{Timestamp, TimestampArray, TimestampBuilder, Value, ValueBuilder}; 
use object_store::path::Path; use object_store::ObjectStore; use tracing::debug; @@ -116,7 +114,7 @@ impl UncompressedInMemoryDataBuffer { /// Return how many data points the [`UncompressedInMemoryDataBuffer`] currently contains. pub(super) fn len(&self) -> usize { - // The length is always the same for both builders. + // The length is always the same for all builders. self.timestamps.len() } @@ -426,13 +424,15 @@ mod tests { use tokio::runtime::Runtime; const CURRENT_BATCH_INDEX: u64 = 1; - const TAG_HASH: u64 = 1; + const TAG_VALUE: &str = "tag"; + const TAG_HASH: u64 = 15537859409877038916; // Tests for UncompressedInMemoryDataBuffer. #[test] fn test_get_in_memory_data_buffer_memory_size() { let uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -459,6 +459,7 @@ mod tests { fn test_get_in_memory_data_buffer_len() { let uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -470,6 +471,7 @@ mod tests { fn test_can_insert_data_point_into_in_memory_data_buffer() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -482,6 +484,7 @@ mod tests { fn test_check_if_in_memory_data_buffer_is_unused() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX - 1, ); @@ -500,6 +503,7 @@ mod tests { fn test_check_is_in_memory_data_buffer_full() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -512,6 +516,7 @@ mod tests { fn test_check_is_in_memory_data_buffer_not_full() { let uncompressed_buffer = 
UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -525,6 +530,7 @@ mod tests { fn test_in_memory_data_buffer_panic_if_inserting_data_point_when_full() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -534,17 +540,20 @@ mod tests { #[tokio::test] async fn test_get_record_batch_from_in_memory_data_buffer() { + let model_table_metadata = test::model_table_metadata_arc(); let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, - test::model_table_metadata_arc(), + vec![TAG_VALUE.to_owned()], + model_table_metadata.clone(), CURRENT_BATCH_INDEX, ); insert_data_points(uncompressed_buffer.capacity(), &mut uncompressed_buffer); let capacity = uncompressed_buffer.capacity(); let data = uncompressed_buffer.record_batch().await.unwrap(); - assert_eq!(data.num_columns(), 3); + assert_eq!(data.num_rows(), capacity); + assert_eq!(data.schema(), model_table_metadata.schema); } proptest! { @@ -553,9 +562,11 @@ mod tests { // tokio::test is not supported in proptest! due to proptest-rs/proptest/issues/179. 
let runtime = Runtime::new().unwrap(); + let model_table_metadata = test::model_table_metadata_arc(); let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, - test::model_table_metadata_arc(), + vec![TAG_VALUE.to_owned()], + model_table_metadata.clone(), CURRENT_BATCH_INDEX, ); @@ -566,7 +577,7 @@ mod tests { } let data = runtime.block_on(uncompressed_buffer.record_batch()).unwrap(); - assert_eq!(data.num_columns(), 3); + assert_eq!(data.schema(), model_table_metadata.schema); let timestamps = modelardb_types::array!(data, 0, TimestampArray); assert!(timestamps.values().windows(2).all(|pair| pair[0] <= pair[1])); } @@ -576,6 +587,7 @@ mod tests { async fn test_in_memory_data_buffer_can_spill_not_full_buffer() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -591,7 +603,7 @@ mod tests { .unwrap(); let uncompressed_path = temp_dir.path().join(format!( - "{UNCOMPRESSED_DATA_FOLDER}/{}/1", + "{UNCOMPRESSED_DATA_FOLDER}/{}/{TAG_HASH}", test::MODEL_TABLE_NAME )); assert_eq!(uncompressed_path.read_dir().unwrap().count(), 1) @@ -601,6 +613,7 @@ mod tests { async fn test_in_memory_data_buffer_can_spill_full_buffer() { let mut uncompressed_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -616,7 +629,7 @@ mod tests { .unwrap(); let uncompressed_path = temp_dir.path().join(format!( - "{UNCOMPRESSED_DATA_FOLDER}/{}/1", + "{UNCOMPRESSED_DATA_FOLDER}/{}/{TAG_HASH}", test::MODEL_TABLE_NAME )); assert_eq!(uncompressed_path.read_dir().unwrap().count(), 1) @@ -632,13 +645,13 @@ mod tests { .path() .join(UNCOMPRESSED_DATA_FOLDER) .join(test::MODEL_TABLE_NAME) - .join("1") + .join(TAG_HASH.to_string()) .join("1234567890123.parquet"); assert!(spilled_buffer_path.exists()); let data = uncompressed_on_disk_buffer.record_batch().await.unwrap(); - 
assert_eq!(data.num_columns(), 3); + assert_eq!(data.schema(), test::model_table_metadata().schema); assert_eq!(data.num_rows(), *UNCOMPRESSED_DATA_BUFFER_CAPACITY); assert!(!spilled_buffer_path.exists()); @@ -649,9 +662,11 @@ mod tests { // tokio::test is not supported in proptest! due to proptest-rs/proptest/issues/179. let runtime = Runtime::new().unwrap(); + let model_table_metadata = test::model_table_metadata_arc(); let mut uncompressed_in_memory_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, - test::model_table_metadata_arc(), + vec![TAG_VALUE.to_owned()], + model_table_metadata.clone(), CURRENT_BATCH_INDEX, ); @@ -672,7 +687,7 @@ mod tests { assert_eq!(spilled_buffers.len(), 1); let data = runtime.block_on(uncompressed_on_disk_buffer.record_batch()).unwrap(); - assert_eq!(data.num_columns(), 3); + assert_eq!(data.schema(), model_table_metadata.schema); let timestamps = modelardb_types::array!(data, 0, TimestampArray); assert!(timestamps.values().windows(2).all(|pair| pair[0] <= pair[1])); @@ -703,6 +718,7 @@ mod tests { // The creation of record_batch empties uncompressed_in_memory_buffer_to_be_spilled. 
let mut uncompressed_in_memory_buffer = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); @@ -738,6 +754,7 @@ mod tests { let mut uncompressed_in_memory_buffer_to_be_spilled = UncompressedInMemoryDataBuffer::new( TAG_HASH, + vec![TAG_VALUE.to_owned()], test::model_table_metadata_arc(), CURRENT_BATCH_INDEX, ); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index dd536610f..842dbaa35 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -660,7 +660,7 @@ mod tests { }; use modelardb_storage::test; use modelardb_types::schemas::UNCOMPRESSED_SCHEMA; - use modelardb_types::types::{TimestampBuilder, ValueBuilder}; + use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueBuilder}; use object_store::local::LocalFileSystem; use tempfile::TempDir; use tokio::time::{sleep, Duration}; @@ -668,7 +668,8 @@ mod tests { use crate::storage::UNCOMPRESSED_DATA_BUFFER_CAPACITY; use crate::{ClusterMode, DataFolders}; - const TAG_HASH: u64 = 9674644176454356993; + const TAG_VALUE: &str = "tag"; + const TAG_HASH: u64 = 15537859409877038916; // Tests for UncompressedDataManager. 
#[tokio::test] @@ -803,7 +804,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (mut data_manager, model_table_metadata) = create_managers(&temp_dir).await; - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; assert!(data_manager .uncompressed_in_memory_data_buffers @@ -823,11 +824,11 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (mut data_manager, model_table_metadata) = create_managers(&temp_dir).await; - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); @@ -849,7 +850,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (mut data_manager, model_table_metadata) = create_managers(&temp_dir).await; - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); @@ -857,7 +858,7 @@ mod tests { assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 0); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 1); - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; 
assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); @@ -890,9 +891,9 @@ mod tests { field_2.append_slice(&[50.0, 100.0, 150.0]); let mut tag = StringBuilder::new(); - tag.append_value("A"); - tag.append_value("A"); - tag.append_value("A"); + tag.append_value(TAG_VALUE); + tag.append_value(TAG_VALUE); + tag.append_value(TAG_VALUE); let data = RecordBatch::try_new( model_table_metadata.schema.clone(), @@ -916,7 +917,7 @@ mod tests { assert_eq!( data_manager .uncompressed_in_memory_data_buffers - .get(&11395701956291516416) + .get(&TAG_HASH) .unwrap() .len(), 3 @@ -945,7 +946,7 @@ mod tests { *UNCOMPRESSED_DATA_BUFFER_CAPACITY, &mut data_manager, &model_table_metadata, - TAG_HASH, + TAG_VALUE, ) .await; @@ -965,7 +966,7 @@ mod tests { *UNCOMPRESSED_DATA_BUFFER_CAPACITY * 2, &mut data_manager, &model_table_metadata, - TAG_HASH, + TAG_VALUE, ) .await; @@ -1014,7 +1015,7 @@ mod tests { 1, &mut data_manager, &model_table_metadata.clone(), - tag_hash as u64, + &tag_hash.to_string(), ) .await; } @@ -1034,7 +1035,7 @@ mod tests { ); // If there is enough memory to hold n full buffers, n + 1 are needed to spill a buffer. - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; // One of the buffers should be spilled due to the memory limit being exceeded. 
assert_eq!( @@ -1065,7 +1066,7 @@ mod tests { .memory_pool .remaining_uncompressed_memory_in_bytes(); - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH).await; + insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; assert!( reserved_memory @@ -1086,7 +1087,7 @@ mod tests { *UNCOMPRESSED_DATA_BUFFER_CAPACITY, &mut data_manager, &model_table_metadata, - TAG_HASH, + TAG_VALUE, )); let remaining_memory = data_manager @@ -1194,7 +1195,7 @@ mod tests { 1, &mut data_manager, &model_table_metadata.clone(), - TAG_HASH, + TAG_VALUE, ) .await; @@ -1218,7 +1219,13 @@ mod tests { ); // Insert data that should force the existing data to now be spilled. - insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_HASH + 1).await; + insert_data_points( + 1, + &mut data_manager, + &model_table_metadata, + &format!("{TAG_VALUE}_2"), + ) + .await; assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 1); @@ -1229,7 +1236,7 @@ mod tests { count: usize, data_manager: &mut UncompressedDataManager, model_table_metadata: &Arc, - tag_hash: u64, + tag_value: &str, ) { let values: &[Value] = &[37.0, 73.0]; let current_batch_index = 0; @@ -1237,7 +1244,7 @@ mod tests { for i in 0..count { data_manager .insert_data_point( - tag_hash, + vec![tag_value.to_owned()], i as i64, &mut values.iter().copied(), model_table_metadata.clone(), diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index b73c923d8..ec48af975 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -743,7 +743,7 @@ mod tests { #[tokio::test] async fn test_drop_model_table_metadata() { - let (temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; + let (_temp_dir, 
metadata_manager) = create_metadata_manager_and_save_model_table().await; metadata_manager .drop_table_metadata(test::MODEL_TABLE_NAME) From d2e6f1d3fc2a21a3c16e842fbe309e55905a263e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 20:12:09 +0100 Subject: [PATCH 19/69] Pass tag values and field column index to try_compress() instead of univariate ID --- .../src/storage/uncompressed_data_buffer.rs | 10 -------- .../src/storage/uncompressed_data_manager.rs | 25 ++++++++++--------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index f4c9daf4f..5da1a6c63 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -191,11 +191,6 @@ impl UncompressedInMemoryDataBuffer { .map_err(|error| error.into()) } - /// Return the tag hash that identifies the time series the buffer stores data points from. - pub(super) fn tag_hash(&self) -> u64 { - self.tag_hash - } - /// Return the metadata for the model table the buffer stores data points for. pub(super) fn model_table_metadata(&self) -> &Arc { &self.model_table_metadata @@ -336,11 +331,6 @@ impl UncompressedOnDiskDataBuffer { Ok(data_points) } - /// Return the tag hash that identifies the time series the buffer stores data points from. - pub(super) fn tag_hash(&self) -> u64 { - self.tag_hash - } - /// Return the metadata for the model table the buffer stores data points for. 
pub(super) fn model_table_metadata(&self) -> &Arc { &self.model_table_metadata diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 842dbaa35..611afa654 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -25,7 +25,7 @@ use std::sync::Arc; use dashmap::DashMap; use futures::StreamExt; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; -use modelardb_types::types::{Timestamp, Value, ValueArray}; +use modelardb_types::types::{Timestamp, Value}; use object_store::path::{Path, PathPart}; use tokio::runtime::Runtime; use tracing::{debug, error, warn}; @@ -555,12 +555,11 @@ impl UncompressedDataManager { &self, uncompressed_data_buffer: UncompressedDataBuffer, ) -> Result<()> { - let (memory_use, maybe_data_points, tag_hash, model_table_metadata) = + let (memory_use, maybe_data_points, model_table_metadata) = match uncompressed_data_buffer { UncompressedDataBuffer::InMemory(mut uncompressed_in_memory_data_buffer) => ( uncompressed_in_memory_data_buffer.memory_size(), uncompressed_in_memory_data_buffer.record_batch().await, - uncompressed_in_memory_data_buffer.tag_hash(), uncompressed_in_memory_data_buffer .model_table_metadata() .clone(), @@ -568,7 +567,6 @@ impl UncompressedDataManager { UncompressedDataBuffer::OnDisk(uncompressed_on_disk_data_buffer) => ( 0, uncompressed_on_disk_data_buffer.record_batch().await, - uncompressed_on_disk_data_buffer.tag_hash(), uncompressed_on_disk_data_buffer .model_table_metadata() .clone(), @@ -576,21 +574,24 @@ impl UncompressedDataManager { }; let data_points = maybe_data_points?; - let (uncompressed_timestamps, _field_column_arrays, _tag_column_arrays) = + let (uncompressed_timestamps, field_column_arrays, tag_column_arrays) = model_table_metadata.column_arrays(&data_points)?; - let compressed_segments = 
model_table_metadata - .field_column_indices + let tag_values: Vec = tag_column_arrays .iter() - .map(|field_column_index| { - let uncompressed_values = - modelardb_types::array!(data_points, *field_column_index, ValueArray); - let univariate_id = tag_hash | (*field_column_index as u64); + .map(|array| array.value(0).to_string()) + .collect(); + + let compressed_segments = field_column_arrays + .iter() + .zip(model_table_metadata.field_column_indices.iter()) + .map(|(uncompressed_values, field_column_index)| { let error_bound = model_table_metadata.error_bounds[*field_column_index]; // unwrap() is safe as uncompressed_timestamps and uncompressed_values have the same length. modelardb_compression::try_compress( - univariate_id, + tag_values.clone(), + field_column_index, error_bound, uncompressed_timestamps, uncompressed_values, From 44dd8b5a936faeb6d62221c1ef987925c68023b6 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 20:25:42 +0100 Subject: [PATCH 20/69] Remove UNCOMPRESSED_SCHEMA --- .../src/storage/uncompressed_data_manager.rs | 46 ++++++++----------- crates/modelardb_types/src/macros.rs | 11 +++-- crates/modelardb_types/src/schemas.rs | 10 +--- crates/modelardb_types/src/types.rs | 5 +- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 611afa654..1d784fe9f 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -555,23 +555,22 @@ impl UncompressedDataManager { &self, uncompressed_data_buffer: UncompressedDataBuffer, ) -> Result<()> { - let (memory_use, maybe_data_points, model_table_metadata) = - match uncompressed_data_buffer { - UncompressedDataBuffer::InMemory(mut uncompressed_in_memory_data_buffer) => ( - 
uncompressed_in_memory_data_buffer.memory_size(), - uncompressed_in_memory_data_buffer.record_batch().await, - uncompressed_in_memory_data_buffer - .model_table_metadata() - .clone(), - ), - UncompressedDataBuffer::OnDisk(uncompressed_on_disk_data_buffer) => ( - 0, - uncompressed_on_disk_data_buffer.record_batch().await, - uncompressed_on_disk_data_buffer - .model_table_metadata() - .clone(), - ), - }; + let (memory_use, maybe_data_points, model_table_metadata) = match uncompressed_data_buffer { + UncompressedDataBuffer::InMemory(mut uncompressed_in_memory_data_buffer) => ( + uncompressed_in_memory_data_buffer.memory_size(), + uncompressed_in_memory_data_buffer.record_batch().await, + uncompressed_in_memory_data_buffer + .model_table_metadata() + .clone(), + ), + UncompressedDataBuffer::OnDisk(uncompressed_on_disk_data_buffer) => ( + 0, + uncompressed_on_disk_data_buffer.record_batch().await, + uncompressed_on_disk_data_buffer + .model_table_metadata() + .clone(), + ), + }; let data_points = maybe_data_points?; let (uncompressed_timestamps, field_column_arrays, tag_column_arrays) = @@ -660,8 +659,7 @@ mod tests { UNCOMPRESSED_RESERVED_MEMORY_IN_BYTES, }; use modelardb_storage::test; - use modelardb_types::schemas::UNCOMPRESSED_SCHEMA; - use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueBuilder}; + use modelardb_types::types::{TimestampBuilder, ValueBuilder}; use object_store::local::LocalFileSystem; use tempfile::TempDir; use tokio::time::{sleep, Duration}; @@ -1120,15 +1118,7 @@ mod tests { let (data_manager, model_table_metadata) = runtime.block_on(create_managers(&temp_dir)); // Add the spilled buffer. 
- let uncompressed_data = RecordBatch::try_new( - UNCOMPRESSED_SCHEMA.0.clone(), - vec![ - Arc::new(TimestampArray::from(vec![0, 1, 2])), - Arc::new(ValueArray::from(vec![0.2, 0.5, 0.1])), - ], - ) - .unwrap(); - + let uncompressed_data = test::uncompressed_model_table_record_batch(3); let spilled_buffer = runtime .block_on(UncompressedOnDiskDataBuffer::try_spill( 0, diff --git a/crates/modelardb_types/src/macros.rs b/crates/modelardb_types/src/macros.rs index 7fc5efcfe..e02d8e4c4 100644 --- a/crates/modelardb_types/src/macros.rs +++ b/crates/modelardb_types/src/macros.rs @@ -71,12 +71,17 @@ macro_rules! value { /// ``` /// # use std::sync::Arc; /// # +/// # use arrow::datatypes::{ArrowPrimitiveType, Field, Schema}; /// # use arrow::record_batch::RecordBatch; -/// # use modelardb_types::schemas::UNCOMPRESSED_SCHEMA; -/// # use modelardb_types::types::{Timestamp, TimestampArray, Value, ValueArray}; +/// # use modelardb_types::types::{ArrowTimestamp, ArrowValue, Timestamp, TimestampArray, Value, ValueArray}; +/// # +/// # let schema = Schema::new(vec![ +/// # Field::new("timestamps", ArrowTimestamp::DATA_TYPE, false), +/// # Field::new("values", ArrowValue::DATA_TYPE, false), +/// # ]); /// # /// # let record_batch = RecordBatch::try_new( -/// # UNCOMPRESSED_SCHEMA.0.clone(), +/// # Arc::new(schema), /// # vec![ /// # Arc::new(TimestampArray::from(Vec::::new())), /// # Arc::new(ValueArray::from(Vec::::new())), diff --git a/crates/modelardb_types/src/schemas.rs b/crates/modelardb_types/src/schemas.rs index 2bcea1751..8b461d778 100644 --- a/crates/modelardb_types/src/schemas.rs +++ b/crates/modelardb_types/src/schemas.rs @@ -22,20 +22,12 @@ use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use crate::types::{ ArrowTimestamp, ArrowUnivariateId, ArrowValue, CompressedSchema, ConfigurationSchema, - QueryCompressedSchema, QuerySchema, TableMetadataSchema, UncompressedSchema, + QueryCompressedSchema, QuerySchema, TableMetadataSchema, }; /// Name of the 
column used to partition the compressed segments. pub const FIELD_COLUMN: &str = "field_column"; -/// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used for uncompressed data buffers. -pub static UNCOMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { - UncompressedSchema(Arc::new(Schema::new(vec![ - Field::new("timestamps", ArrowTimestamp::DATA_TYPE, false), - Field::new("values", ArrowValue::DATA_TYPE, false), - ]))) -}); - /// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used for compressed segments. pub static COMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { let mut query_compressed_schema_fields = QUERY_COMPRESSED_SCHEMA.0.fields().to_vec(); diff --git a/crates/modelardb_types/src/types.rs b/crates/modelardb_types/src/types.rs index f3bd33034..1192f9cb4 100644 --- a/crates/modelardb_types/src/types.rs +++ b/crates/modelardb_types/src/types.rs @@ -47,10 +47,7 @@ pub type ArrowValue = arrow::datatypes::Float32Type; pub type ValueBuilder = arrow::array::PrimitiveBuilder; pub type ValueArray = arrow::array::PrimitiveArray; -// Types used for the schema of uncompressed data, compressed data, the configuration, and table metadata. -#[derive(Clone)] -pub struct UncompressedSchema(pub arrow::datatypes::SchemaRef); - +// Types used for the schema of compressed data, the configuration, and table metadata. 
#[derive(Clone)] pub struct CompressedSchema(pub arrow::datatypes::SchemaRef); From 5b95831e99802cd8e2bb4a6dca4f8b9fc6d65623 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 21:55:08 +0100 Subject: [PATCH 21/69] Remove univaraite_ids from macros --- crates/modelardb_types/src/macros.rs | 55 +++++++++++++--------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/crates/modelardb_types/src/macros.rs b/crates/modelardb_types/src/macros.rs index e02d8e4c4..633222b2a 100644 --- a/crates/modelardb_types/src/macros.rs +++ b/crates/modelardb_types/src/macros.rs @@ -115,7 +115,6 @@ macro_rules! array { /// # let record_batch = RecordBatch::try_new( /// # COMPRESSED_SCHEMA.0.clone(), /// # vec![ -/// # Arc::new(UInt64Array::from(Vec::::new())), /// # Arc::new(UInt8Array::from(Vec::::new())), /// # Arc::new(TimestampArray::from(Vec::::new())), /// # Arc::new(TimestampArray::from(Vec::::new())), @@ -128,41 +127,39 @@ macro_rules! array { /// # Arc::new(UInt16Array::from(Vec::::new())), /// # ], /// # ).unwrap(); -/// modelardb_types::arrays!(record_batch, field_columns, univariate_ids, model_type_ids, -/// start_times, end_times, timestamps, min_values, max_values, values, residuals, errors); +/// modelardb_types::arrays!(record_batch, field_columns, model_type_ids, start_times, end_times, +/// timestamps, min_values, max_values, values, residuals, errors); /// ``` /// /// # Panics /// -/// Panics if `batch` does not contain ten columns of type UInt64Array, UInt8Array, TimestampArray, +/// Panics if `batch` does not contain nine columns of type UInt8Array, TimestampArray, /// TimestampArray, BinaryArray, ValueArray, ValueArray, BinaryArray, BinaryArray, and Float32Array -/// or eleven columns of type UInt64Array, UInt8Array, TimestampArray, TimestampArray, -/// BinaryArray, ValueArray, ValueArray, BinaryArray, BinaryArray, Float32Array, and UInt16Array. 
+/// or ten columns of type UInt8Array, TimestampArray, TimestampArray, BinaryArray, ValueArray, +/// ValueArray, BinaryArray, BinaryArray, Float32Array, and UInt16Array. #[macro_export] macro_rules! arrays { - ($batch:ident, $univariate_ids:ident, $model_type_ids:ident, $start_times:ident, $end_times:ident, $timestamps:ident, $min_values:ident, $max_values:ident, $values:ident, $residuals:ident, $errors:ident) => { - let $univariate_ids = $crate::array!($batch, 0, UInt64Array); - let $model_type_ids = $crate::array!($batch, 1, UInt8Array); - let $start_times = $crate::array!($batch, 2, TimestampArray); - let $end_times = $crate::array!($batch, 3, TimestampArray); - let $timestamps = $crate::array!($batch, 4, BinaryArray); - let $min_values = $crate::array!($batch, 5, ValueArray); - let $max_values = $crate::array!($batch, 6, ValueArray); - let $values = $crate::array!($batch, 7, BinaryArray); - let $residuals = $crate::array!($batch, 8, BinaryArray); - let $errors = $crate::array!($batch, 9, Float32Array); + ($batch:ident, $model_type_ids:ident, $start_times:ident, $end_times:ident, $timestamps:ident, $min_values:ident, $max_values:ident, $values:ident, $residuals:ident, $errors:ident) => { + let $model_type_ids = $crate::array!($batch, 0, UInt8Array); + let $start_times = $crate::array!($batch, 1, TimestampArray); + let $end_times = $crate::array!($batch, 2, TimestampArray); + let $timestamps = $crate::array!($batch, 3, BinaryArray); + let $min_values = $crate::array!($batch, 4, ValueArray); + let $max_values = $crate::array!($batch, 5, ValueArray); + let $values = $crate::array!($batch, 6, BinaryArray); + let $residuals = $crate::array!($batch, 7, BinaryArray); + let $errors = $crate::array!($batch, 8, Float32Array); }; - ($batch:ident, $univariate_ids:ident, $model_type_ids:ident, $start_times:ident, $end_times:ident, $timestamps:ident, $min_values:ident, $max_values:ident, $values:ident, $residuals:ident, $errors:ident, $field_columns:ident) => { - let 
$univariate_ids = $crate::array!($batch, 0, UInt64Array); - let $model_type_ids = $crate::array!($batch, 1, UInt8Array); - let $start_times = $crate::array!($batch, 2, TimestampArray); - let $end_times = $crate::array!($batch, 3, TimestampArray); - let $timestamps = $crate::array!($batch, 4, BinaryArray); - let $min_values = $crate::array!($batch, 5, ValueArray); - let $max_values = $crate::array!($batch, 6, ValueArray); - let $values = $crate::array!($batch, 7, BinaryArray); - let $residuals = $crate::array!($batch, 8, BinaryArray); - let $errors = $crate::array!($batch, 9, Float32Array); - let $field_columns = $crate::array!($batch, 10, UInt16Array); + ($batch:ident, $model_type_ids:ident, $start_times:ident, $end_times:ident, $timestamps:ident, $min_values:ident, $max_values:ident, $values:ident, $residuals:ident, $errors:ident, $field_columns:ident) => { + let $model_type_ids = $crate::array!($batch, 0, UInt8Array); + let $start_times = $crate::array!($batch, 1, TimestampArray); + let $end_times = $crate::array!($batch, 2, TimestampArray); + let $timestamps = $crate::array!($batch, 3, BinaryArray); + let $min_values = $crate::array!($batch, 4, ValueArray); + let $max_values = $crate::array!($batch, 5, ValueArray); + let $values = $crate::array!($batch, 6, BinaryArray); + let $residuals = $crate::array!($batch, 7, BinaryArray); + let $errors = $crate::array!($batch, 8, Float32Array); + let $field_columns = $crate::array!($batch, 9, UInt16Array); }; } From 2bf27395fe559f68c9bd3b78b981101ebc56c194 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 21:57:10 +0100 Subject: [PATCH 22/69] Remove methods to convert univariate ids between int64 and uint64 --- crates/modelardb_storage/src/delta_lake.rs | 8 +- crates/modelardb_storage/src/lib.rs | 73 +------------------ .../modelardb_storage/src/query/grid_exec.rs | 4 - 3 files changed, 4 insertions(+), 81 deletions(-) diff --git 
a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 16d014992..130efe07e 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -39,10 +39,7 @@ use object_store::ObjectStore; use url::Url; use crate::error::{ModelarDbStorageError, Result}; -use crate::{ - apache_parquet_writer_properties, maybe_univariate_ids_uint64_to_int64, METADATA_FOLDER, - TABLE_FOLDER, -}; +use crate::{apache_parquet_writer_properties, METADATA_FOLDER, TABLE_FOLDER}; /// Functionality for managing Delta Lake tables in a local folder or an object store. pub struct DeltaLake { @@ -447,9 +444,6 @@ impl DeltaLake { table_name: &str, mut compressed_segments: Vec, ) -> Result { - // Reinterpret univariate_ids from uint64 to int64 if necessary to fix #187 as a stopgap until #197. - maybe_univariate_ids_uint64_to_int64(&mut compressed_segments); - // Specify that the file must be sorted by univariate_id and then by start_time. 
let sorting_columns = Some(vec![ SortingColumn::new(0, false, false), diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index b9a99fee8..16548a0b8 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -28,8 +28,8 @@ use std::result::Result as StdResult; use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float32Builder, Int64Array, - ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, UInt64Array, + Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float32Builder, + ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, }; use arrow::compute; use arrow::compute::concat_batches; @@ -55,9 +55,7 @@ use datafusion::prelude::SessionContext; use datafusion::sql::parser::Statement as DFStatement; use deltalake::DeltaTable; use futures::StreamExt; -use modelardb_types::schemas::{ - DISK_COMPRESSED_SCHEMA, QUERY_COMPRESSED_SCHEMA, TABLE_METADATA_SCHEMA, -}; +use modelardb_types::schemas::TABLE_METADATA_SCHEMA; use modelardb_types::types::ErrorBound; use object_store::path::Path; use object_store::ObjectStore; @@ -186,48 +184,6 @@ pub async fn sql_and_concat(session_context: &SessionContext, sql: &str) -> Resu Ok(record_batch) } -/// Reinterpret the bits used for univariate ids in `compressed_segments` to convert the column from -/// [`UInt64Array`] to [`Int64Array`] if the column is currently [`UInt64Array`], as the Delta Lake -/// Protocol does not support unsigned integers. `compressed_segments` is modified in-place as -/// `maybe_univariate_ids_uint64_to_int64()` is designed to be used by -/// `write_compressed_segments_to_model_table()` which owns `compressed_segments`. -pub(crate) fn maybe_univariate_ids_uint64_to_int64(compressed_segments: &mut Vec) { - for record_batch in compressed_segments { - // Only convert the univariate ids if they are stored as unsigned integers. 
The univariate - // ids can be stored as signed integers already if the compressed segments have been saved - // to disk previously. - if record_batch.schema().field(0).data_type() == &DataType::UInt64 { - let mut columns = record_batch.columns().to_vec(); - let univariate_ids = modelardb_types::array!(record_batch, 0, UInt64Array); - let signed_univariate_ids: Int64Array = - univariate_ids.unary(|value| i64::from_ne_bytes(value.to_ne_bytes())); - columns[0] = Arc::new(signed_univariate_ids); - - // unwrap() is safe as columns is constructed to match DISK_COMPRESSED_SCHEMA. - *record_batch = - RecordBatch::try_new(DISK_COMPRESSED_SCHEMA.0.clone(), columns).unwrap(); - } - } -} - -/// Reinterpret the bits used for univariate ids in `compressed_segments` to convert the column from -/// [`Int64Array`] to [`UInt64Array`] as the Delta Lake Protocol does not support unsigned integers. -/// Returns a new [`RecordBatch`] with the univariate ids stored in an [`UInt64Array`] as -/// `univariate_ids_int64_to_uint64()` is designed to be used by -/// [`futures::stream::Stream::poll_next()`] and -/// [`datafusion::physical_plan::PhysicalExpr::evaluate()`] and -/// [`datafusion::physical_plan::PhysicalExpr::evaluate()`] borrows `compressed_segments` immutably. -pub fn univariate_ids_int64_to_uint64(compressed_segments: &RecordBatch) -> RecordBatch { - let mut columns = compressed_segments.columns().to_vec(); - let signed_univariate_ids = modelardb_types::array!(compressed_segments, 0, Int64Array); - let univariate_ids: UInt64Array = - signed_univariate_ids.unary(|value| u64::from_ne_bytes(value.to_ne_bytes())); - columns[0] = Arc::new(univariate_ids); - - // unwrap() is safe as columns is constructed to match QUERY_COMPRESSED_SCHEMA. - RecordBatch::try_new(QUERY_COMPRESSED_SCHEMA.0.clone(), columns).unwrap() -} - /// Read all rows from the Apache Parquet file at the location given by `file_path` in /// `object_store` and return them as a [`RecordBatch`]. 
If the file could not be read successfully, /// [`ModelarDbStorageError`] is returned. @@ -561,33 +517,10 @@ mod tests { use arrow::datatypes::{ArrowPrimitiveType, Field, Schema}; use modelardb_types::types::ArrowValue; use object_store::local::LocalFileSystem; - use proptest::num::u64 as ProptestUnivariateId; - use proptest::{prop_assert_eq, proptest}; use tempfile::TempDir; use crate::test; - // Tests for maybe_univariate_ids_uint64_to_int64() and univariate_ids_int64_to_uint64(). - proptest! { - #[test] - fn test_univariate_ids_uint64_to_int64_to_uint64(univariate_id in ProptestUnivariateId::ANY) { - let record_batch = test::compressed_segments_record_batch_with_time(univariate_id, 0, 0.0); - let mut expected_record_batch = record_batch.clone(); - expected_record_batch.remove_column(10); - - let mut record_batches = vec![record_batch.clone()]; - maybe_univariate_ids_uint64_to_int64(&mut record_batches); - - // maybe_univariate_ids_uint64_to_int64 should not panic when called twice. - maybe_univariate_ids_uint64_to_int64(&mut record_batches); - - record_batches[0].remove_column(10); - let computed_record_batch = univariate_ids_int64_to_uint64(&record_batches[0]); - - prop_assert_eq!(expected_record_batch, computed_record_batch); - } - } - // Tests for read_record_batch_from_apache_parquet_file(). 
#[tokio::test] async fn test_read_record_batch_from_apache_parquet_file() { diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index c01bfd801..088a2dfcf 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -50,7 +50,6 @@ use modelardb_types::schemas::GRID_SCHEMA; use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueArray, ValueBuilder}; use crate::query::{QUERY_ORDER_DATA_POINT, QUERY_REQUIREMENT_SEGMENT}; -use crate::univariate_ids_int64_to_uint64; /// An execution plan that reconstructs the data points stored as compressed segments containing /// metadata and models. It is `pub(crate)` so the additional rules added to Apache DataFusion's @@ -264,9 +263,6 @@ impl GridStream { .elapsed_compute() .timer(); - // Reinterpret univariate_ids from int64 to uint64 to fix #187 as a stopgap until #197. - let batch = univariate_ids_int64_to_uint64(batch); - // Retrieve the arrays from batch and cast them to their concrete type. 
modelardb_types::arrays!( batch, From 0040f10fcaf106b9b19d98f3680e4e50b37b5c4f Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 19 Feb 2025 22:00:56 +0100 Subject: [PATCH 23/69] Remove DISK schemas --- crates/modelardb_storage/src/delta_lake.rs | 6 +++--- .../modelardb_storage/src/query/model_table.rs | 8 ++++---- crates/modelardb_types/src/schemas.rs | 18 ------------------ 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 130efe07e..d96849c69 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -31,7 +31,7 @@ use deltalake::protocol::SaveMode; use deltalake::{DeltaOps, DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_common::arguments; -use modelardb_types::schemas::{DISK_COMPRESSED_SCHEMA, FIELD_COLUMN}; +use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; use object_store::path::Path; @@ -288,13 +288,13 @@ impl DeltaLake { .await } - /// Create a Delta Lake table for a model table with `table_name` and [`DISK_COMPRESSED_SCHEMA`] + /// Create a Delta Lake table for a model table with `table_name` and [`COMPRESSED_SCHEMA`] /// if it does not already exist. Returns [`DeltaTable`] if the table could be created and /// [`ModelarDbStorageError`] if it could not. 
pub async fn create_model_table(&self, table_name: &str) -> Result { self.create_table( table_name, - &DISK_COMPRESSED_SCHEMA.0, + &COMPRESSED_SCHEMA.0, &[FIELD_COLUMN.to_owned()], self.location_of_compressed_table(table_name), SaveMode::ErrorIfExists, diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 3e6ef84b7..5f065b74d 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -42,7 +42,7 @@ use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; use deltalake::kernel::LogicalFile; use deltalake::{DeltaTable, DeltaTableError, ObjectMeta, PartitionFilter, PartitionValue}; -use modelardb_types::schemas::{DISK_QUERY_COMPRESSED_SCHEMA, FIELD_COLUMN, GRID_SCHEMA}; +use modelardb_types::schemas::{QUERY_COMPRESSED_SCHEMA, FIELD_COLUMN, GRID_SCHEMA}; use modelardb_types::types::{ArrowTimestamp, ArrowValue}; use crate::metadata::model_table_metadata::ModelTableMetadata; @@ -324,9 +324,9 @@ fn new_apache_parquet_exec( let log_store = delta_table.log_store(); let file_scan_config = FileScanConfig { object_store_url: log_store.object_store_url(), - file_schema: DISK_QUERY_COMPRESSED_SCHEMA.0.clone(), + file_schema: QUERY_COMPRESSED_SCHEMA.0.clone(), file_groups: vec![partitioned_files], - statistics: Statistics::new_unknown(&DISK_QUERY_COMPRESSED_SCHEMA.0), + statistics: Statistics::new_unknown(&QUERY_COMPRESSED_SCHEMA.0), projection: None, limit: maybe_limit, table_partition_cols: vec![], @@ -501,7 +501,7 @@ impl TableProvider for ModelTable { let maybe_physical_parquet_filters = maybe_convert_logical_expr_to_physical_expr( maybe_rewritten_parquet_filters.as_ref(), - DISK_QUERY_COMPRESSED_SCHEMA.0.clone(), + QUERY_COMPRESSED_SCHEMA.0.clone(), )?; let maybe_physical_grid_filters = maybe_convert_logical_expr_to_physical_expr( diff --git 
a/crates/modelardb_types/src/schemas.rs b/crates/modelardb_types/src/schemas.rs index 8b461d778..e8f93188d 100644 --- a/crates/modelardb_types/src/schemas.rs +++ b/crates/modelardb_types/src/schemas.rs @@ -36,19 +36,10 @@ pub static COMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { CompressedSchema(Arc::new(Schema::new(query_compressed_schema_fields))) }); -/// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used when writing compressed -/// segments to disk as the Delta Lake Protocol does not support unsigned integers. -pub static DISK_COMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { - let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields().to_vec(); - compressed_schema_fields[0] = Arc::new(Field::new("univariate_id", DataType::Int64, false)); - CompressedSchema(Arc::new(Schema::new(compressed_schema_fields))) -}); - /// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used for compressed segments when /// executing queries as [`FIELD_COLUMN`] is not stored in the Apache Parquet files. pub static QUERY_COMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { QueryCompressedSchema(Arc::new(Schema::new(vec![ - Field::new("univariate_id", DataType::UInt64, false), Field::new("model_type_id", DataType::UInt8, false), Field::new("start_time", ArrowTimestamp::DATA_TYPE, false), Field::new("end_time", ArrowTimestamp::DATA_TYPE, false), @@ -61,15 +52,6 @@ pub static QUERY_COMPRESSED_SCHEMA: LazyLock = LazyLock:: ]))) }); -/// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used when reading compressed -/// segments from disk as the Delta Lake Protocol does not support unsigned integers. 
-pub static DISK_QUERY_COMPRESSED_SCHEMA: LazyLock = LazyLock::new(|| { - let mut query_compressed_schema_fields = QUERY_COMPRESSED_SCHEMA.0.fields().to_vec(); - query_compressed_schema_fields[0] = - Arc::new(Field::new("univariate_id", DataType::Int64, false)); - CompressedSchema(Arc::new(Schema::new(query_compressed_schema_fields))) -}); - /// Minimum size of the metadata required for a compressed segment. Meaning that the sizes of /// `timestamps` and `values` are not included as they are [`DataType::Binary`] and thus their size /// depend on which model is selected to represent the values for that compressed segment. From a7207d39551f4974ad9fda1edf8282252461d0cf Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 19:28:49 +0100 Subject: [PATCH 24/69] Add compressed schema to model table metadata --- .../src/metadata/model_table_metadata.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 42d9a5feb..0c416298d 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -25,6 +25,7 @@ use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Schema}; use datafusion::common::DFSchema; use datafusion::error::DataFusionError; use datafusion::logical_expr::expr::Expr; +use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound, TimestampArray, ValueArray}; use crate::error::{ModelarDbStorageError, Result}; @@ -52,6 +53,8 @@ pub struct ModelTableMetadata { /// Expressions to create generated columns in the `query_schema`. Only field columns can be /// generated by [`Expr`], so [`None`] is stored for timestamp, tag, and stored field columns. 
pub generated_columns: Vec>, + /// Schema of the compressed segments that are stored in the model table. + pub compressed_schema: Arc, } impl ModelTableMetadata { @@ -141,6 +144,14 @@ impl ModelTableMetadata { let tag_column_indices = compute_indices_of_columns_with_data_type(&schema_without_generated, DataType::Utf8); + // Add the tag columns to the base schema for compressed segments. + let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + for index in &tag_column_indices { + compressed_schema_fields.push(Arc::new(schema_without_generated.field(*index).clone())); + } + + let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); + Ok(Self { name, schema: schema_without_generated, @@ -151,6 +162,7 @@ impl ModelTableMetadata { query_schema, query_schema_to_schema: field_indices_without_generated, generated_columns, + compressed_schema, }) } From 68c9bdb9cca6868e16d715503d2114b4b08d91e1 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 19:38:17 +0100 Subject: [PATCH 25/69] Update compression to use tag values instead of univariate id --- .../modelardb_compression/src/compression.rs | 36 ++++----- .../modelardb_compression/src/models/swing.rs | 1 - crates/modelardb_compression/src/types.rs | 79 +++++++++++-------- .../src/storage/uncompressed_data_manager.rs | 1 + crates/modelardb_storage/src/delta_lake.rs | 2 +- 5 files changed, 64 insertions(+), 55 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 62c7906d9..fa5572dbc 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -17,6 +17,9 @@ //! using the model types in [`models`] to produce compressed segments containing metadata and //! models. 
+use std::sync::Arc; + +use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{ErrorBound, TimestampArray, ValueArray}; @@ -41,7 +44,9 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// and `uncompressed_values` have different lengths, otherwise the resulting compressed segments /// are returned as a [`RecordBatch`] with the [`COMPRESSED_SCHEMA`] schema. pub fn try_compress( - univariate_id: u64, + compressed_schema: Arc, + tag_values: Vec, + field_column_index: &usize, error_bound: ErrorBound, uncompressed_timestamps: &TimestampArray, uncompressed_values: &ValueArray, @@ -63,7 +68,12 @@ pub fn try_compress( // Enough memory for end_index compressed segments are allocated to never require reallocation // as one compressed segment is created per data point in the absolute worst case. let end_index = uncompressed_timestamps.len(); - let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new(end_index); + let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new( + compressed_schema, + tag_values, + *field_column_index as u16, + end_index, + ); // Compress the uncompressed timestamps and uncompressed values. let mut current_start_index = 0; @@ -84,7 +94,6 @@ pub fn try_compress( // Flush the previous model and any residual value if either exists. if current_start_index > 0 { store_compressed_segments_with_model_and_or_residuals( - univariate_id, error_bound, previous_model, current_start_index - 1, @@ -109,7 +118,6 @@ pub fn try_compress( } store_compressed_segments_with_model_and_or_residuals( - univariate_id, error_bound, previous_model, end_index - 1, @@ -155,7 +163,6 @@ pub(crate) fn fit_next_model( /// - One compressed segment that stores residuals as a single model if `maybe_model` is /// [`None`]. 
fn store_compressed_segments_with_model_and_or_residuals( - univariate_id: u64, error_bound: ErrorBound, maybe_model: Option, residuals_end_index: usize, @@ -168,7 +175,6 @@ fn store_compressed_segments_with_model_and_or_residuals( if (residuals_end_index - model.end_index) <= RESIDUAL_VALUES_MAX_LENGTH.into() { // Few or no residuals exists so the model and any residuals are put into one segment. model.finish( - univariate_id, error_bound, residuals_end_index, uncompressed_timestamps, @@ -180,7 +186,6 @@ fn store_compressed_segments_with_model_and_or_residuals( let model_end_index = model.end_index; model.finish( - univariate_id, error_bound, model_end_index, // No residuals are stored. uncompressed_timestamps, @@ -189,7 +194,6 @@ fn store_compressed_segments_with_model_and_or_residuals( ); compress_and_store_residuals_in_a_separate_segment( - univariate_id, error_bound, model_end_index + 1, residuals_end_index, @@ -202,7 +206,6 @@ fn store_compressed_segments_with_model_and_or_residuals( // The residuals are stored as a separate segment as the first sub-sequence of values in // `uncompressed_values` are residuals, thus the residuals must be stored in a segment. compress_and_store_residuals_in_a_separate_segment( - univariate_id, error_bound, 0, residuals_end_index, @@ -213,12 +216,10 @@ fn store_compressed_segments_with_model_and_or_residuals( } } -/// For the time series with `univariate_id`, compress the values from `start_index` to and -/// including `end_index` in `uncompressed_values` using [`Gorilla`] and store the resulting model -/// with the corresponding timestamps from `uncompressed_timestamps` as a segment in -/// `compressed_segment_batch_builder`. +/// Compress the values from `start_index` to and including `end_index` in `uncompressed_values` +/// using [`Gorilla`] and store the resulting model with the corresponding timestamps from +/// `uncompressed_timestamps` as a segment in `compressed_segment_batch_builder`. 
fn compress_and_store_residuals_in_a_separate_segment( - univariate_id: u64, error_bound: ErrorBound, start_index: usize, end_index: usize, @@ -241,7 +242,6 @@ fn compress_and_store_residuals_in_a_separate_segment( let (values, min_value, max_value) = gorilla.model(); compressed_segment_batch_builder.append_compressed_segment( - univariate_id, GORILLA_ID, start_time, end_time, @@ -260,9 +260,7 @@ mod tests { use super::*; - use arrow::array::{ - ArrayBuilder, BinaryArray, Float32Array, UInt64Array, UInt64Builder, UInt8Array, - }; + use arrow::array::{ArrayBuilder, BinaryArray, Float32Array, UInt64Builder, UInt8Array}; use modelardb_common::test::data_generation::{self, ValuesStructure}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ZERO}; use modelardb_types::types::{TimestampBuilder, ValueBuilder}; @@ -977,7 +975,6 @@ mod tests { let compressed_record_batch = compressed_segment_batch_builder.finish(); modelardb_types::arrays!( compressed_record_batch, - univariate_ids, model_type_ids, start_times, end_times, @@ -990,7 +987,6 @@ mod tests { ); assert_eq!(1, compressed_record_batch.num_rows()); - assert_eq!(0, univariate_ids.value(0)); assert_eq!(GORILLA_ID, model_type_ids.value(0)); assert_eq!(100, start_times.value(0)); assert_eq!(500, end_times.value(0)); diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 13a45b6cc..53854c9dc 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -861,7 +861,6 @@ mod tests { // Extract the individual columns from the record batch. 
modelardb_types::arrays!( segments, - _univariate_id_array, model_type_id_array, start_time_array, end_time_array, diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index 46a26c3f5..eabb48a76 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -18,10 +18,11 @@ use std::debug_assert; use std::sync::Arc; -use arrow::array::{BinaryBuilder, Float32Builder, UInt16Builder, UInt64Builder, UInt8Builder}; +use arrow::array::{ + ArrayBuilder, ArrayRef, BinaryBuilder, Float32Builder, StringArray, UInt16Array, UInt8Builder, +}; +use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; -use modelardb_types::functions; -use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{ ErrorBound, Timestamp, TimestampArray, TimestampBuilder, Value, ValueArray, ValueBuilder, }; @@ -194,7 +195,6 @@ impl CompressedSegmentBuilder { /// value in `uncompressed_value` after the last value represented by the model in this segment. pub(crate) fn finish( mut self, - univariate_id: u64, error_bound: ErrorBound, residuals_end_index: usize, uncompressed_timestamps: &TimestampArray, @@ -253,7 +253,6 @@ impl CompressedSegmentBuilder { }; compressed_segment_batch_builder.append_compressed_segment( - univariate_id, self.model_type_id, start_time, end_time, @@ -401,8 +400,12 @@ impl CompressedSegmentBuilder { /// A batch of compressed segments being built. pub(crate) struct CompressedSegmentBatchBuilder { - /// Univariate id of each compressed segment in the batch. - univariate_ids: UInt64Builder, + /// Schema of the compressed segments in the batch. + compressed_schema: Arc, + /// Tag values for the time series the compressed segments in the batch belong to. + tag_values: Vec, + /// Index of the field column the compressed segments in the batch belong to. + field_column_index: u16, /// Model type id of each compressed segment in the batch. 
model_type_ids: UInt8Builder, /// First timestamp of each compressed segment in the batch. @@ -426,14 +429,19 @@ pub(crate) struct CompressedSegmentBatchBuilder { residuals: BinaryBuilder, /// Actual error of each compressed segment in the batch. error: Float32Builder, - /// Field column of each compressed segment in the batch. - field_columns: UInt16Builder, } impl CompressedSegmentBatchBuilder { - pub(crate) fn new(capacity: usize) -> Self { + pub(crate) fn new( + compressed_schema: Arc, + tag_values: Vec, + field_column_index: u16, + capacity: usize, + ) -> Self { Self { - univariate_ids: UInt64Builder::with_capacity(capacity), + compressed_schema, + tag_values, + field_column_index, model_type_ids: UInt8Builder::with_capacity(capacity), start_times: TimestampBuilder::with_capacity(capacity), end_times: TimestampBuilder::with_capacity(capacity), @@ -443,14 +451,12 @@ impl CompressedSegmentBatchBuilder { values: BinaryBuilder::with_capacity(capacity, capacity), residuals: BinaryBuilder::with_capacity(capacity, capacity), error: Float32Builder::with_capacity(capacity), - field_columns: UInt16Builder::with_capacity(capacity), } } /// Append a compressed segment to the builder. pub(crate) fn append_compressed_segment( &mut self, - univariate_id: u64, model_type_id: u8, start_time: Timestamp, end_time: Timestamp, @@ -461,8 +467,6 @@ impl CompressedSegmentBatchBuilder { residuals: &[u8], error: f32, ) { - let field_column_index = functions::univariate_id_to_column_index(univariate_id); - self.univariate_ids.append_value(univariate_id); self.model_type_ids.append_value(model_type_id); self.start_times.append_value(start_time); self.end_times.append_value(end_time); @@ -472,28 +476,37 @@ impl CompressedSegmentBatchBuilder { self.values.append_value(values); self.residuals.append_value(residuals); self.error.append_value(error); - self.field_columns.append_value(field_column_index); } /// Return [`RecordBatch`] of compressed segments and consume the builder. 
pub(crate) fn finish(mut self) -> RecordBatch { - RecordBatch::try_new( - COMPRESSED_SCHEMA.0.clone(), - vec![ - Arc::new(self.univariate_ids.finish()), - Arc::new(self.model_type_ids.finish()), - Arc::new(self.start_times.finish()), - Arc::new(self.end_times.finish()), - Arc::new(self.timestamps.finish()), - Arc::new(self.min_values.finish()), - Arc::new(self.max_values.finish()), - Arc::new(self.values.finish()), - Arc::new(self.residuals.finish()), - Arc::new(self.error.finish()), - Arc::new(self.field_columns.finish()), - ], - ) - .unwrap() + let batch_length = self.model_type_ids.len(); + let field_column_array: UInt16Array = std::iter::repeat(self.field_column_index) + .take(batch_length) + .collect(); + + let mut columns: Vec = vec![ + Arc::new(self.model_type_ids.finish()), + Arc::new(self.start_times.finish()), + Arc::new(self.end_times.finish()), + Arc::new(self.timestamps.finish()), + Arc::new(self.min_values.finish()), + Arc::new(self.max_values.finish()), + Arc::new(self.values.finish()), + Arc::new(self.residuals.finish()), + Arc::new(self.error.finish()), + Arc::new(field_column_array), + ]; + + for tag_value in &self.tag_values { + let tag_array: StringArray = std::iter::repeat(Some(tag_value)) + .take(batch_length) + .collect(); + + columns.push(Arc::new(tag_array)); + } + + RecordBatch::try_new(self.compressed_schema, columns).unwrap() } } diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 1d784fe9f..a8cb328b0 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -589,6 +589,7 @@ impl UncompressedDataManager { // unwrap() is safe as uncompressed_timestamps and uncompressed_values have the same length. 
modelardb_compression::try_compress( + model_table_metadata.compressed_schema.clone(), tag_values.clone(), field_column_index, error_bound, diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index d96849c69..082fe4fbf 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -442,7 +442,7 @@ impl DeltaLake { pub async fn write_compressed_segments_to_model_table( &self, table_name: &str, - mut compressed_segments: Vec, + compressed_segments: Vec, ) -> Result { // Specify that the file must be sorted by univariate_id and then by start_time. let sorting_columns = Some(vec![ From f125d0e0f791b13fdee99729011569e5ea8b9b96 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 21:52:00 +0100 Subject: [PATCH 26/69] Fix calls to try_compress() in tests --- .../modelardb_compression/src/compression.rs | 43 ++++++++++++++----- .../modelardb_compression/src/models/swing.rs | 21 ++++++++- crates/modelardb_compression/src/types.rs | 11 ++++- .../src/storage/uncompressed_data_manager.rs | 2 +- 4 files changed, 62 insertions(+), 15 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index fa5572dbc..01673420d 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -46,7 +46,7 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; pub fn try_compress( compressed_schema: Arc, tag_values: Vec, - field_column_index: &usize, + field_column_index: u16, error_bound: ErrorBound, uncompressed_timestamps: &TimestampArray, uncompressed_values: &ValueArray, @@ -71,7 +71,7 @@ pub fn try_compress( let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new( compressed_schema, tag_values, - *field_column_index as u16, + field_column_index, end_index, ); @@ -261,13 +261,14 @@ mod tests { use super::*; 
use arrow::array::{ArrayBuilder, BinaryArray, Float32Array, UInt64Builder, UInt8Array}; + use arrow::datatypes::{DataType, Field}; use modelardb_common::test::data_generation::{self, ValuesStructure}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ZERO}; use modelardb_types::types::{TimestampBuilder, ValueBuilder}; use crate::{models, MODEL_TYPE_NAMES}; - const UNIVARIATE_ID: u64 = 1; + const TAG_VALUE: &str = "tag"; const ADD_NOISE_RANGE: Option> = Some(1.0..1.05); const TRY_COMPRESS_TEST_LENGTH: usize = 50; @@ -275,7 +276,9 @@ mod tests { #[test] fn test_try_compress_empty_time_series_within_absolute_error_bound_zero() { let compressed_record_batch = try_compress( - UNIVARIATE_ID, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), &TimestampBuilder::new().finish(), &ValueBuilder::new().finish(), @@ -287,7 +290,9 @@ mod tests { #[test] fn test_try_compress_empty_time_series_within_relative_error_bound_zero() { let compressed_record_batch = try_compress( - UNIVARIATE_ID, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), &TimestampBuilder::new().finish(), &ValueBuilder::new().finish(), @@ -507,7 +512,9 @@ mod tests { data_generation::generate_values(uncompressed_timestamps.values(), values_structure); let compressed_record_batch = try_compress( - 1, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, error_bound, &uncompressed_timestamps, &uncompressed_values, @@ -655,7 +662,9 @@ mod tests { assert_eq!(uncompressed_timestamps.len(), uncompressed_values.len()); let compressed_record_batch = try_compress( - UNIVARIATE_ID, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, error_bound, &uncompressed_timestamps, &uncompressed_values, @@ -869,7 +878,9 @@ mod tests { ); let compressed_record_batch = try_compress( - UNIVARIATE_ID, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, error_bound, 
&uncompressed_timestamps, &uncompressed_values, @@ -960,10 +971,15 @@ mod tests { let error_bound = ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(); let uncompressed_timestamps = TimestampArray::from_iter_values((100..=500).step_by(100)); let uncompressed_values = ValueArray::from(vec![73.0, 37.0, 37.0, 37.0, 73.0]); - let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new(1); - compress_and_store_residuals_in_a_separate_segment( + let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new( + compressed_schema(), + vec![TAG_VALUE.to_owned()], 0, + 1, + ); + + compress_and_store_residuals_in_a_separate_segment( error_bound, 0, uncompressed_timestamps.len() - 1, @@ -998,4 +1014,11 @@ mod tests { assert!(residuals.value(0).is_empty()); assert!(errors.value(0).is_nan()); } + + pub fn compressed_schema() -> Arc { + let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + compressed_schema_fields.push(Arc::new(Field::new("tag", DataType::Utf8, false))); + + Arc::new(Schema::new(compressed_schema_fields)) + } } diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 53854c9dc..7269c2093 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -350,10 +350,14 @@ fn compute_slope_and_intercept( mod tests { use super::*; - use arrow::array::{BinaryArray, Float32Array, UInt64Array, UInt8Array}; + use std::sync::Arc; + + use arrow::array::{BinaryArray, Float32Array, UInt8Array}; + use arrow::datatypes::{DataType, Field, Schema}; use modelardb_common::test::{ ERROR_BOUND_ABSOLUTE_MAX, ERROR_BOUND_FIVE, ERROR_BOUND_RELATIVE_MAX, ERROR_BOUND_ZERO, }; + use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueArray, ValueBuilder}; use proptest::num::f32 as ProptestValue; use proptest::strategy::Strategy; @@ -856,7 +860,20 @@ 
mod tests { (START_TIME..end_time).step_by(SAMPLING_INTERVAL as usize), ); let values = ValueArray::from_iter_values(values); - let segments = crate::try_compress(1, error_bound, ×tamps, &values).unwrap(); + + let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + compressed_schema_fields.push(Arc::new(Field::new("tag", DataType::Utf8, false))); + let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); + + let segments = crate::try_compress( + compressed_schema, + vec!["tag".to_owned()], + 0, + error_bound, + ×tamps, + &values, + ) + .unwrap(); // Extract the individual columns from the record batch. modelardb_types::arrays!( diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index eabb48a76..b6c530f67 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -515,8 +515,10 @@ mod tests { use super::*; use arrow::array::BinaryArray; + use arrow::datatypes::{DataType, Field}; use modelardb_common::test::data_generation::{self, ValuesStructure}; use modelardb_common::test::{ERROR_BOUND_TEN, ERROR_BOUND_ZERO}; + use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{TimestampArray, ValueArray}; use crate::compression; @@ -813,10 +815,15 @@ mod tests { // Create a segment that represents its values using a model of the expected type and its // residuals using Gorilla, and then assert that the expected encoding is used for it. 
let residuals_end_index = uncompressed_timestamps.len() - 1; - let mut compressed_segment_batch_builder = CompressedSegmentBatchBuilder::new(1); + + let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + compressed_schema_fields.push(Arc::new(Field::new("tag", DataType::Utf8, false))); + let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); + + let mut compressed_segment_batch_builder = + CompressedSegmentBatchBuilder::new(compressed_schema, vec!["tag".to_owned()], 0, 1); model.finish( - 0, ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), residuals_end_index, &uncompressed_timestamps, diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index a8cb328b0..131b78401 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -591,7 +591,7 @@ impl UncompressedDataManager { modelardb_compression::try_compress( model_table_metadata.compressed_schema.clone(), tag_values.clone(), - field_column_index, + *field_column_index as u16, error_bound, uncompressed_timestamps, uncompressed_values, From a1d3e1a27ac596a3e1b303dc460805b582c213c9 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 22:14:22 +0100 Subject: [PATCH 27/69] Use compressed schema with tag column in test util function --- .../modelardb_compression/src/compression.rs | 8 +++--- crates/modelardb_compression/src/types.rs | 6 ++--- .../src/storage/compressed_data_manager.rs | 4 +-- crates/modelardb_storage/src/test.rs | 26 +++++++------------ 4 files changed, 19 insertions(+), 25 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 01673420d..8ec0d6cce 100644 --- a/crates/modelardb_compression/src/compression.rs +++ 
b/crates/modelardb_compression/src/compression.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; -use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{ErrorBound, TimestampArray, ValueArray}; use crate::error::{ModelarDbCompressionError, Result}; @@ -42,7 +41,7 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// Assumes `uncompressed_timestamps` and `uncompressed_values` are sorted according to /// `uncompressed_timestamps`. Returns [`ModelarDbCompressionError`] if `uncompressed_timestamps` /// and `uncompressed_values` have different lengths, otherwise the resulting compressed segments -/// are returned as a [`RecordBatch`] with the [`COMPRESSED_SCHEMA`] schema. +/// are returned as a [`RecordBatch`] with the [`compressed_schema`] schema. pub fn try_compress( compressed_schema: Arc, tag_values: Vec, @@ -62,7 +61,7 @@ pub fn try_compress( // If there is no uncompressed data to compress, an empty [`RecordBatch`] can be returned. 
if uncompressed_timestamps.is_empty() { - return Ok(RecordBatch::new_empty(COMPRESSED_SCHEMA.0.clone())); + return Ok(RecordBatch::new_empty(compressed_schema)); } // Enough memory for end_index compressed segments are allocated to never require reallocation @@ -264,6 +263,7 @@ mod tests { use arrow::datatypes::{DataType, Field}; use modelardb_common::test::data_generation::{self, ValuesStructure}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ZERO}; + use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{TimestampBuilder, ValueBuilder}; use crate::{models, MODEL_TYPE_NAMES}; @@ -699,7 +699,7 @@ mod tests { compressed_record_batch, ); - let model_type_ids = modelardb_types::array!(compressed_record_batch, 1, UInt8Array); + let model_type_ids = modelardb_types::array!(compressed_record_batch, 0, UInt8Array); assert_eq!(model_type_ids.values(), expected_model_type_ids); } diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index b6c530f67..34802adf0 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -834,9 +834,9 @@ mod tests { let batch = compressed_segment_batch_builder.finish(); assert_eq!(1, batch.num_rows()); - let segment_min_value = modelardb_types::array!(batch, 5, ValueArray).value(0); - let segment_max_value = modelardb_types::array!(batch, 6, ValueArray).value(0); - let segment_values = modelardb_types::array!(batch, 7, BinaryArray).value(0); + let segment_min_value = modelardb_types::array!(batch, 4, ValueArray).value(0); + let segment_max_value = modelardb_types::array!(batch, 5, ValueArray).value(0); + let segment_values = modelardb_types::array!(batch, 6, BinaryArray).value(0); assert_eq!(expected_segment_min_value, segment_min_value); assert_eq!(expected_segment_max_value, segment_max_value); diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs 
b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 861986431..394be80fa 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -610,12 +610,12 @@ mod tests { model_table_metadata, vec![ test::compressed_segments_record_batch_with_time( - COLUMN_INDEX as u64, + COLUMN_INDEX, time_ms, offset, ), test::compressed_segments_record_batch_with_time( - (COLUMN_INDEX + 1) as u64, + COLUMN_INDEX + 1, time_ms, offset, ), diff --git a/crates/modelardb_storage/src/test.rs b/crates/modelardb_storage/src/test.rs index 68dd658f5..c6df86aed 100644 --- a/crates/modelardb_storage/src/test.rs +++ b/crates/modelardb_storage/src/test.rs @@ -17,14 +17,11 @@ use std::sync::Arc; -use arrow::array::{ - BinaryArray, Float32Array, RecordBatch, StringArray, UInt16Array, UInt64Array, UInt8Array, -}; +use arrow::array::{BinaryArray, Float32Array, RecordBatch, StringArray, UInt16Array, UInt8Array}; use arrow::compute::concat_batches; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ONE, ERROR_BOUND_ZERO}; -use modelardb_types::functions; -use modelardb_types::schemas::{COMPRESSED_SCHEMA, TABLE_METADATA_SCHEMA}; +use modelardb_types::schemas::TABLE_METADATA_SCHEMA; use modelardb_types::types::{ ArrowTimestamp, ArrowValue, ErrorBound, Timestamp, TimestampArray, Value, ValueArray, }; @@ -139,24 +136,22 @@ pub fn uncompressed_model_table_record_batch(row_count: usize) -> RecordBatch { /// Return a [`RecordBatch`] containing three compressed segments. pub fn compressed_segments_record_batch() -> RecordBatch { - compressed_segments_record_batch_with_time(1, 0, 0.0) + compressed_segments_record_batch_with_time(0, 0, 0.0) } -/// Return a [`RecordBatch`] containing three compressed segments from `univariate_id`. 
The -/// compressed segments time range is from `time_ms` to `time_ms` + 3, while the value range is from -/// `offset` + 5.2 to `offset` + 34.2. +/// Return a [`RecordBatch`] containing three compressed segments. The compressed segments time +/// range is from `time_ms` to `time_ms` + 3, while the value range is from `offset` + 5.2 to +/// `offset` + 34.2. pub fn compressed_segments_record_batch_with_time( - univariate_id: u64, + field_column: u16, time_ms: i64, offset: f32, ) -> RecordBatch { - let field_column = functions::univariate_id_to_column_index(univariate_id); let start_times = vec![time_ms, time_ms + 2, time_ms + 4]; let end_times = vec![time_ms + 1, time_ms + 3, time_ms + 5]; let min_values = vec![offset + 5.2, offset + 10.3, offset + 30.2]; let max_values = vec![offset + 20.2, offset + 12.2, offset + 34.2]; - let univariate_id = UInt64Array::from(vec![univariate_id, univariate_id, univariate_id]); let model_type_id = UInt8Array::from(vec![1, 1, 2]); let start_time = TimestampArray::from(start_times); let end_time = TimestampArray::from(end_times); @@ -167,13 +162,11 @@ pub fn compressed_segments_record_batch_with_time( let residuals = BinaryArray::from_vec(vec![b"", b"", b""]); let error = Float32Array::from(vec![0.2, 0.5, 0.1]); let field_column = UInt16Array::from(vec![field_column, field_column, field_column]); - - let schema = COMPRESSED_SCHEMA.clone(); + let tag_column = StringArray::from(vec!["tag", "tag", "tag"]); RecordBatch::try_new( - schema.0, + model_table_metadata().compressed_schema, vec![ - Arc::new(univariate_id), Arc::new(model_type_id), Arc::new(start_time), Arc::new(end_time), @@ -184,6 +177,7 @@ pub fn compressed_segments_record_batch_with_time( Arc::new(residuals), Arc::new(error), Arc::new(field_column), + Arc::new(tag_column), ], ) .unwrap() From ff30a73dea97710e72b5c8c404e780f3e095aa34 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 23:16:03 +0100 Subject: [PATCH 
28/69] Use model table compressed schema in compressed data buffer --- crates/modelardb_common/src/test/mod.rs | 2 +- .../src/storage/compressed_data_buffer.rs | 32 +++++++++++-------- .../src/storage/compressed_data_manager.rs | 15 +++------ 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/crates/modelardb_common/src/test/mod.rs b/crates/modelardb_common/src/test/mod.rs index 032149efd..5bcce3428 100644 --- a/crates/modelardb_common/src/test/mod.rs +++ b/crates/modelardb_common/src/test/mod.rs @@ -24,7 +24,7 @@ pub const INGESTED_BUFFER_SIZE: usize = 1438392; pub const UNCOMPRESSED_BUFFER_SIZE: usize = 1048576; /// Expected size of the compressed segments produced in the tests. -pub const COMPRESSED_SEGMENTS_SIZE: usize = 1437; +pub const COMPRESSED_SEGMENTS_SIZE: usize = 1565; /// Number of bytes reserved for ingested data in tests. pub const INGESTED_RESERVED_MEMORY_IN_BYTES: usize = 5 * 1024 * 1024; // 5 MiB diff --git a/crates/modelardb_server/src/storage/compressed_data_buffer.rs b/crates/modelardb_server/src/storage/compressed_data_buffer.rs index 68402ff93..dd51f1080 100644 --- a/crates/modelardb_server/src/storage/compressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/compressed_data_buffer.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use datafusion::arrow::record_batch::RecordBatch; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; -use modelardb_types::schemas::COMPRESSED_SCHEMA; use crate::error::{ModelarDbServerError, Result}; @@ -54,6 +53,8 @@ impl CompressedSegmentBatch { /// model table as one or more [RecordBatches](RecordBatch) per column and providing functionality /// for appending segments and saving all segments to a single Apache Parquet file. pub(super) struct CompressedDataBuffer { + /// Metadata of the model table the buffer stores compressed segments for. + model_table_metadata: Arc, /// Compressed segments that make up the compressed data in the [`CompressedDataBuffer`]. 
compressed_segments: Vec, /// Continuously updated total sum of the size of the compressed segments. @@ -61,27 +62,28 @@ pub(super) struct CompressedDataBuffer { } impl CompressedDataBuffer { - pub(super) fn new() -> Self { + pub(super) fn new(model_table_metadata: Arc) -> Self { Self { + model_table_metadata, compressed_segments: vec![], size_in_bytes: 0, } } /// Append `compressed_segments` to the [`CompressedDataBuffer`] and return the size of - /// `compressed_segments` in bytes if their schema is [`COMPRESSED_SCHEMA`], otherwise + /// `compressed_segments` in bytes if their schema matches the model table, otherwise /// [`ModelarDbServerError`] is returned. pub(super) fn append_compressed_segments( &mut self, mut compressed_segments: Vec, ) -> Result { - if compressed_segments - .iter() - .any(|compressed_segments| compressed_segments.schema() != COMPRESSED_SCHEMA.0) - { - return Err(ModelarDbServerError::InvalidArgument( - "Compressed segments must all use COMPRESSED_SCHEMA.".to_owned(), - )); + if compressed_segments.iter().any(|compressed_segments| { + compressed_segments.schema() != self.model_table_metadata.compressed_schema + }) { + return Err(ModelarDbServerError::InvalidArgument(format!( + "Compressed segments must all match {}.", + self.model_table_metadata.name + ))); } let mut compressed_segments_size = 0; @@ -127,7 +129,8 @@ mod tests { #[test] fn test_can_append_valid_compressed_segments() { - let mut compressed_data_buffer = CompressedDataBuffer::new(); + let mut compressed_data_buffer = + CompressedDataBuffer::new(test::model_table_metadata_arc()); compressed_data_buffer .append_compressed_segments(vec![ @@ -143,7 +146,8 @@ mod tests { #[test] fn test_compressed_data_buffer_size_updated_when_appending() { - let mut compressed_data_buffer = CompressedDataBuffer::new(); + let mut compressed_data_buffer = + CompressedDataBuffer::new(test::model_table_metadata_arc()); compressed_data_buffer .append_compressed_segments(vec![ @@ -157,7 +161,9 @@ mod 
tests { #[tokio::test] async fn test_can_get_record_batches_from_compressed_data_buffer() { - let mut compressed_data_buffer = CompressedDataBuffer::new(); + let mut compressed_data_buffer = + CompressedDataBuffer::new(test::model_table_metadata_arc()); + let compressed_segments = vec![ test::compressed_segments_record_batch(), test::compressed_segments_record_batch(), diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 394be80fa..ac88bffc6 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -155,7 +155,8 @@ impl CompressedDataManager { let model_table_name = model_table_name.to_owned(); debug!("Creating compressed data buffer for table '{model_table_name}' as none exist.",); - let mut compressed_data_buffer = CompressedDataBuffer::new(); + let mut compressed_data_buffer = + CompressedDataBuffer::new(compressed_segment_batch.model_table_metadata); let segment_size = compressed_data_buffer .append_compressed_segments(compressed_segment_batch.compressed_segments); @@ -609,16 +610,8 @@ mod tests { CompressedSegmentBatch::new( model_table_metadata, vec![ - test::compressed_segments_record_batch_with_time( - COLUMN_INDEX, - time_ms, - offset, - ), - test::compressed_segments_record_batch_with_time( - COLUMN_INDEX + 1, - time_ms, - offset, - ), + test::compressed_segments_record_batch_with_time(COLUMN_INDEX, time_ms, offset), + test::compressed_segments_record_batch_with_time(COLUMN_INDEX + 1, time_ms, offset), ], ) } From 930a109cd7984e416f75b1d49f05c02806c98a99 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 23:48:27 +0100 Subject: [PATCH 29/69] Sort compressed segment files by tag columns instead of univariate id --- crates/modelardb_storage/src/delta_lake.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 
deletions(-) diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 082fe4fbf..1bea15cf9 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -444,14 +444,20 @@ impl DeltaLake { table_name: &str, compressed_segments: Vec, ) -> Result { - // Specify that the file must be sorted by univariate_id and then by start_time. - let sorting_columns = Some(vec![ - SortingColumn::new(0, false, false), - SortingColumn::new(2, false, false), - ]); + // Specify that the file must be sorted by the tag columns and then by start_time. + let mut sorting_columns = Vec::new(); + let base_compressed_schema_len = COMPRESSED_SCHEMA.0.fields().len(); + let compressed_schema_len = compressed_segments[0].schema().fields().len(); + + // Compressed segments have the tag columns at the end of the schema. + for tag_column_index in base_compressed_schema_len..compressed_schema_len { + sorting_columns.push(SortingColumn::new(tag_column_index as i32, false, false)); + } + + sorting_columns.push(SortingColumn::new(1, false, false)); let partition_columns = vec![FIELD_COLUMN.to_owned()]; - let writer_properties = apache_parquet_writer_properties(sorting_columns); + let writer_properties = apache_parquet_writer_properties(Some(sorting_columns)); self.write_record_batches_to_table( self.delta_table(table_name).await?, From e0664242f25c5e88473ecf04d74234d97e624c89 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 20 Feb 2025 23:58:32 +0100 Subject: [PATCH 30/69] Use compressed schema with tag columns when creating model tables in delta lake --- crates/modelardb_manager/src/remote.rs | 2 +- crates/modelardb_server/src/context.rs | 2 +- .../src/storage/compressed_data_manager.rs | 6 +++--- .../src/storage/data_transfer.rs | 2 +- crates/modelardb_storage/src/delta_lake.rs | 16 ++++++++++------ .../src/optimizer/model_simple_aggregates.rs | 2 +- 6 files 
changed, 17 insertions(+), 13 deletions(-) diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index ca0ef8e0b..10c55c415 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -217,7 +217,7 @@ impl FlightServiceHandler { self.context .remote_data_folder .delta_lake - .create_model_table(&model_table_metadata.name) + .create_model_table(&model_table_metadata) .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index f5417b942..593720bdc 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -161,7 +161,7 @@ impl Context { self.data_folders .local_data_folder .delta_lake - .create_model_table(&model_table_metadata.name) + .create_model_table(&model_table_metadata) .await?; let query_folder_table_metadata_manager = self diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index ac88bffc6..493eb474d 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -390,7 +390,7 @@ mod tests { let mut delta_table = local_data_folder .delta_lake - .create_model_table(test::MODEL_TABLE_NAME) + .create_model_table(&test::model_table_metadata()) .await .unwrap(); @@ -450,7 +450,7 @@ mod tests { let segments = compressed_segments_record_batch(); local_data_folder .delta_lake - .create_model_table(segments.model_table_name()) + .create_model_table(&segments.model_table_metadata) .await .unwrap(); @@ -506,7 +506,7 @@ mod tests { let segments = compressed_segments_record_batch(); local_data_folder .delta_lake - .create_model_table(segments.model_table_name()) + .create_model_table(&segments.model_table_metadata) .await .unwrap(); data_manager diff --git 
a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 29e60053c..878d49341 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -484,7 +484,7 @@ mod tests { let model_table_metadata = test::model_table_metadata(); local_data_folder .delta_lake - .create_model_table(&model_table_metadata.name) + .create_model_table(&model_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 1bea15cf9..cebb04e0d 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -39,6 +39,7 @@ use object_store::ObjectStore; use url::Url; use crate::error::{ModelarDbStorageError, Result}; +use crate::metadata::model_table_metadata::ModelTableMetadata; use crate::{apache_parquet_writer_properties, METADATA_FOLDER, TABLE_FOLDER}; /// Functionality for managing Delta Lake tables in a local folder or an object store. @@ -288,15 +289,18 @@ impl DeltaLake { .await } - /// Create a Delta Lake table for a model table with `table_name` and [`COMPRESSED_SCHEMA`] - /// if it does not already exist. Returns [`DeltaTable`] if the table could be created and + /// Create a Delta Lake table for a model table with `model_table_metadata` if it does not + /// already exist. Returns [`DeltaTable`] if the table could be created and /// [`ModelarDbStorageError`] if it could not. 
- pub async fn create_model_table(&self, table_name: &str) -> Result { + pub async fn create_model_table( + &self, + model_table_metadata: &ModelTableMetadata, + ) -> Result { self.create_table( - table_name, - &COMPRESSED_SCHEMA.0, + &model_table_metadata.name, + &model_table_metadata.compressed_schema, &[FIELD_COLUMN.to_owned()], - self.location_of_compressed_table(table_name), + self.location_of_compressed_table(&model_table_metadata.name), SaveMode::ErrorIfExists, ) .await diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 4693ef5ce..5cd11e16a 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -788,7 +788,7 @@ mod tests { let model_table_metadata = test::model_table_metadata_arc(); let delta_table = delta_lake - .create_model_table(&model_table_metadata.name) + .create_model_table(&model_table_metadata) .await .unwrap(); From 78a914bbac572823add1504831f37ed8b407df28 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 21 Feb 2025 00:26:24 +0100 Subject: [PATCH 31/69] Fix unit tests after changes to compressed segment schema --- .../src/storage/compressed_data_manager.rs | 28 ++----------------- .../src/storage/data_transfer.rs | 2 +- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 493eb474d..d84163976 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -288,18 +288,15 @@ mod tests { use super::*; use datafusion::arrow::array::{Array, Int8Array}; - use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; + use datafusion::arrow::datatypes::{DataType, 
Field, Schema}; use modelardb_common::test::{ COMPRESSED_RESERVED_MEMORY_IN_BYTES, COMPRESSED_SEGMENTS_SIZE, INGESTED_RESERVED_MEMORY_IN_BYTES, UNCOMPRESSED_RESERVED_MEMORY_IN_BYTES, }; - use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; use modelardb_storage::test; - use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound}; use tempfile::{self, TempDir}; const COLUMN_INDEX: u16 = 1; - const ERROR_BOUND_ZERO: f32 = 0.0; // Tests for insert_record_batch(). #[tokio::test] @@ -525,7 +522,7 @@ mod tests { data_manager .memory_pool .remaining_compressed_memory_in_bytes(), - 1437 + 1565 ); // There should no longer be any compressed data in memory. @@ -588,27 +585,8 @@ mod tests { /// segments. The compressed segments time range is from `time_ms` to `time_ms` + 3, while the /// value range is from `offset` + 5.2 to `offset` + 34.2. fn compressed_segment_batch_with_time(time_ms: i64, offset: f32) -> CompressedSegmentBatch { - let query_schema = Arc::new(Schema::new(vec![ - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - Field::new("field_1", ArrowValue::DATA_TYPE, false), - Field::new("field_2", ArrowValue::DATA_TYPE, false), - ])); - let model_table_metadata = Arc::new( - ModelTableMetadata::try_new( - test::MODEL_TABLE_NAME.to_owned(), - query_schema, - vec![ - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ], - vec![None, None, None], - ) - .unwrap(), - ); - CompressedSegmentBatch::new( - model_table_metadata, + test::model_table_metadata_arc(), vec![ test::compressed_segments_record_batch_with_time(COLUMN_INDEX, time_ms, offset), test::compressed_segments_record_batch_with_time(COLUMN_INDEX + 1, time_ms, offset), diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 878d49341..8b64b2cd3 100644 --- 
a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -284,7 +284,7 @@ mod tests { use modelardb_storage::test; use tempfile::{self, TempDir}; - const EXPECTED_MODEL_TABLE_FILE_SIZE: usize = 2080; + const EXPECTED_MODEL_TABLE_FILE_SIZE: usize = 2038; // Tests for data transfer component. #[tokio::test] From e26688542f3ddc59a23ebe4ea09abdb9b34cffa4 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:30:40 +0100 Subject: [PATCH 32/69] Add temporary fix to grid since tag metadata is no longer available --- crates/modelardb_compression/src/compression.rs | 3 +-- crates/modelardb_storage/src/query/grid_exec.rs | 3 +-- crates/modelardb_storage/src/query/model_table.rs | 11 +++-------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 8ec0d6cce..202899ce3 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -907,7 +907,6 @@ mod tests { modelardb_types::arrays!( compressed_record_batch, - univariate_ids, model_type_ids, start_times, end_times, @@ -924,7 +923,7 @@ mod tests { let start_index = univariate_id_builder.len(); models::grid( - univariate_ids.value(row_index), + 0, model_type_ids.value(row_index), start_times.value(row_index), end_times.value(row_index), diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index 088a2dfcf..6d90b54a3 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -266,7 +266,6 @@ impl GridStream { // Retrieve the arrays from batch and cast them to their concrete type. 
modelardb_types::arrays!( batch, - univariate_ids, model_type_ids, start_times, end_times, @@ -307,7 +306,7 @@ impl GridStream { let length_before = univariate_id_builder.len(); modelardb_compression::grid( - univariate_ids.value(row_index), + 0, model_type_ids.value(row_index), start_times.value(row_index), end_times.value(row_index), diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 5f065b74d..23ed647df 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -18,7 +18,7 @@ //! and returns a physical query plan that produces all the data points required for the query. use std::any::Any; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::result::Result as StdResult; use std::sync::Arc; @@ -509,13 +509,8 @@ impl TableProvider for ModelTable { GRID_SCHEMA.0.clone(), )?; - // Compute a mapping from hashes to the requested tag values in the requested order. If the - // server is a cloud node, use the table metadata manager for the remote metadata Delta Lake. - let hash_to_tags = self - .table_metadata_manager - .mapping_from_hash_to_tags(table_name, &stored_tag_columns_in_projection) - .await - .map_err(|error| DataFusionError::Plan(error.to_string()))?; + // TODO: Retrieve the tag values from the data instead. 
+ let hash_to_tags: HashMap> = HashMap::new(); if stored_field_columns_in_projection.is_empty() { stored_field_columns_in_projection.push(self.fallback_field_column); From e01487750c21047831c6676e2b1576be4ea11373 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Sat, 22 Feb 2025 17:42:20 +0100 Subject: [PATCH 33/69] Reformat, fix clippy errors and remove unused dependencies --- Cargo.lock | 2 -- crates/modelardb_server/src/context.rs | 2 +- .../modelardb_server/src/storage/uncompressed_data_manager.rs | 2 +- crates/modelardb_storage/Cargo.toml | 2 -- crates/modelardb_storage/src/lib.rs | 4 ++-- crates/modelardb_storage/src/query/model_table.rs | 2 +- 6 files changed, 5 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 708f762bc..7130ce9cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3016,7 +3016,6 @@ dependencies = [ "arrow-flight", "async-trait", "bytes", - "dashmap", "datafusion", "deltalake", "futures", @@ -3024,7 +3023,6 @@ dependencies = [ "modelardb_compression", "modelardb_types", "object_store", - "proptest", "sqlparser", "tempfile", "tokio", diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 593720bdc..38258a161 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -161,7 +161,7 @@ impl Context { self.data_folders .local_data_folder .delta_lake - .create_model_table(&model_table_metadata) + .create_model_table(model_table_metadata) .await?; let query_folder_table_metadata_manager = self diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 131b78401..d451bc497 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -106,7 +106,7 @@ impl UncompressedDataManager { // unwrap() is safe as data cannot be ingested 
into a model table that does not exist. let model_table_metadata = context - .model_table_metadata_from_default_database_schema(&table_name) + .model_table_metadata_from_default_database_schema(table_name) .await? .unwrap(); diff --git a/crates/modelardb_storage/Cargo.toml b/crates/modelardb_storage/Cargo.toml index 3c9080e98..64c127cc0 100644 --- a/crates/modelardb_storage/Cargo.toml +++ b/crates/modelardb_storage/Cargo.toml @@ -24,7 +24,6 @@ arrow-flight.workspace = true arrow.workspace = true async-trait.workspace = true bytes.workspace = true -dashmap.workspace = true datafusion.workspace = true deltalake = { workspace = true, features = ["datafusion", "s3"] } futures.workspace = true @@ -39,6 +38,5 @@ url.workspace = true [dev-dependencies] futures.workspace = true -proptest.workspace = true tempfile.workspace = true tokio = { workspace = true, features = ["rt-multi-thread", "signal"] } diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 16548a0b8..7dcd97e80 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -28,8 +28,8 @@ use std::result::Result as StdResult; use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float32Builder, - ListArray, ListBuilder, RecordBatch, StringArray, StringBuilder, + Array, ArrayRef, BinaryArray, BooleanArray, Float32Array, Float32Builder, ListArray, + ListBuilder, RecordBatch, StringArray, StringBuilder, }; use arrow::compute; use arrow::compute::concat_batches; diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 23ed647df..49c27fd8b 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -42,7 +42,7 @@ use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; use deltalake::kernel::LogicalFile; use 
deltalake::{DeltaTable, DeltaTableError, ObjectMeta, PartitionFilter, PartitionValue}; -use modelardb_types::schemas::{QUERY_COMPRESSED_SCHEMA, FIELD_COLUMN, GRID_SCHEMA}; +use modelardb_types::schemas::{FIELD_COLUMN, GRID_SCHEMA, QUERY_COMPRESSED_SCHEMA}; use modelardb_types::types::{ArrowTimestamp, ArrowValue}; use crate::metadata::model_table_metadata::ModelTableMetadata; From dee061dfe50b35004efe49bef92a92d41609984e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Sun, 23 Feb 2025 00:11:45 +0100 Subject: [PATCH 34/69] Remove table metadata manager from ModelTable struct --- crates/modelardb_server/src/context.rs | 26 +++++-------------- crates/modelardb_storage/src/lib.rs | 9 +++---- .../src/optimizer/model_simple_aggregates.rs | 12 --------- .../src/query/model_table.rs | 7 +---- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 38258a161..857e438ac 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -22,7 +22,6 @@ use datafusion::arrow::datatypes::{Schema, SchemaRef}; use datafusion::catalog::SchemaProvider; use datafusion::prelude::SessionContext; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; -use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_types::schemas::TABLE_METADATA_SCHEMA; use tokio::runtime::Runtime; use tokio::sync::RwLock; @@ -164,18 +163,9 @@ impl Context { .create_model_table(model_table_metadata) .await?; - let query_folder_table_metadata_manager = self - .data_folders - .query_data_folder - .table_metadata_manager - .clone(); - // Register the model table with Apache DataFusion. 
- self.register_model_table( - Arc::new(model_table_metadata.clone()), - query_folder_table_metadata_manager.clone(), - ) - .await?; + self.register_model_table(Arc::new(model_table_metadata.clone())) + .await?; // Persist the new model table to the metadata Delta Lake. self.data_folders @@ -250,22 +240,19 @@ impl Context { .model_table_metadata() .await?; - let table_metadata_manager = &self.data_folders.query_data_folder.table_metadata_manager; for metadata in model_table_metadata { - self.register_model_table(metadata, table_metadata_manager.clone()) - .await?; + self.register_model_table(metadata).await?; } Ok(()) } - /// Register the model table with `model_table_metadata` from `table_metadata_manager` in Apache - /// DataFusion. If the model table does not exist or could not be registered with Apache - /// DataFusion, return [`ModelarDbServerError`]. + /// Register the model table with `model_table_metadata` in Apache DataFusion. If the model + /// table does not exist or could not be registered with Apache DataFusion, return + /// [`ModelarDbServerError`]. 
async fn register_model_table( &self, model_table_metadata: Arc, - table_metadata_manager: Arc, ) -> Result<()> { let delta_table = self .data_folders @@ -283,7 +270,6 @@ impl Context { &self.session_context, delta_table, model_table_metadata.clone(), - table_metadata_manager, model_table_data_sink, )?; diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 7dcd97e80..8c0959a28 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -63,7 +63,6 @@ use sqlparser::ast::Statement; use crate::error::{ModelarDbStorageError, Result}; use crate::metadata::model_table_metadata::{GeneratedColumn, ModelTableMetadata}; -use crate::metadata::table_metadata_manager::TableMetadataManager; use crate::query::metadata_table::MetadataTable; use crate::query::model_table::ModelTable; use crate::query::normal_table::NormalTable; @@ -121,19 +120,17 @@ pub fn register_normal_table( Ok(()) } -/// Register the model table stored in `delta_table` with `model_table_metadata` from -/// `table_metadata_manager` and `data_sink` in `session_context`. If the model table could not be -/// registered with Apache DataFusion, return [`ModelarDbStorageError`]. +/// Register the model table stored in `delta_table` with `model_table_metadata` and `data_sink` in +/// `session_context`. If the model table could not be registered with Apache DataFusion, return +/// [`ModelarDbStorageError`]. 
pub fn register_model_table( session_context: &SessionContext, delta_table: DeltaTable, model_table_metadata: Arc, - table_metadata_manager: Arc, data_sink: Arc, ) -> Result<()> { let model_table = ModelTable::new( delta_table, - table_metadata_manager, model_table_metadata.clone(), data_sink, ); diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 5cd11e16a..699994b53 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -630,7 +630,6 @@ mod tests { use tonic::async_trait; use crate::delta_lake::DeltaLake; - use crate::metadata::table_metadata_manager::TableMetadataManager; use crate::optimizer; use crate::query::grid_exec::GridExec; use crate::query::model_table::ModelTable; @@ -766,11 +765,6 @@ mod tests { // Setup access to data and metadata in data folder. let data_folder_path = temp_dir.path(); let delta_lake = DeltaLake::try_from_local_path(data_folder_path).unwrap(); - let table_metadata_manager = Arc::new( - TableMetadataManager::try_from_path(data_folder_path, None) - .await - .unwrap(), - ); // Setup access to Apache DataFusion. 
let mut session_state_builder = SessionStateBuilder::new().with_default_features(); @@ -792,16 +786,10 @@ mod tests { .await .unwrap(); - table_metadata_manager - .save_model_table_metadata(&model_table_metadata) - .await - .unwrap(); - let model_table_data_sink = Arc::new(NoOpDataSink {}); let model_table = ModelTable::new( delta_table, - table_metadata_manager, model_table_metadata.clone(), model_table_data_sink, ); diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 49c27fd8b..a617c0983 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -46,7 +46,6 @@ use modelardb_types::schemas::{FIELD_COLUMN, GRID_SCHEMA, QUERY_COMPRESSED_SCHEM use modelardb_types::types::{ArrowTimestamp, ArrowValue}; use crate::metadata::model_table_metadata::ModelTableMetadata; -use crate::metadata::table_metadata_manager::TableMetadataManager; use crate::query::generated_as_exec::{ColumnToGenerate, GeneratedAsExec}; use crate::query::grid_exec::GridExec; use crate::query::sorted_join_exec::{SortedJoinColumnType, SortedJoinExec}; @@ -64,8 +63,6 @@ pub(crate) struct ModelTable { model_table_metadata: Arc, /// Where data should be written to. data_sink: Arc, - /// Access to metadata related to tables. - table_metadata_manager: Arc, /// Field column to use for queries that do not include fields. fallback_field_column: u16, } @@ -73,7 +70,6 @@ pub(crate) struct ModelTable { impl ModelTable { pub(crate) fn new( delta_table: DeltaTable, - table_metadata_manager: Arc, model_table_metadata: Arc, data_sink: Arc, ) -> Arc { @@ -96,7 +92,6 @@ impl ModelTable { delta_table, model_table_metadata, data_sink, - table_metadata_manager, fallback_field_column, }) } @@ -403,7 +398,7 @@ impl TableProvider for ModelTable { limit: Option, ) -> DataFusionResult> { // Create shorthands for the metadata used during planning to improve readability. 
- let table_name = self.model_table_metadata.name.as_str(); + let _table_name = self.model_table_metadata.name.as_str(); let schema = &self.model_table_metadata.schema; let tag_column_indices = &self.model_table_metadata.tag_column_indices; let query_schema = &self.model_table_metadata.query_schema; From 5652619c5e205e8f6c973db0f439c626ece34204 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Sun, 23 Feb 2025 23:29:55 +0100 Subject: [PATCH 35/69] Fix comments and remove unused variable --- crates/modelardb_storage/src/query/model_table.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index a617c0983..895267690 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -264,7 +264,7 @@ fn new_binary_expr(left: Expr, op: Operator, right: Expr) -> Expr { }) } -/// Convert `expr` to a [`Option`] with the types in `query_schema`. +/// Convert `maybe_expr` to a [`PhysicalExpr`] with the types in `query_schema` if possible. fn maybe_convert_logical_expr_to_physical_expr( maybe_expr: Option<&Expr>, query_schema: SchemaRef, @@ -342,8 +342,8 @@ fn new_apache_parquet_exec( Ok(Arc::new(apache_parquet_exec)) } -// Convert the [`LogicalFile`] `logical_file` to a [`PartitionFilter`]. A [`DataFusionError`] is -// returned if the time the file was last modified cannot be read from `logical_file`. +/// Convert the [`LogicalFile`] `logical_file` to a [`PartitionFilter`]. A [`DataFusionError`] is +/// returned if the time the file was last modified cannot be read from `logical_file`. fn logical_file_to_partitioned_file( logical_file: &LogicalFile, ) -> DataFusionResult { @@ -398,7 +398,6 @@ impl TableProvider for ModelTable { limit: Option, ) -> DataFusionResult> { // Create shorthands for the metadata used during planning to improve readability. 
- let _table_name = self.model_table_metadata.name.as_str(); let schema = &self.model_table_metadata.schema; let tag_column_indices = &self.model_table_metadata.tag_column_indices; let query_schema = &self.model_table_metadata.query_schema; @@ -487,8 +486,6 @@ impl TableProvider for ModelTable { } } - // TODO: extract all of the predicates that consist of tag = tag_value from the query so the - // segments can be pruned by univariate_id in ParquetExec and hash_to_tags can be minimized. // Filters are not converted to PhysicalExpr in rewrite_and_combine_filters() to simplify // testing rewrite_and_combine_filters() as Expr can be compared while PhysicalExpr cannot. let (maybe_rewritten_parquet_filters, maybe_rewritten_grid_filters) = From 172adb466f857fc87f1dc723f5d2c620a282904e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Sun, 23 Feb 2025 23:42:45 +0100 Subject: [PATCH 36/69] Remove utility functions to convert univariate id to tag hash and column index --- crates/modelardb_types/src/functions.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/crates/modelardb_types/src/functions.rs b/crates/modelardb_types/src/functions.rs index a70b797b2..9bcd182d7 100644 --- a/crates/modelardb_types/src/functions.rs +++ b/crates/modelardb_types/src/functions.rs @@ -15,16 +15,6 @@ //! Implementation of helper functions to operate on the types used through ModelarDB. -/// Extract the first 54-bits from `univariate_id` which is a hash computed from tags. -pub fn univariate_id_to_tag_hash(univariate_id: u64) -> u64 { - univariate_id & 18446744073709550592 -} - -/// Extract the last 10-bits from `univariate_id` which is the index of the time series column. -pub fn univariate_id_to_column_index(univariate_id: u64) -> u16 { - (univariate_id & 1023) as u16 -} - /// Normalize `name` to allow direct comparisons between names. 
pub fn normalize_name(name: &str) -> String { name.to_lowercase() From 703d4e09834bfc0f1227f68b8611b4adb1b6a1ce Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Sun, 23 Feb 2025 23:45:45 +0100 Subject: [PATCH 37/69] Remove hash_to_tags from SortedJoinExec --- .../src/query/model_table.rs | 6 +-- .../src/query/sorted_join_exec.rs | 39 ++----------------- 2 files changed, 4 insertions(+), 41 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 895267690..bc1bcdaea 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -18,7 +18,7 @@ //! and returns a physical query plan that produces all the data points required for the query. use std::any::Any; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::result::Result as StdResult; use std::sync::Arc; @@ -501,9 +501,6 @@ impl TableProvider for ModelTable { GRID_SCHEMA.0.clone(), )?; - // TODO: Retrieve the tag values from the data instead. - let hash_to_tags: HashMap> = HashMap::new(); - if stored_field_columns_in_projection.is_empty() { stored_field_columns_in_projection.push(self.fallback_field_column); } @@ -538,7 +535,6 @@ impl TableProvider for ModelTable { let sorted_join_exec = SortedJoinExec::new( schema_after_projection, stored_columns_in_projection, - Arc::new(hash_to_tags), field_column_execution_plans, ); diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index 278220502..1f411a95a 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -21,13 +21,12 @@ //! or more tag columns. 
use std::any::Any; -use std::collections::HashMap; use std::fmt::{Formatter, Result as FmtResult}; use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; -use datafusion::arrow::array::{ArrayRef, StringBuilder, UInt64Array}; +use datafusion::arrow::array::{ArrayRef, StringBuilder}; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result as DataFusionResult}; @@ -40,7 +39,6 @@ use datafusion::physical_plan::{ PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use futures::stream::{Stream, StreamExt}; -use modelardb_types::functions; use crate::query::QUERY_REQUIREMENT_DATA_POINT; @@ -62,8 +60,6 @@ pub(crate) struct SortedJoinExec { schema: SchemaRef, /// Order of columns to return. return_order: Vec, - /// Mapping from tag hash to tags. - hash_to_tags: Arc>>, /// Execution plans to read batches of data points from. inputs: Vec>, /// Properties about the plan used in query optimization. @@ -76,7 +72,6 @@ impl SortedJoinExec { pub(crate) fn new( schema: SchemaRef, return_order: Vec, - hash_to_tags: Arc>>, inputs: Vec>, ) -> Arc { // Specify that the record batches produced by the execution plan will have an unknown order @@ -93,7 +88,6 @@ impl SortedJoinExec { Arc::new(SortedJoinExec { schema, return_order, - hash_to_tags, inputs, plan_properties, metrics: ExecutionPlanMetricsSet::new(), @@ -139,7 +133,6 @@ impl ExecutionPlan for SortedJoinExec { Ok(SortedJoinExec::new( self.schema.clone(), self.return_order.clone(), - self.hash_to_tags.clone(), children, )) } else { @@ -165,7 +158,6 @@ impl ExecutionPlan for SortedJoinExec { Ok(Box::pin(SortedJoinStream::new( self.schema.clone(), self.return_order.clone(), - self.hash_to_tags.clone(), streams, BaselineMetrics::new(&self.metrics, partition), ))) @@ -208,8 +200,6 @@ struct SortedJoinStream { schema: SchemaRef, /// Order of columns to return. 
return_order: Vec, - /// Mapping from tag hash to tags. - hash_to_tags: Arc>>, /// Streams to read batches of data points from. inputs: Vec, /// Current batch of data points to join from. @@ -222,7 +212,6 @@ impl SortedJoinStream { fn new( schema: SchemaRef, return_order: Vec, - hash_to_tags: Arc>>, inputs: Vec, baseline_metrics: BaselineMetrics, ) -> Self { @@ -232,7 +221,6 @@ impl SortedJoinStream { SortedJoinStream { schema, return_order, - hash_to_tags, inputs, batches, baseline_metrics, @@ -289,32 +277,11 @@ impl SortedJoinStream { fn sorted_join(&self) -> Poll>> { let mut columns: Vec = Vec::with_capacity(self.schema.fields.len()); - // Compute the requested tag columns, so they can be assigned to the batch by index. + // TODO: Compute the requested tag columns, so they can be assigned to the batch by index. // unwrap() is safe as a record batch is read from each input before this method is called. let batch = self.batches[0].as_ref().unwrap(); - let univariate_ids = modelardb_types::array!(batch, 0, UInt64Array); - let mut tag_columns = if !self.hash_to_tags.is_empty() { - // unwrap() is safe as hash_to_tags is guaranteed not to be empty. - let tags = self.hash_to_tags.values().next().unwrap(); - let capacity = univariate_ids.len(); - let mut tag_columns: Vec = tags - .iter() - .map(|_vec| StringBuilder::with_capacity(capacity, capacity)) - .collect(); - - for univariate_id in univariate_ids.values() { - let tag_hash = functions::univariate_id_to_tag_hash(*univariate_id); - let tags = &self.hash_to_tags[&tag_hash]; - for (index, tag) in tags.iter().enumerate() { - tag_columns[index].append_value(tag.clone()); - } - } - - tag_columns - } else { - vec![] - }; + let mut tag_columns: Vec = vec![]; // The batches and tags columns are already in the correct order, so they can be appended. 
let mut field_index = 0; From d5f21dfd24d28638329b9bdf827ae69a1d843d9c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Mon, 24 Feb 2025 20:02:49 +0100 Subject: [PATCH 38/69] Update indices for accessing compressed segment arrays --- .../src/optimizer/model_simple_aggregates.rs | 42 +++++++++---------- .../modelardb_storage/src/query/grid_exec.rs | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 699994b53..2273a7914 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -334,9 +334,9 @@ struct ModelCountAccumulator { impl Accumulator for ModelCountAccumulator { /// Update the [`Accumulators`](Accumulator) state from `values`. fn update_batch(&mut self, arrays: &[ArrayRef]) -> DataFusionResult<()> { - let start_times = modelardb_types::value!(arrays, 2, TimestampArray); - let end_times = modelardb_types::value!(arrays, 3, TimestampArray); - let timestamps = modelardb_types::value!(arrays, 4, BinaryArray); + let start_times = modelardb_types::value!(arrays, 1, TimestampArray); + let end_times = modelardb_types::value!(arrays, 2, TimestampArray); + let timestamps = modelardb_types::value!(arrays, 3, BinaryArray); for row_index in 0..start_times.len() { let start_time = start_times.value(row_index); @@ -384,7 +384,7 @@ struct ModelMinAccumulator { impl Accumulator for ModelMinAccumulator { /// Update the [`Accumulators`](Accumulator) state from `values`. 
fn update_batch(&mut self, values: &[ArrayRef]) -> DataFusionResult<()> { - let min_values = modelardb_types::value!(values, 5, ValueArray); + let min_values = modelardb_types::value!(values, 4, ValueArray); for row_index in 0..min_values.len() { self.min = Value::min(self.min, min_values.value(row_index)); } @@ -427,7 +427,7 @@ struct ModelMaxAccumulator { impl Accumulator for ModelMaxAccumulator { /// Update the [`Accumulators`](Accumulator) state from `values`. fn update_batch(&mut self, arrays: &[ArrayRef]) -> DataFusionResult<()> { - let max_values = modelardb_types::value!(arrays, 6, ValueArray); + let max_values = modelardb_types::value!(arrays, 5, ValueArray); for row_index in 0..max_values.len() { self.max = Value::max(self.max, max_values.value(row_index)); } @@ -470,14 +470,14 @@ struct ModelSumAccumulator { impl Accumulator for ModelSumAccumulator { /// Update the [`Accumulators`](Accumulator) state from `values`. fn update_batch(&mut self, arrays: &[ArrayRef]) -> DataFusionResult<()> { - let model_type_ids = modelardb_types::value!(arrays, 1, UInt8Array); - let start_times = modelardb_types::value!(arrays, 2, TimestampArray); - let end_times = modelardb_types::value!(arrays, 3, TimestampArray); - let timestamps = modelardb_types::value!(arrays, 4, BinaryArray); - let min_values = modelardb_types::value!(arrays, 5, ValueArray); - let max_values = modelardb_types::value!(arrays, 6, ValueArray); - let values = modelardb_types::value!(arrays, 7, BinaryArray); - let residuals = modelardb_types::value!(arrays, 8, BinaryArray); + let model_type_ids = modelardb_types::value!(arrays, 0, UInt8Array); + let start_times = modelardb_types::value!(arrays, 1, TimestampArray); + let end_times = modelardb_types::value!(arrays, 2, TimestampArray); + let timestamps = modelardb_types::value!(arrays, 3, BinaryArray); + let min_values = modelardb_types::value!(arrays, 4, ValueArray); + let max_values = modelardb_types::value!(arrays, 5, ValueArray); + let values = 
modelardb_types::value!(arrays, 6, BinaryArray); + let residuals = modelardb_types::value!(arrays, 7, BinaryArray); for row_index in 0..model_type_ids.len() { let model_type_id = model_type_ids.value(row_index); @@ -542,14 +542,14 @@ struct ModelAvgAccumulator { impl Accumulator for ModelAvgAccumulator { /// Update the [`Accumulators`](Accumulator) state from `values`. fn update_batch(&mut self, arrays: &[ArrayRef]) -> DataFusionResult<()> { - let model_type_ids = modelardb_types::value!(arrays, 1, UInt8Array); - let start_times = modelardb_types::value!(arrays, 2, TimestampArray); - let end_times = modelardb_types::value!(arrays, 3, TimestampArray); - let timestamps = modelardb_types::value!(arrays, 4, BinaryArray); - let min_values = modelardb_types::value!(arrays, 5, ValueArray); - let max_values = modelardb_types::value!(arrays, 6, ValueArray); - let values = modelardb_types::value!(arrays, 7, BinaryArray); - let residuals = modelardb_types::value!(arrays, 8, BinaryArray); + let model_type_ids = modelardb_types::value!(arrays, 0, UInt8Array); + let start_times = modelardb_types::value!(arrays, 1, TimestampArray); + let end_times = modelardb_types::value!(arrays, 2, TimestampArray); + let timestamps = modelardb_types::value!(arrays, 3, BinaryArray); + let min_values = modelardb_types::value!(arrays, 4, ValueArray); + let max_values = modelardb_types::value!(arrays, 5, ValueArray); + let values = modelardb_types::value!(arrays, 6, BinaryArray); + let residuals = modelardb_types::value!(arrays, 7, BinaryArray); for row_index in 0..model_type_ids.len() { let model_type_id = model_type_ids.value(row_index); diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index 6d90b54a3..aecd7bd1e 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -159,7 +159,7 @@ impl ExecutionPlan for GridExec { partition: usize, task_context: Arc, ) -> DataFusionResult 
{ - // Must be read before GridStream as task_context are moved into input. + // Must be read before GridStream as task_context is moved into input. let batch_size = task_context.session_config().batch_size(); let grid_stream_metrics = GridStreamMetrics::new(&self.metrics, partition); From 028c6efe84915d63dcd715f842dd715a4c17df04 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 22:36:02 +0100 Subject: [PATCH 39/69] Add query compressed schema to ModelTable --- crates/modelardb_storage/src/query/model_table.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index bc1bcdaea..c5e391597 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -65,6 +65,8 @@ pub(crate) struct ModelTable { data_sink: Arc, /// Field column to use for queries that do not include fields. fallback_field_column: u16, + /// Schema of the compressed segments stored on disk. + query_compressed_schema: Arc, } impl ModelTable { @@ -88,11 +90,21 @@ impl ModelTable { .unwrap() as u16 // unwrap() is safe as all model tables contain at least one field. }; + // Add the tag columns to the base schema for queryable compressed segments. 
+ let mut query_compressed_schema_fields = QUERY_COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + for index in &model_table_metadata.tag_column_indices { + query_compressed_schema_fields + .push(Arc::new(model_table_metadata.schema.field(*index).clone())); + } + + let query_compressed_schema = Arc::new(Schema::new(query_compressed_schema_fields)); + Arc::new(ModelTable { delta_table, model_table_metadata, data_sink, fallback_field_column, + query_compressed_schema, }) } From 8c9b2db0144404b8c4f9d08ad342d6f544960891 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 22:56:02 +0100 Subject: [PATCH 40/69] Add query order segment to model table --- .../src/query/model_table.rs | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index c5e391597..38a8f8224 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -23,6 +23,7 @@ use std::fmt; use std::result::Result as StdResult; use std::sync::Arc; +use arrow::compute::SortOptions; use async_trait::async_trait; use datafusion::arrow::datatypes::{ ArrowPrimitiveType, DataType, Field, Schema, SchemaRef, TimeUnit, @@ -37,7 +38,8 @@ use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{self, utils, BinaryExpr, Expr, Operator}; -use datafusion::physical_expr::{planner, LexOrdering}; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::{planner, LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; use deltalake::kernel::LogicalFile; @@ -67,6 +69,10 @@ pub(crate) struct ModelTable { 
fallback_field_column: u16, /// Schema of the compressed segments stored on disk. query_compressed_schema: Arc, + /// The sort order [`ParquetExec`] guarantees for the segments it produces. It is guaranteed by + /// [`ParquetExec`] because the storage engine uses this sort order for each Apache Parquet file + /// in this model table and these files are read sequentially by [`ParquetExec`]. + query_order_segment: LexOrdering, } impl ModelTable { @@ -99,12 +105,37 @@ impl ModelTable { let query_compressed_schema = Arc::new(Schema::new(query_compressed_schema_fields)); + // Segments are sorted by the tag columns and the start time. + let sort_options = SortOptions { + descending: false, + nulls_first: false, + }; + + let mut physical_sort_exprs = vec![]; + for index in &model_table_metadata.tag_column_indices { + let tag_column_name = model_table_metadata.schema.field(*index).name(); + + // unwrap() is safe as the tag columns are always present in the query compressed schema. + let segment_index = query_compressed_schema.index_of(tag_column_name).unwrap(); + + physical_sort_exprs.push(PhysicalSortExpr { + expr: Arc::new(Column::new(tag_column_name, segment_index)), + options: sort_options, + }); + }; + + physical_sort_exprs.push(PhysicalSortExpr { + expr: Arc::new(Column::new("start_time", 1)), + options: sort_options, + }); + Arc::new(ModelTable { delta_table, model_table_metadata, data_sink, fallback_field_column, query_compressed_schema, + query_order_segment: LexOrdering::new(physical_sort_exprs), }) } From f2d87db391d22ca6924d0246660a15085ace9828 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:08:22 +0100 Subject: [PATCH 41/69] Add query requirement segment to ModelTable --- .../src/query/model_table.rs | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 
38a8f8224..7bbf11c1a 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -39,7 +39,9 @@ use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{self, utils, BinaryExpr, Expr, Operator}; use datafusion::physical_expr::expressions::Column; -use datafusion::physical_expr::{planner, LexOrdering, PhysicalSortExpr}; +use datafusion::physical_expr::{ + planner, LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement, +}; use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; use deltalake::kernel::LogicalFile; @@ -73,6 +75,8 @@ pub(crate) struct ModelTable { /// [`ParquetExec`] because the storage engine uses this sort order for each Apache Parquet file /// in this model table and these files are read sequentially by [`ParquetExec`]. query_order_segment: LexOrdering, + /// The sort order that [`GridExec`] requires for the segments it receives as its input. + query_requirement_segment: LexRequirement, } impl ModelTable { @@ -111,31 +115,41 @@ impl ModelTable { nulls_first: false, }; - let mut physical_sort_exprs = vec![]; + let mut segment_physical_sort_exprs = vec![]; for index in &model_table_metadata.tag_column_indices { let tag_column_name = model_table_metadata.schema.field(*index).name(); // unwrap() is safe as the tag columns are always present in the query compressed schema. 
let segment_index = query_compressed_schema.index_of(tag_column_name).unwrap(); - physical_sort_exprs.push(PhysicalSortExpr { + segment_physical_sort_exprs.push(PhysicalSortExpr { expr: Arc::new(Column::new(tag_column_name, segment_index)), options: sort_options, }); - }; + } - physical_sort_exprs.push(PhysicalSortExpr { + segment_physical_sort_exprs.push(PhysicalSortExpr { expr: Arc::new(Column::new("start_time", 1)), options: sort_options, }); + // The sort order that GridExec requires for the segments it receives as its input matches + // the sort order ParquetExec guarantees for the segments it produces. + let segment_physical_sort_requirements: Vec = + segment_physical_sort_exprs + .clone() + .into_iter() + .map(|physical_sort_expr| physical_sort_expr.into()) + .collect(); + Arc::new(ModelTable { delta_table, model_table_metadata, data_sink, fallback_field_column, query_compressed_schema, - query_order_segment: LexOrdering::new(physical_sort_exprs), + query_order_segment: LexOrdering::new(segment_physical_sort_exprs), + query_requirement_segment: LexRequirement::new(segment_physical_sort_requirements), }) } From 19d028290a5adffa3a14a169742d7d8bc8853fc6 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:21:17 +0100 Subject: [PATCH 42/69] Add util method to get query order and requirement for a schema --- .../src/query/model_table.rs | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 7bbf11c1a..0853692db 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -203,6 +203,48 @@ impl fmt::Debug for ModelTable { } } +/// Return a [`LexOrdering`] and [`LexRequirement`] that sort by the tag columns from +/// `model_table_metadata` in `schema` first and then by `time_column`. 
+fn query_order_and_requirement( + model_table_metadata: &ModelTableMetadata, + schema: &Schema, + time_column: Column, +) -> (LexOrdering, LexRequirement) { + let sort_options = SortOptions { + descending: false, + nulls_first: false, + }; + + let mut physical_sort_exprs = vec![]; + for index in &model_table_metadata.tag_column_indices { + let tag_column_name = model_table_metadata.schema.field(*index).name(); + + // unwrap() is safe as the tag columns are always present in the schema. + let segment_index = schema.index_of(tag_column_name).unwrap(); + + physical_sort_exprs.push(PhysicalSortExpr { + expr: Arc::new(Column::new(tag_column_name, segment_index)), + options: sort_options, + }); + } + + physical_sort_exprs.push(PhysicalSortExpr { + expr: Arc::new(time_column), + options: sort_options, + }); + + let physical_sort_requirements: Vec = physical_sort_exprs + .clone() + .into_iter() + .map(|physical_sort_expr| physical_sort_expr.into()) + .collect(); + + ( + LexOrdering::new(physical_sort_exprs), + LexRequirement::new(physical_sort_requirements), + ) +} + /// Rewrite and combine the `filters` that are written in terms of the model table's query schema, /// to a filter that is written in terms of the schema used for compressed segments by the storage /// engine and a filter that is written in terms of the schema used for univariate time series by From 3175d88babf7484b1c2a18036d811761e9db65b3 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:23:07 +0100 Subject: [PATCH 43/69] Use util method to get segment query order and requirement --- .../src/query/model_table.rs | 41 ++++--------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 0853692db..69d26b81e 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ 
-109,38 +109,11 @@ impl ModelTable { let query_compressed_schema = Arc::new(Schema::new(query_compressed_schema_fields)); - // Segments are sorted by the tag columns and the start time. - let sort_options = SortOptions { - descending: false, - nulls_first: false, - }; - - let mut segment_physical_sort_exprs = vec![]; - for index in &model_table_metadata.tag_column_indices { - let tag_column_name = model_table_metadata.schema.field(*index).name(); - - // unwrap() is safe as the tag columns are always present in the query compressed schema. - let segment_index = query_compressed_schema.index_of(tag_column_name).unwrap(); - - segment_physical_sort_exprs.push(PhysicalSortExpr { - expr: Arc::new(Column::new(tag_column_name, segment_index)), - options: sort_options, - }); - } - - segment_physical_sort_exprs.push(PhysicalSortExpr { - expr: Arc::new(Column::new("start_time", 1)), - options: sort_options, - }); - - // The sort order that GridExec requires for the segments it receives as its input matches - // the sort order ParquetExec guarantees for the segments it produces. 
- let segment_physical_sort_requirements: Vec = - segment_physical_sort_exprs - .clone() - .into_iter() - .map(|physical_sort_expr| physical_sort_expr.into()) - .collect(); + let (query_order_segment, query_requirement_segment) = query_order_and_requirement( + &model_table_metadata, + &query_compressed_schema, + Column::new("start_time", 1), + ); Arc::new(ModelTable { delta_table, @@ -148,8 +121,8 @@ impl ModelTable { data_sink, fallback_field_column, query_compressed_schema, - query_order_segment: LexOrdering::new(segment_physical_sort_exprs), - query_requirement_segment: LexRequirement::new(segment_physical_sort_requirements), + query_order_segment, + query_requirement_segment, }) } From a4c07e6d1499e0a0098fdd0db44a7d8d23522383 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:32:10 +0100 Subject: [PATCH 44/69] Remove univariate_id from GRID_SCHEMA --- crates/modelardb_types/src/schemas.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/modelardb_types/src/schemas.rs b/crates/modelardb_types/src/schemas.rs index e8f93188d..7963bdc3e 100644 --- a/crates/modelardb_types/src/schemas.rs +++ b/crates/modelardb_types/src/schemas.rs @@ -21,8 +21,8 @@ use std::sync::LazyLock; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use crate::types::{ - ArrowTimestamp, ArrowUnivariateId, ArrowValue, CompressedSchema, ConfigurationSchema, - QueryCompressedSchema, QuerySchema, TableMetadataSchema, + ArrowTimestamp, ArrowValue, CompressedSchema, ConfigurationSchema, QueryCompressedSchema, + QuerySchema, TableMetadataSchema, }; /// Name of the column used to partition the compressed segments. @@ -67,7 +67,6 @@ pub static COMPRESSED_METADATA_SIZE_IN_BYTES: LazyLock = LazyLock::new(|| /// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used internally during query processing. 
pub static GRID_SCHEMA: LazyLock = LazyLock::new(|| { QuerySchema(Arc::new(Schema::new(vec![ - Field::new("univariate_id", ArrowUnivariateId::DATA_TYPE, false), Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), Field::new("value", ArrowValue::DATA_TYPE, false), ]))) From 1236ebd478ed6d0c1fb6b383887e8133ed405886 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Tue, 25 Feb 2025 23:34:16 +0100 Subject: [PATCH 45/69] Add grid schema, query order data point, and query requirement data point to ModelTable --- .../src/query/model_table.rs | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 69d26b81e..d7110047e 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -77,6 +77,14 @@ pub(crate) struct ModelTable { query_order_segment: LexOrdering, /// The sort order that [`GridExec`] requires for the segments it receives as its input. query_requirement_segment: LexRequirement, + /// Schema used internally during query processing. + grid_schema: Arc, + /// The sort order [`GridExec`] guarantees for the data points it produces. It is guaranteed by + /// [`GridExec`] because it receives segments sorted by `query_order_segment` from [`ParquetExec`] + /// and because these segments cannot contain data points for overlapping time intervals. + query_order_data_point: LexOrdering, + /// The sort order that [`SortedJoinExec`] requires for the data points it receives as its input. + query_requirement_data_point: LexRequirement, } impl ModelTable { @@ -115,6 +123,20 @@ impl ModelTable { Column::new("start_time", 1), ); + // Add the tag columns to the base schema for data points. 
+ let mut grid_schema_fields = GRID_SCHEMA.0.fields.clone().to_vec(); + for index in &model_table_metadata.tag_column_indices { + grid_schema_fields.push(Arc::new(model_table_metadata.schema.field(*index).clone())); + } + + let grid_schema = Arc::new(Schema::new(grid_schema_fields)); + + let (query_order_data_point, query_requirement_data_point) = query_order_and_requirement( + &model_table_metadata, + &grid_schema, + Column::new("timestamp", 0), + ); + Arc::new(ModelTable { delta_table, model_table_metadata, @@ -123,6 +145,9 @@ impl ModelTable { query_compressed_schema, query_order_segment, query_requirement_segment, + grid_schema, + query_order_data_point, + query_requirement_data_point, }) } From fe8bd38bdfcc3515b677661212e098e2b22ff907 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 00:00:42 +0100 Subject: [PATCH 46/69] Pass model table query compressed schema and output ordering when creating parquet exec --- crates/modelardb_storage/src/query/model_table.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index d7110047e..707c86fa0 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -54,8 +54,6 @@ use crate::query::generated_as_exec::{ColumnToGenerate, GeneratedAsExec}; use crate::query::grid_exec::GridExec; use crate::query::sorted_join_exec::{SortedJoinColumnType, SortedJoinExec}; -use super::QUERY_ORDER_SEGMENT; - /// A queryable representation of a model table which stores multivariate time series as segments /// containing metadata and models. 
[`ModelTable`] implements [`TableProvider`] so it can be /// registered with Apache DataFusion and the multivariate time series queried as multiple @@ -394,6 +392,8 @@ fn new_apache_parquet_exec( partition_filters: &[PartitionFilter], maybe_limit: Option, maybe_parquet_filters: &Option>, + file_schema: SchemaRef, + output_ordering: Vec, ) -> DataFusionResult> { // Collect the LogicalFiles into a Vec so they can be sorted the same for all field columns. let mut logical_files = delta_table @@ -416,13 +416,13 @@ fn new_apache_parquet_exec( let log_store = delta_table.log_store(); let file_scan_config = FileScanConfig { object_store_url: log_store.object_store_url(), - file_schema: QUERY_COMPRESSED_SCHEMA.0.clone(), + file_schema, file_groups: vec![partitioned_files], statistics: Statistics::new_unknown(&QUERY_COMPRESSED_SCHEMA.0), projection: None, limit: maybe_limit, table_partition_cols: vec![], - output_ordering: vec![LexOrdering::new(QUERY_ORDER_SEGMENT.to_vec())], + output_ordering, }; let apache_parquet_exec_builder = if let Some(parquet_filters) = maybe_parquet_filters { @@ -590,12 +590,12 @@ impl TableProvider for ModelTable { let maybe_physical_parquet_filters = maybe_convert_logical_expr_to_physical_expr( maybe_rewritten_parquet_filters.as_ref(), - QUERY_COMPRESSED_SCHEMA.0.clone(), + self.query_compressed_schema.clone(), )?; let maybe_physical_grid_filters = maybe_convert_logical_expr_to_physical_expr( maybe_rewritten_grid_filters.as_ref(), - GRID_SCHEMA.0.clone(), + self.grid_schema.clone(), )?; if stored_field_columns_in_projection.is_empty() { @@ -622,6 +622,8 @@ impl TableProvider for ModelTable { &partition_filters, limit, &maybe_physical_parquet_filters, + self.query_compressed_schema.clone(), + vec![LexOrdering::new(self.query_order_segment.to_vec())], )?; let grid_exec = GridExec::new(maybe_physical_grid_filters.clone(), limit, parquet_exec); From ccf58dc2dd96dac26eb019bafde463d6227dcb48 Mon Sep 17 00:00:00 2001 From: CGodiksen 
<36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 00:03:43 +0100 Subject: [PATCH 47/69] Use model table specific query requirement segment and query order data point in GridExec --- .../modelardb_storage/src/query/grid_exec.rs | 31 ++++++++++++------- .../src/query/model_table.rs | 9 +++++- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index aecd7bd1e..76f206d10 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -23,7 +23,7 @@ use std::fmt::{Formatter, Result as FmtResult}; use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; - +use arrow::datatypes::Schema; use async_trait::async_trait; use datafusion::arrow::array::{ Array, ArrayBuilder, ArrayRef, BinaryArray, Float32Array, UInt64Array, UInt64Builder, @@ -35,7 +35,7 @@ use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::cast::as_boolean_array; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::TaskContext; -use datafusion::physical_expr::{EquivalenceProperties, LexRequirement}; +use datafusion::physical_expr::{EquivalenceProperties, LexOrdering, LexRequirement}; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, @@ -46,11 +46,8 @@ use datafusion::physical_plan::{ }; use futures::stream::{Stream, StreamExt}; use modelardb_compression::{self, MODEL_TYPE_COUNT, MODEL_TYPE_NAMES}; -use modelardb_types::schemas::GRID_SCHEMA; use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueArray, ValueBuilder}; -use crate::query::{QUERY_ORDER_DATA_POINT, QUERY_REQUIREMENT_SEGMENT}; - /// An execution plan that reconstructs the data points stored as compressed 
segments containing /// metadata and models. It is `pub(crate)` so the additional rules added to Apache DataFusion's /// physical optimizer can pattern match on it. @@ -66,24 +63,29 @@ pub(crate) struct GridExec { input: Arc, /// Properties about the plan used in query optimization. plan_properties: PlanProperties, + /// The sort order that [`GridExec`] requires for the segments it receives as its input. + query_requirement_segment: LexRequirement, + /// The sort order [`GridExec`] guarantees for the data points it produces. + query_order_data_point: LexOrdering, /// Metrics collected during execution for use by EXPLAIN ANALYZE. metrics: ExecutionPlanMetricsSet, } impl GridExec { pub(super) fn new( + schema: Arc, maybe_predicate: Option>, limit: Option, input: Arc, + query_requirement_segment: LexRequirement, + query_order_data_point: LexOrdering, ) -> Arc { - let schema = GRID_SCHEMA.0.clone(); - - // The global order for the data points produced by the set of GridExec instances producing + // The sort order for the data points produced by the set of GridExec instances producing // input for a SortedJoinExec must be the same. This is needed because SortedJoinExec - // assumes the data it receives from all of its inputs uses the same global sort order. + // assumes the data it receives from all of its inputs uses the same sort order. 
let equivalence_properties = EquivalenceProperties::new_with_orderings( schema.clone(), - &[QUERY_ORDER_DATA_POINT.clone()], + &[query_order_data_point.clone()], ); let plan_properties = PlanProperties::new( @@ -99,6 +101,8 @@ impl GridExec { limit, input, plan_properties, + query_requirement_segment, + query_order_data_point, metrics: ExecutionPlanMetricsSet::new(), }) } @@ -140,9 +144,12 @@ impl ExecutionPlan for GridExec { ) -> DataFusionResult> { if children.len() == 1 { Ok(GridExec::new( + self.schema.clone(), self.maybe_predicate.clone(), self.limit, children[0].clone(), + self.query_requirement_segment.clone(), + self.query_order_data_point.clone(), )) } else { Err(DataFusionError::Plan(format!( @@ -186,9 +193,9 @@ impl ExecutionPlan for GridExec { } /// Specify that [`GridExec`] requires that its input provides data that is sorted by - /// [`QUERY_REQUIREMENT_SEGMENT`]. + /// `query_requirement_segment`. fn required_input_ordering(&self) -> Vec> { - vec![Some(QUERY_REQUIREMENT_SEGMENT.clone())] + vec![Some(self.query_requirement_segment.clone())] } /// Return a snapshot of the set of metrics being collected by the execution plain. 
diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 707c86fa0..c3242392e 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -626,7 +626,14 @@ impl TableProvider for ModelTable { vec![LexOrdering::new(self.query_order_segment.to_vec())], )?; - let grid_exec = GridExec::new(maybe_physical_grid_filters.clone(), limit, parquet_exec); + let grid_exec = GridExec::new( + self.grid_schema.clone(), + maybe_physical_grid_filters.clone(), + limit, + parquet_exec, + self.query_requirement_segment.clone(), + self.query_order_data_point.clone(), + ); field_column_execution_plans.push(grid_exec); } From 5f8dc5038bff6c1f1a831398a3bdf832c6aefd61 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 00:12:57 +0100 Subject: [PATCH 48/69] Pass query requirement data point to SortedJoinExec --- .../modelardb_storage/src/query/model_table.rs | 1 + .../src/query/sorted_join_exec.rs | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index c3242392e..e0e42acfd 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -642,6 +642,7 @@ impl TableProvider for ModelTable { schema_after_projection, stored_columns_in_projection, field_column_execution_plans, + self.query_requirement_data_point.clone(), ); // Only include GeneratedAsExec in the query plan if there are columns to generate. 
diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index 1f411a95a..866154e1a 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -40,8 +40,6 @@ use datafusion::physical_plan::{ }; use futures::stream::{Stream, StreamExt}; -use crate::query::QUERY_REQUIREMENT_DATA_POINT; - /// The different types of columns supported by [`SortedJoinExec`], used for specifying the order in /// which the timestamp, field, and tag columns should be returned by [`SortedJoinStream`]. #[derive(Debug, Clone)] @@ -64,6 +62,8 @@ pub(crate) struct SortedJoinExec { inputs: Vec>, /// Properties about the plan used in query optimization. plan_properties: PlanProperties, + /// The sort order that [`SortedJoinExec`] requires for the data points it receives as its input. + query_requirement_data_point: LexRequirement, /// Metrics collected during execution for use by EXPLAIN ANALYZE. metrics: ExecutionPlanMetricsSet, } @@ -73,9 +73,9 @@ impl SortedJoinExec { schema: SchemaRef, return_order: Vec, inputs: Vec>, + query_requirement_data_point: LexRequirement, ) -> Arc { - // Specify that the record batches produced by the execution plan will have an unknown order - // as the output from SortedJoinExec does not include the univariate_id but instead tags. + // Specify that the record batches produced by the execution plan will have an unknown order. 
let equivalence_properties = EquivalenceProperties::new(schema.clone()); let plan_properties = PlanProperties::new( @@ -90,6 +90,7 @@ impl SortedJoinExec { return_order, inputs, plan_properties, + query_requirement_data_point, metrics: ExecutionPlanMetricsSet::new(), }) } @@ -134,6 +135,7 @@ impl ExecutionPlan for SortedJoinExec { self.schema.clone(), self.return_order.clone(), children, + self.query_requirement_data_point.clone(), )) } else { Err(DataFusionError::Plan(format!( @@ -176,9 +178,9 @@ impl ExecutionPlan for SortedJoinExec { } /// Specify that [`SortedJoinStream`] requires that its inputs' provide data that is sorted by - /// [`QUERY_REQUIREMENT_DATA_POINT`]. + /// `query_requirement_data_point`. fn required_input_ordering(&self) -> Vec> { - vec![Some(QUERY_REQUIREMENT_DATA_POINT.clone()); self.inputs.len()] + vec![Some(self.query_requirement_data_point.clone()); self.inputs.len()] } /// Return a snapshot of the set of metrics being collected by the execution plain. @@ -289,11 +291,11 @@ impl SortedJoinStream { for element in &self.return_order { match element { - SortedJoinColumnType::Timestamp => columns.push(batch.column(1).clone()), + SortedJoinColumnType::Timestamp => columns.push(batch.column(0).clone()), SortedJoinColumnType::Field => { // unwrap() is safe as a record batch has already been read from each input. 
let batch = self.batches[field_index].as_ref().unwrap(); - columns.push(batch.column(2).clone()); + columns.push(batch.column(1).clone()); field_index += 1; } SortedJoinColumnType::Tag => { From 3adf091cfd0aca34b0ce76f357c0d70bd0245ae8 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 00:16:21 +0100 Subject: [PATCH 49/69] Remove global sort orders and sort requirements --- crates/modelardb_storage/src/query/mod.rs | 103 +--------------------- 1 file changed, 3 insertions(+), 100 deletions(-) diff --git a/crates/modelardb_storage/src/query/mod.rs b/crates/modelardb_storage/src/query/mod.rs index 193244eb5..0e7f9d211 100644 --- a/crates/modelardb_storage/src/query/mod.rs +++ b/crates/modelardb_storage/src/query/mod.rs @@ -13,14 +13,9 @@ * limitations under the License. */ -//! Implementation of types which allows both normal tables and model tables to be added to Apache -//! DataFusion. This allows them to be queried and small amounts of data to be added with INSERT. - -use std::sync::{Arc, LazyLock}; - -use datafusion::physical_expr::{LexOrdering, LexRequirement, PhysicalSortExpr}; -use datafusion::physical_plan::expressions::Column; -use deltalake::arrow::compute::SortOptions; +//! Implementation of types which allow normal tables, metadata tables, and model tables to be added +//! to Apache DataFusion. This allows them to be queried and small amounts of data to be added with +//! INSERT. // grid_exec and sorted_join_exec are pub(crate) so the rules added to Apache DataFusion's physical // optimizer can access them. @@ -30,95 +25,3 @@ pub(crate) mod metadata_table; pub(crate) mod model_table; pub(crate) mod normal_table; pub(crate) mod sorted_join_exec; - -/// The global sort order -/// [`ParquetExec`](datafusion::datasource::physical_plan::parquet::ParquetExec) guarantees for the -/// segments it produces. 
It is guaranteed by -/// [`ParquetExec`](datafusion::datasource::physical_plan::parquet::ParquetExec) because the storage -/// engine uses this sort order for each Apache Parquet file and these files are read sequentially -/// by [`ParquetExec`](datafusion::datasource::physical_plan::parquet::ParquetExec). Another sort -/// order could also be used, the current query pipeline simply requires that the -/// [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) -/// [`SortedJoinExec`](sorted_join_exec::SortedJoinExec) receives from its inputs all contain data -/// points for the same time interval and that they are sorted the same. -static QUERY_ORDER_SEGMENT: LazyLock = LazyLock::new(|| { - let sort_options = SortOptions { - descending: false, - nulls_first: false, - }; - - let physical_sort_expr = vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("univariate_id", 0)), - options: sort_options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("start_time", 2)), - options: sort_options, - }, - ]; - - LexOrdering::new(physical_sort_expr) -}); - -/// The global sort order that [`GridExec`](grid_exec::GridExec) requires for the segments it -/// receives as its input. Another sort order could also be used, the current query pipeline simply -/// requires that the [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) -/// [`SortedJoinExec`](sorted_join_exec::SortedJoinExec) receives from its inputs all contain data -/// points for the same time interval and that they are sorted the same. -static QUERY_REQUIREMENT_SEGMENT: LazyLock = LazyLock::new(|| { - let physical_sort_requirements = QUERY_ORDER_SEGMENT - .inner - .clone() - .drain(..) - .map(|physical_sort_expr| physical_sort_expr.into()) - .collect(); - - LexRequirement::new(physical_sort_requirements) -}); - -/// The global sort order [`GridExec`](grid_exec::GridExec) guarantees for the data points it -/// produces. 
It is guaranteed by [`GridExec`](grid_exec::GridExec) because it receives segments -/// sorted by [`QUERY_ORDER_SEGMENT`] from -/// [`ParquetExec`](datafusion::datasource::physical_plan::parquet::ParquetExec) and because these -/// segments cannot contain data points for overlapping time intervals. Another sort order could -/// also be used, the current query pipeline simply requires that the -/// [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) -/// [`SortedJoinExec`](sorted_join_exec::SortedJoinExec) receives from its inputs all contain data -/// points for the same time interval and that they are sorted the same. -static QUERY_ORDER_DATA_POINT: LazyLock = LazyLock::new(|| { - let sort_options = SortOptions { - descending: false, - nulls_first: false, - }; - - let physical_sort_expr = vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new("univariate_id", 0)), - options: sort_options, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new("timestamp", 1)), - options: sort_options, - }, - ]; - - LexOrdering::new(physical_sort_expr) -}); - -/// The global sort order that [`SortedJoinExec`](sorted_join_exec::SortedJoinExec) requires for the -/// data points it receives as its input. Another sort order could also be used, the current query -/// pipeline simply requires that the -/// [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) -/// [`SortedJoinExec`](sorted_join_exec::SortedJoinExec) receives from its inputs all contain data -/// points for the same time interval and that they are sorted the same. -static QUERY_REQUIREMENT_DATA_POINT: LazyLock = LazyLock::new(|| { - let physical_sort_requirements = QUERY_ORDER_DATA_POINT - .inner - .clone() - .drain(..) 
- .map(|physical_sort_expr| physical_sort_expr.into()) - .collect(); - - LexRequirement::new(physical_sort_requirements) -}); From 0bdd384d02247e0289cbc02a1ddd12e7452bb12e Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:58:32 +0100 Subject: [PATCH 50/69] Remove univariate_id from pmc_mean::grid() --- .../src/models/pmc_mean.rs | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/crates/modelardb_compression/src/models/pmc_mean.rs b/crates/modelardb_compression/src/models/pmc_mean.rs index cafea3115..b76973506 100644 --- a/crates/modelardb_compression/src/models/pmc_mean.rs +++ b/crates/modelardb_compression/src/models/pmc_mean.rs @@ -21,7 +21,7 @@ //! [ModelarDB paper]: https://www.vldb.org/pvldb/vol11/p1688-jensen.pdf use modelardb_types::schemas::COMPRESSED_METADATA_SIZE_IN_BYTES; -use modelardb_types::types::{Timestamp, UnivariateId, UnivariateIdBuilder, Value, ValueBuilder}; +use modelardb_types::types::{Timestamp, Value, ValueBuilder}; use crate::models; use crate::models::ErrorBound; @@ -100,18 +100,10 @@ pub fn sum(model_length: usize, value: Value) -> Value { model_length as Value * value } -/// Reconstruct the values for the `timestamps` without matching values in -/// `value_builder` using a model of type PMC-Mean. The `univariate_ids` and -/// `values` are appended to `univariate_builder` and `value_builder`. -pub fn grid( - univariate_id: UnivariateId, - value: Value, - univariate_id_builder: &mut UnivariateIdBuilder, - timestamps: &[Timestamp], - value_builder: &mut ValueBuilder, -) { +/// Reconstruct the values for the `timestamps` without matching values in `value_builder` using a +/// model of type PMC-Mean. The `values` are appended to `value_builder`. 
+pub fn grid(value: Value, timestamps: &[Timestamp], value_builder: &mut ValueBuilder) { for _timestamp in timestamps { - univariate_id_builder.append_value(univariate_id); value_builder.append_value(value); } } @@ -376,29 +368,20 @@ mod tests { #[test] fn test_grid(value in ProptestValue::ANY) { let sampling_interval: i64 = 60; - let mut univariate_id_builder = UnivariateIdBuilder::with_capacity(10); let timestamps: Vec = (60..=600).step_by(60).collect(); let mut value_builder = ValueBuilder::with_capacity(10); grid( - 1, value, - &mut univariate_id_builder, ×tamps, &mut value_builder, ); - let univariate_ids = univariate_id_builder.finish(); let values = value_builder.finish(); prop_assert!( - univariate_ids.len() == 10 - && univariate_ids.len() == timestamps.len() - && univariate_ids.len() == values.len() + timestamps.len() == 10 && timestamps.len() == values.len() ); - prop_assert!(univariate_ids - .iter() - .all(|maybe_univariate_id| maybe_univariate_id.unwrap() == 1)); prop_assert!(timestamps .windows(2) .all(|window| window[1] - window[0] == sampling_interval)); From 1de2c03bd5034b64a08e9858325f3c9957c86b04 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:06:10 +0100 Subject: [PATCH 51/69] Remove univariate_id from swing::grid() --- .../modelardb_compression/src/models/swing.rs | 22 ++++--------------- 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 7269c2093..44a29d7ec 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -25,7 +25,7 @@ use modelardb_types::schemas::COMPRESSED_METADATA_SIZE_IN_BYTES; use modelardb_types::types::{ - ErrorBound, Timestamp, TimestampBuilder, UnivariateId, UnivariateIdBuilder, Value, ValueBuilder, + ErrorBound, Timestamp, TimestampBuilder, UnivariateIdBuilder, Value, ValueBuilder, 
}; use super::timestamps; @@ -302,16 +302,13 @@ pub fn sum( } } -/// Reconstruct the values for the `timestamps` without matching values in -/// `value_builder` using a model of type Swing. The `univariate_ids` and -/// `values` are appended to `univariate_id_builder` and `value_builder`. +/// Reconstruct the values for the `timestamps` without matching values in `value_builder` using a +/// model of type Swing. The `values` are appended to `value_builder`. pub fn grid( - univariate_id: UnivariateId, start_time: Timestamp, end_time: Timestamp, first_value: Value, last_value: Value, - univariate_id_builder: &mut UnivariateIdBuilder, timestamps: &[Timestamp], value_builder: &mut ValueBuilder, ) { @@ -319,7 +316,6 @@ pub fn grid( compute_slope_and_intercept(start_time, first_value as f64, end_time, last_value as f64); for timestamp in timestamps { - univariate_id_builder.append_value(univariate_id); let value = (slope * (*timestamp as f64) + intercept) as Value; value_builder.append_value(value); } @@ -766,31 +762,21 @@ mod tests { fn test_grid(value in num::i32::ANY.prop_map(i32_to_value)) { let timestamps: Vec = (START_TIME ..= END_TIME) .step_by(SAMPLING_INTERVAL as usize).collect(); - let mut univariate_id_builder = UnivariateIdBuilder::with_capacity(timestamps.len()); let mut value_builder = ValueBuilder::with_capacity(timestamps.len()); // The linear function represents a constant to have a known value. 
grid( - 1, START_TIME, END_TIME, value, value, - &mut univariate_id_builder, ×tamps, &mut value_builder, ); - let univariate_ids = univariate_id_builder.finish(); let values = value_builder.finish(); - prop_assert!( - univariate_ids.len() == timestamps.len() - && univariate_ids.len() == values.len() - ); - prop_assert!(univariate_ids - .iter() - .all(|maybe_univariate_id| maybe_univariate_id.unwrap() == 1)); + prop_assert!(timestamps.len() == values.len()); prop_assert!(timestamps .windows(2) .all(|window| window[1] - window[0] == SAMPLING_INTERVAL)); From 900f6c304f1a400d93c37494fea40aa9a4f42eca Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:10:40 +0100 Subject: [PATCH 52/69] Remove univariate_id from gorilla::grid() --- .../src/models/gorilla.rs | 38 +++---------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/crates/modelardb_compression/src/models/gorilla.rs b/crates/modelardb_compression/src/models/gorilla.rs index 6928dc289..0a9544ddf 100644 --- a/crates/modelardb_compression/src/models/gorilla.rs +++ b/crates/modelardb_compression/src/models/gorilla.rs @@ -22,7 +22,7 @@ //! //! [Gorilla paper]: https://www.vldb.org/pvldb/vol8/p1816-teller.pdf -use modelardb_types::types::{Timestamp, UnivariateId, UnivariateIdBuilder, Value, ValueBuilder}; +use modelardb_types::types::{Timestamp, Value, ValueBuilder}; use crate::models; use crate::models::bits::{BitReader, BitVecBuilder}; @@ -215,13 +215,11 @@ pub fn sum(length: usize, values: &[u8], maybe_model_last_value: Option) /// Decompress all the values in `values` for the `timestamps` without matching values in /// `value_builder`. The values in `values` are compressed using Gorilla's compression method for -/// floating-point values. `univariate_ids` and `values` are appended to `univariate_id_builder` and -/// `value_builder`. 
If `maybe_model_last_value` is provided, it is assumed the first value in -/// `values` is compressed against it instead of being stored in full, i.e., uncompressed. +/// floating-point values. `values` are appended to `value_builder`. If `maybe_model_last_value` +/// is provided, it is assumed the first value in `values` is compressed against it instead of being +/// stored in full, i.e., uncompressed. pub fn grid( - univariate_id: UnivariateId, values: &[u8], - univariate_id_builder: &mut UnivariateIdBuilder, timestamps: &[Timestamp], value_builder: &mut ValueBuilder, maybe_model_last_value: Option, @@ -238,7 +236,6 @@ pub fn grid( } else { // The first value is stored uncompressed using size_of:: bits. let first_value = bits.read_bits(models::VALUE_SIZE_IN_BITS) as u32; - univariate_id_builder.append_value(univariate_id); value_builder.append_value(Value::from_bits(first_value)); first_value }; @@ -262,7 +259,6 @@ pub fn grid( value ^= last_value; last_value = value; } - univariate_id_builder.append_value(univariate_id); value_builder.append_value(Value::from_bits(last_value)); } } @@ -516,29 +512,13 @@ mod tests { fn assert_grid_with_error_bound(error_bound: ErrorBound, values: &[Value]) { let compressed_values = compress_values_using_gorilla(error_bound, values, None); - let mut univariate_id_builder = UnivariateIdBuilder::with_capacity(values.len()); let timestamps: Vec = (1..=values.len() as i64).step_by(1).collect(); let mut value_builder = ValueBuilder::with_capacity(values.len()); - grid( - 1, - &compressed_values, - &mut univariate_id_builder, - ×tamps, - &mut value_builder, - None, - ); + grid(&compressed_values, ×tamps, &mut value_builder, None); - let univariate_ids_array = univariate_id_builder.finish(); let values_array = value_builder.finish(); - assert!( - univariate_ids_array.len() == values.len() - && univariate_ids_array.len() == timestamps.len() - && univariate_ids_array.len() == values_array.len() - ); - assert!(univariate_ids_array - .iter() 
- .all(|maybe_univariate_id| maybe_univariate_id.unwrap() == 1)); + assert!(values.len() == timestamps.len() && values.len() == values_array.len()); assert!(timestamps .windows(2) .all(|window| window[1] - window[0] == 1)); @@ -580,24 +560,18 @@ mod tests { fn assert_grid_single(error_bound: ErrorBound, maybe_model_last_value: Option) { let compressed_values = compress_values_using_gorilla(error_bound, &[37.0], maybe_model_last_value); - let mut univariate_id_builder = UnivariateIdBuilder::new(); let mut value_builder = ValueBuilder::new(); grid( - 1, &compressed_values, - &mut univariate_id_builder, &[100], &mut value_builder, maybe_model_last_value, ); - let univariate_ids = univariate_id_builder.finish(); let values = value_builder.finish(); - assert_eq!(univariate_ids.len(), 1); assert_eq!(values.len(), 1); - assert_eq!(univariate_ids.value(0), 1); assert_eq!(values.value(0), 37.0); } From 17ff1b850394864f41639fdd2d8576df3f975078 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:17:29 +0100 Subject: [PATCH 53/69] Remove univariate_id from modelardb_compression::grid() --- .../modelardb_compression/src/compression.rs | 2 -- .../modelardb_compression/src/models/mod.rs | 25 +++---------------- .../modelardb_compression/src/models/swing.rs | 5 +--- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 202899ce3..84a06c9c0 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -923,7 +923,6 @@ mod tests { let start_index = univariate_id_builder.len(); models::grid( - 0, model_type_ids.value(row_index), start_times.value(row_index), end_times.value(row_index), @@ -932,7 +931,6 @@ mod tests { max_values.value(row_index), values.value(row_index), residuals.value(row_index), - &mut univariate_id_builder, &mut timestamp_builder, &mut 
value_builder, ); diff --git a/crates/modelardb_compression/src/models/mod.rs b/crates/modelardb_compression/src/models/mod.rs index 5500900b0..9eaf49f82 100644 --- a/crates/modelardb_compression/src/models/mod.rs +++ b/crates/modelardb_compression/src/models/mod.rs @@ -26,9 +26,7 @@ pub mod timestamps; use std::mem; use arrow::array::ArrayBuilder; -use modelardb_types::types::{ - ErrorBound, Timestamp, TimestampBuilder, UnivariateId, UnivariateIdBuilder, Value, ValueBuilder, -}; +use modelardb_types::types::{ErrorBound, Timestamp, TimestampBuilder, Value, ValueBuilder}; use crate::types::CompressedSegmentBuilder; @@ -183,10 +181,9 @@ pub fn sum( } /// Reconstruct the data points for a compressed segment whose values are represented by a model and -/// residuals. Each data point is split into its three components and appended to `univariate_ids`, -/// `timestamps`, and `values`. +/// residuals. Each data point is split into its two components and appended to `timestamp_builder` +/// and `value_builder`. pub fn grid( - univariate_id: UnivariateId, model_type_id: u8, start_time: Timestamp, end_time: Timestamp, @@ -195,7 +192,6 @@ pub fn grid( max_value: Value, values: &[u8], residuals: &[u8], - univariate_id_builder: &mut UnivariateIdBuilder, timestamp_builder: &mut TimestampBuilder, value_builder: &mut ValueBuilder, ) { @@ -212,9 +208,7 @@ pub fn grid( // Reconstruct the values from the model. 
match model_type_id { PMC_MEAN_ID => pmc_mean::grid( - univariate_id, CompressedSegmentBuilder::decode_values_for_pmc_mean(min_value, max_value, values), - univariate_id_builder, model_timestamps, value_builder, ), @@ -226,24 +220,15 @@ pub fn grid( let model_end_time = *model_timestamps.last().unwrap(); swing::grid( - univariate_id, start_time, model_end_time, first_value, last_value, - univariate_id_builder, model_timestamps, value_builder, ) } - GORILLA_ID => gorilla::grid( - univariate_id, - values, - univariate_id_builder, - model_timestamps, - value_builder, - None, - ), + GORILLA_ID => gorilla::grid(values, model_timestamps, value_builder, None), _ => panic!("Unknown model type."), } @@ -252,9 +237,7 @@ pub fn grid( let model_last_value = value_builder.values_slice()[value_builder.len() - 1]; gorilla::grid( - univariate_id, &residuals[..residuals.len() - 1], - univariate_id_builder, residuals_timestamps, value_builder, Some(model_last_value), diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 44a29d7ec..ea84ad6f7 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -25,7 +25,7 @@ use modelardb_types::schemas::COMPRESSED_METADATA_SIZE_IN_BYTES; use modelardb_types::types::{ - ErrorBound, Timestamp, TimestampBuilder, UnivariateIdBuilder, Value, ValueBuilder, + ErrorBound, Timestamp, TimestampBuilder, Value, ValueBuilder, }; use super::timestamps; @@ -880,12 +880,10 @@ mod tests { assert_eq!(model_type_id_array.value(0), SWING_ID); // Reconstruct all values from the segment. 
- let mut reconstructed_ids = UnivariateIdBuilder::with_capacity(timestamps.len()); let mut reconstructed_timestamps = TimestampBuilder::with_capacity(timestamps.len()); let mut reconstructed_values = ValueBuilder::with_capacity(timestamps.len()); models::grid( - 0, model_type_id_array.value(0), start_time_array.value(0), end_time_array.value(0), @@ -894,7 +892,6 @@ mod tests { max_value_array.value(0), values_array.value(0), residuals_array.value(0), - &mut reconstructed_ids, &mut reconstructed_timestamps, &mut reconstructed_values, ); From b734a275b28ea203d3137c6ce3a770456c65db11 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:29:39 +0100 Subject: [PATCH 54/69] Remove univariate id from GridExec and types --- .../modelardb_compression/src/compression.rs | 5 ++--- .../src/models/gorilla.rs | 6 ++--- .../modelardb_storage/src/query/grid_exec.rs | 22 ++++++------------- crates/modelardb_types/src/types.rs | 7 ------ 4 files changed, 12 insertions(+), 28 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 84a06c9c0..f7147412c 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -901,7 +901,6 @@ mod tests { uncompressed_values: &ValueArray, compressed_record_batch: &RecordBatch, ) { - let mut univariate_id_builder = UInt64Builder::new(); let mut timestamp_builder = TimestampBuilder::new(); let mut value_builder = ValueBuilder::new(); @@ -920,7 +919,7 @@ mod tests { let mut index_to_model_type = vec![]; for row_index in 0..compressed_record_batch.num_rows() { - let start_index = univariate_id_builder.len(); + let start_index = value_builder.len(); models::grid( model_type_ids.value(row_index), @@ -935,7 +934,7 @@ mod tests { &mut value_builder, ); - let end_index = univariate_id_builder.len(); + let end_index = value_builder.len(); 
index_to_model_type.push((start_index..end_index, model_type_ids.value(row_index))); } diff --git a/crates/modelardb_compression/src/models/gorilla.rs b/crates/modelardb_compression/src/models/gorilla.rs index 0a9544ddf..cadb6614e 100644 --- a/crates/modelardb_compression/src/models/gorilla.rs +++ b/crates/modelardb_compression/src/models/gorilla.rs @@ -174,9 +174,9 @@ impl Gorilla { /// it is assumed the first value in `values` is compressed against it instead of being stored in /// full, i.e., uncompressed. pub fn sum(length: usize, values: &[u8], maybe_model_last_value: Option) -> Value { - // This function replicates code from gorilla::grid() as it isn't necessary - // to store the univariate ids, timestamps, and values in arrays for a sum. - // So any changes to the decompression must be mirrored in gorilla::grid(). + // This function replicates code from gorilla::grid() as it isn't necessary to store the + // timestamps and values in arrays for a sum. So any changes to the decompression must be + // mirrored in gorilla::grid(). 
let mut bits = BitReader::try_new(values).unwrap(); let mut leading_zeros = u8::MAX; let mut trailing_zeros: u8 = 0; diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index 76f206d10..d45491411 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -23,11 +23,11 @@ use std::fmt::{Formatter, Result as FmtResult}; use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; + use arrow::datatypes::Schema; use async_trait::async_trait; use datafusion::arrow::array::{ - Array, ArrayBuilder, ArrayRef, BinaryArray, Float32Array, UInt64Array, UInt64Builder, - UInt8Array, + Array, ArrayBuilder, ArrayRef, BinaryArray, Float32Array, UInt8Array, }; use datafusion::arrow::compute::filter_record_batch; use datafusion::arrow::datatypes::SchemaRef; @@ -186,7 +186,7 @@ impl ExecutionPlan for GridExec { } /// Specify that [`GridExec`] requires one partition for each input as it assumes that the - /// global sort order are the same for its input and Apache Arrow DataFusion only guarantees the + /// sort order are the same for its input and Apache Arrow DataFusion only guarantees the /// sort order within each partition rather than the input's global sort order. fn required_input_distribution(&self) -> Vec { vec![Distribution::SinglePartition] @@ -289,31 +289,25 @@ impl GridStream { // from each segment in the new batch as each segment contains at least one data point. let current_rows = self.current_batch.num_rows() - self.current_batch_offset; let new_rows = batch.num_rows(); - let mut univariate_id_builder = UInt64Builder::with_capacity(current_rows + new_rows); let mut timestamp_builder = TimestampBuilder::with_capacity(current_rows + new_rows); let mut value_builder = ValueBuilder::with_capacity(current_rows + new_rows); // Copy over the data points from the current batch to keep the resulting batch sorted. 
let current_batch = &self.current_batch; // Required as self cannot be passed to array!. - univariate_id_builder.append_slice( - &modelardb_types::array!(current_batch, 0, UInt64Array).values() - [self.current_batch_offset..], - ); timestamp_builder.append_slice( - &modelardb_types::array!(current_batch, 1, TimestampArray).values() + &modelardb_types::array!(current_batch, 0, TimestampArray).values() [self.current_batch_offset..], ); value_builder.append_slice( - &modelardb_types::array!(current_batch, 2, ValueArray).values() + &modelardb_types::array!(current_batch, 1, ValueArray).values() [self.current_batch_offset..], ); // Reconstruct the data points from the compressed segments. for row_index in 0..new_rows { - let length_before = univariate_id_builder.len(); + let length_before = value_builder.len(); modelardb_compression::grid( - 0, model_type_ids.value(row_index), start_times.value(row_index), end_times.value(row_index), @@ -322,21 +316,19 @@ impl GridStream { max_values.value(row_index), values.value(row_index), residuals.value(row_index), - &mut univariate_id_builder, &mut timestamp_builder, &mut value_builder, ); self.grid_stream_metrics.add( model_type_ids.value(row_index), - univariate_id_builder.len() - length_before, + value_builder.len() - length_before, !residuals.value(row_index).is_empty(), modelardb_compression::are_compressed_timestamps_regular(timestamps.values()), ); } let columns: Vec = vec![ - Arc::new(univariate_id_builder.finish()), Arc::new(timestamp_builder.finish()), Arc::new(value_builder.finish()), ]; diff --git a/crates/modelardb_types/src/types.rs b/crates/modelardb_types/src/types.rs index 1192f9cb4..851060886 100644 --- a/crates/modelardb_types/src/types.rs +++ b/crates/modelardb_types/src/types.rs @@ -24,13 +24,6 @@ use std::str::FromStr; use crate::error::{ModelarDbTypesError, Result}; -// Types used for a univariate id. 
-pub type UnivariateId = std::primitive::u64; -pub type ArrowUnivariateId = arrow::datatypes::UInt64Type; - -// Types used for a collection of univariate ids. -pub type UnivariateIdBuilder = arrow::array::PrimitiveBuilder; - // Types used for a single timestamp. pub type Timestamp = std::primitive::i64; // It is signed to match TimestampMicrosecondType. pub type ArrowTimestamp = arrow::datatypes::TimestampMicrosecondType; From 0a388dd73b3561ea7cfdccff98e6c2f03976f5d5 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 21:26:39 +0100 Subject: [PATCH 55/69] Reconstruct tag columns in GridExec --- .../modelardb_storage/src/query/grid_exec.rs | 39 ++++++++++++++++++- .../src/query/model_table.rs | 4 +- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index d45491411..40ba58262 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -24,6 +24,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; +use arrow::array::{StringArray, StringBuilder}; use arrow::datatypes::Schema; use async_trait::async_trait; use datafusion::arrow::array::{ @@ -46,6 +47,7 @@ use datafusion::physical_plan::{ }; use futures::stream::{Stream, StreamExt}; use modelardb_compression::{self, MODEL_TYPE_COUNT, MODEL_TYPE_NAMES}; +use modelardb_types::schemas::QUERY_COMPRESSED_SCHEMA; use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueArray, ValueBuilder}; /// An execution plan that reconstructs the data points stored as compressed segments containing @@ -284,6 +286,11 @@ impl GridStream { _error_array ); + let mut tag_arrays = vec![]; + for tag_index in QUERY_COMPRESSED_SCHEMA.0.fields().len()..batch.num_columns() { + tag_arrays.push(modelardb_types::array!(batch, tag_index, StringArray)); + } + // Allocate builders 
with approximately enough capacity. The builders are allocated with // enough capacity for the remaining data points in the current batch and one data point // from each segment in the new batch as each segment contains at least one data point. @@ -292,6 +299,14 @@ impl GridStream { let mut timestamp_builder = TimestampBuilder::with_capacity(current_rows + new_rows); let mut value_builder = ValueBuilder::with_capacity(current_rows + new_rows); + let mut tag_builders = vec![]; + for _ in 0..tag_arrays.len() { + tag_builders.push(StringBuilder::with_capacity( + current_rows + new_rows, + current_rows + new_rows, + )); + } + // Copy over the data points from the current batch to keep the resulting batch sorted. let current_batch = &self.current_batch; // Required as self cannot be passed to array!. timestamp_builder.append_slice( @@ -303,6 +318,13 @@ impl GridStream { [self.current_batch_offset..], ); + for (index, tag_builder) in tag_builders.iter_mut().enumerate() { + let tag_array = modelardb_types::array!(current_batch, index + 2, StringArray); + for i in self.current_batch_offset..current_batch.num_rows() { + tag_builder.append_value(tag_array.value(i)); + } + } + // Reconstruct the data points from the compressed segments. 
for row_index in 0..new_rows { let length_before = value_builder.len(); @@ -320,19 +342,32 @@ impl GridStream { &mut value_builder, ); + let created_rows = value_builder.len() - length_before; + + for (tag_builder, tag_array) in tag_builders.iter_mut().zip(&tag_arrays) { + let tag_value = tag_array.value(row_index); + for _ in 0..created_rows { + tag_builder.append_value(tag_value); + } + } + self.grid_stream_metrics.add( model_type_ids.value(row_index), - value_builder.len() - length_before, + created_rows, !residuals.value(row_index).is_empty(), modelardb_compression::are_compressed_timestamps_regular(timestamps.values()), ); } - let columns: Vec = vec![ + let mut columns: Vec = vec![ Arc::new(timestamp_builder.finish()), Arc::new(value_builder.finish()), ]; + for mut tag_builder in tag_builders { + columns.push(Arc::new(tag_builder.finish())); + } + // Update the current batch, unwrap() is safe as GridStream uses a static schema. // For simplicity, all data points are reconstructed and then pruned by time. 
let current_batch = RecordBatch::try_new(self.schema.clone(), columns).unwrap(); diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index e0e42acfd..ed985bb9e 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -416,9 +416,9 @@ fn new_apache_parquet_exec( let log_store = delta_table.log_store(); let file_scan_config = FileScanConfig { object_store_url: log_store.object_store_url(), - file_schema, + file_schema: file_schema.clone(), file_groups: vec![partitioned_files], - statistics: Statistics::new_unknown(&QUERY_COMPRESSED_SCHEMA.0), + statistics: Statistics::new_unknown(&file_schema), projection: None, limit: maybe_limit, table_partition_cols: vec![], From da3c9d33bbc018e7f06645197300422e647a678c Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 22:58:45 +0100 Subject: [PATCH 56/69] No longer use tag_column_indices when checking for tag columns in projection --- crates/modelardb_storage/src/query/model_table.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index ed985bb9e..9eddfeac3 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -24,6 +24,7 @@ use std::result::Result as StdResult; use std::sync::Arc; use arrow::compute::SortOptions; +use arrow::datatypes::DataType::Utf8; use async_trait::async_trait; use datafusion::arrow::datatypes::{ ArrowPrimitiveType, DataType, Field, Schema, SchemaRef, TimeUnit, @@ -551,9 +552,7 @@ impl TableProvider for ModelTable { let mut stored_columns_in_projection: Vec = Vec::with_capacity(projection.len()); let mut stored_field_columns_in_projection: Vec = - Vec::with_capacity(query_schema.fields.len() - 1 - tag_column_indices.len()); - 
let mut stored_tag_columns_in_projection: Vec<&str> = - Vec::with_capacity(tag_column_indices.len()); + Vec::with_capacity(schema.fields.len() - 1 - tag_column_indices.len()); let mut generated_columns_in_projection: Vec = Vec::with_capacity(query_schema.fields.len() - schema.fields().len()); @@ -561,11 +560,10 @@ impl TableProvider for ModelTable { if *query_schema.field(*query_schema_index).data_type() == ArrowTimestamp::DATA_TYPE { // Timestamp. stored_columns_in_projection.push(SortedJoinColumnType::Timestamp); - } else if tag_column_indices.contains(query_schema_index) { + } else if *query_schema.field(*query_schema_index).data_type() == Utf8 { // Tag. - stored_tag_columns_in_projection - .push(query_schema.fields[*query_schema_index].name()); - stored_columns_in_projection.push(SortedJoinColumnType::Tag); + let tag_column_name = query_schema.fields[*query_schema_index].name().clone(); + stored_columns_in_projection.push(SortedJoinColumnType::Tag(tag_column_name)); } else if let Some(generated_column) = &generated_columns[*query_schema_index] { // Generated field. 
let physical_expr = convert_logical_expr_to_physical_expr( From 27a1cfde20cc2a2edfe80e49da7d43fe206fd3ab Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 23:00:35 +0100 Subject: [PATCH 57/69] Use tag columns in data points in sorted_join() --- .../src/query/sorted_join_exec.rs | 25 ++++++++----------- docs/dev/README.md | 2 +- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index 866154e1a..ffb9a255f 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -26,7 +26,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; -use datafusion::arrow::array::{ArrayRef, StringBuilder}; +use datafusion::arrow::array::ArrayRef; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result as DataFusionResult}; @@ -46,11 +46,11 @@ use futures::stream::{Stream, StreamExt}; pub(crate) enum SortedJoinColumnType { Timestamp, Field, - Tag, + Tag(String), } -/// An execution plan that join arrays of data points sorted by `univariate_id` and `timestamp` from -/// multiple execution plans and tags. It is `pub(crate)` so the additional rules added to Apache +/// An execution plan that join arrays of data points sorted by tag columns and `timestamp` from +/// multiple execution plans. It is `pub(crate)` so the additional rules added to Apache /// DataFusion's physical optimizer can pattern match on it. 
#[derive(Debug)] pub(crate) struct SortedJoinExec { @@ -171,8 +171,8 @@ impl ExecutionPlan for SortedJoinExec { } /// Specify that [`SortedJoinStream`] requires one partition for each input as it assumes that - /// the global sort order is the same for all inputs and Apache Arrow DataFusion only - /// guarantees the sort order within each partition rather than the inputs' global sort order. + /// the sort order is the same for all inputs and Apache Arrow DataFusion only guarantees the + /// sort order within each partition rather than the inputs' global sort order. fn required_input_distribution(&self) -> Vec { vec![Distribution::SinglePartition; self.inputs.len()] } @@ -279,15 +279,11 @@ impl SortedJoinStream { fn sorted_join(&self) -> Poll>> { let mut columns: Vec = Vec::with_capacity(self.schema.fields.len()); - // TODO: Compute the requested tag columns, so they can be assigned to the batch by index. // unwrap() is safe as a record batch is read from each input before this method is called. let batch = self.batches[0].as_ref().unwrap(); - let mut tag_columns: Vec = vec![]; - - // The batches and tags columns are already in the correct order, so they can be appended. + // The batches are already in the correct order, so they can be appended. let mut field_index = 0; - let mut tag_index = 0; for element in &self.return_order { match element { @@ -298,10 +294,9 @@ impl SortedJoinStream { columns.push(batch.column(1).clone()); field_index += 1; } - SortedJoinColumnType::Tag => { - let tags = Arc::new(tag_columns[tag_index].finish()); - columns.push(tags); - tag_index += 1; + SortedJoinColumnType::Tag(tag_column_name) => { + // unwrap() is safe as all tag columns are present in the schema. + columns.push(batch.column_by_name(tag_column_name).unwrap().clone()); } } } diff --git a/docs/dev/README.md b/docs/dev/README.md index f8770417a..5bb855859 100644 --- a/docs/dev/README.md +++ b/docs/dev/README.md @@ -72,7 +72,7 @@ storage. 
- **Test** - Constants and functionality for data generation for use in tests. - [modelardb_types](/crates/modelardb_types) - Library of shared macros and types for use by the other crates. - **Error** - Error type used throughout the crate, a single error type is used for simplicity. - - **Functions** - Functions for operating on the types, e.g., extracting elements from univariate ids. + - **Functions** - Functions for operating on the types. - **Macros** - Macros for extracting an array from a `RecordBatch` and extracting all arrays from a `RecordBatch` with compressed segments. - **Schemas** - Schemas used throughout the ModelarDB project, e.g., for buffers and for Apache Parquet files with From 62e04dcd3587e2ea897edd0e328e8cf1beee068f Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 26 Feb 2025 23:27:18 +0100 Subject: [PATCH 58/69] Reformat and fixed doc and clippy issues --- crates/modelardb_compression/src/compression.rs | 2 +- crates/modelardb_compression/src/models/swing.rs | 4 +--- crates/modelardb_storage/src/lib.rs | 6 +----- crates/modelardb_storage/src/metadata/mod.rs | 4 ++-- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index f7147412c..1783da55c 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -259,7 +259,7 @@ mod tests { use super::*; - use arrow::array::{ArrayBuilder, BinaryArray, Float32Array, UInt64Builder, UInt8Array}; + use arrow::array::{ArrayBuilder, BinaryArray, Float32Array, UInt8Array}; use arrow::datatypes::{DataType, Field}; use modelardb_common::test::data_generation::{self, ValuesStructure}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ZERO}; diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index ea84ad6f7..270d4e497 100644 --- 
a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -24,9 +24,7 @@ //! [ModelarDB paper]: https://www.vldb.org/pvldb/vol11/p1688-jensen.pdf use modelardb_types::schemas::COMPRESSED_METADATA_SIZE_IN_BYTES; -use modelardb_types::types::{ - ErrorBound, Timestamp, TimestampBuilder, Value, ValueBuilder, -}; +use modelardb_types::types::{ErrorBound, Timestamp, TimestampBuilder, Value, ValueBuilder}; use super::timestamps; use crate::models; diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 8c0959a28..8cc17d118 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -129,11 +129,7 @@ pub fn register_model_table( model_table_metadata: Arc, data_sink: Arc, ) -> Result<()> { - let model_table = ModelTable::new( - delta_table, - model_table_metadata.clone(), - data_sink, - ); + let model_table = ModelTable::new(delta_table, model_table_metadata.clone(), data_sink); session_context.register_table(&model_table_metadata.name, model_table)?; diff --git a/crates/modelardb_storage/src/metadata/mod.rs b/crates/modelardb_storage/src/metadata/mod.rs index cac6c8b74..df2525fd9 100644 --- a/crates/modelardb_storage/src/metadata/mod.rs +++ b/crates/modelardb_storage/src/metadata/mod.rs @@ -13,8 +13,8 @@ * limitations under the License. */ -//! Implementation of [`ModelTableMetadata`](crate::ModelTableMetadata) which contains metadata -//! required to interact with model tables and [`TableMetadataManager`](crate::TableMetadataManager) +//! Implementation of [`ModelTableMetadata`](crate::ModelTableMetadata) which contains metadata required +//! to interact with model tables and [`TableMetadataManager`](table_metadata_manager::TableMetadataManager) //! which provides functionality to access table related metadata in the metadata Delta Lake. 
pub mod model_table_metadata; From d4cb178e57234b8fbe983ec2cea9a794535e14d0 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 27 Feb 2025 18:45:42 +0100 Subject: [PATCH 59/69] Fix bug causing INSERT INTO to fail due to schema mismatch --- .../src/storage/data_sinks.rs | 23 +++++++++++++++---- .../src/metadata/model_table_metadata.rs | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/crates/modelardb_server/src/storage/data_sinks.rs b/crates/modelardb_server/src/storage/data_sinks.rs index c5d2b19dc..b0723eeff 100644 --- a/crates/modelardb_server/src/storage/data_sinks.rs +++ b/crates/modelardb_server/src/storage/data_sinks.rs @@ -13,14 +13,16 @@ * limitations under the License. */ -//! Implementation of [`DataSinks`](`DataSink`) that writes -//! [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) to [`StorageEngine`]. +//! Implementation of [`DataSinks`](`DataSink`) that writes [`RecordBatches`](RecordBatch) to +//! [`StorageEngine`]. use std::any::Any; use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::sync::Arc; use async_trait::async_trait; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::arrow::datatypes::{Field, Schema}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::TaskContext; use datafusion::physical_plan::insert::DataSink; @@ -32,9 +34,8 @@ use tokio::sync::RwLock; use crate::storage::StorageEngine; -/// [`DataSink`] that writes [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) to -/// [`StorageEngine`]. Use [`ModelTableDataSink`] for writing multivariate time series to -/// [`StorageEngine`]. +/// [`DataSink`] that writes [`RecordBatches`](RecordBatch) to [`StorageEngine`]. Use +/// [`ModelTableDataSink`] for writing multivariate time series to [`StorageEngine`]. pub struct NormalTableDataSink { /// The name of the normal table inserted data will be written to. 
table_name: String, @@ -152,6 +153,18 @@ impl DataSink for ModelTableDataSink { let record_batch = record_batch?.project(&self.model_table_metadata.query_schema_to_schema)?; + // Manually ensure the fields are not nullable. It is not possible to insert null values + // into model tables but the schema of the record batch may contain nullable fields. + let mut fields: Vec = Vec::with_capacity(record_batch.schema().fields.len()); + for field in record_batch.schema().fields() { + fields.push(Field::new(field.name(), field.data_type().clone(), false)); + } + + let record_batch = RecordBatch::try_new( + Arc::new(Schema::new(fields)), + record_batch.columns().to_vec(), + )?; + data_points_inserted += record_batch.num_rows() as u64; let mut storage_engine = self.storage_engine.write().await; diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 0c416298d..556c33748 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -529,7 +529,7 @@ mod test { assert_eq!( result.unwrap_err().to_string(), - "Invalid argument: The record batch does not match the schema of the model table." + "Invalid Argument Error: The record batch does not match the schema of the model table." 
); } From e74acb295132f2374961cb686ff70c52abd6dbe2 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 27 Feb 2025 18:57:57 +0100 Subject: [PATCH 60/69] Reformat with Rustfmt --- crates/modelardb_client/src/error.rs | 2 +- crates/modelardb_client/src/helper.rs | 4 +- crates/modelardb_client/src/main.rs | 4 +- crates/modelardb_common/src/remote.rs | 2 +- .../modelardb_compression/src/compression.rs | 84 +++--- .../src/models/gorilla.rs | 10 +- crates/modelardb_compression/src/types.rs | 4 +- crates/modelardb_manager/src/cluster.rs | 14 +- crates/modelardb_manager/src/error.rs | 2 +- crates/modelardb_manager/src/metadata.rs | 26 +- crates/modelardb_manager/src/remote.rs | 4 +- crates/modelardb_server/src/configuration.rs | 2 +- crates/modelardb_server/src/context.rs | 256 +++++++++++------- crates/modelardb_server/src/data_folders.rs | 12 +- crates/modelardb_server/src/error.rs | 2 +- crates/modelardb_server/src/manager.rs | 4 +- crates/modelardb_server/src/remote.rs | 8 +- .../src/storage/data_sinks.rs | 2 +- .../src/storage/data_transfer.rs | 26 +- crates/modelardb_server/src/storage/types.rs | 6 +- .../src/storage/uncompressed_data_buffer.rs | 2 +- .../src/storage/uncompressed_data_manager.rs | 78 +++--- .../tests/integration_test.rs | 16 +- crates/modelardb_storage/src/delta_lake.rs | 4 +- crates/modelardb_storage/src/error.rs | 2 +- crates/modelardb_storage/src/lib.rs | 4 +- .../src/metadata/model_table_metadata.rs | 4 +- .../src/metadata/table_metadata_manager.rs | 83 +++--- crates/modelardb_storage/src/parser.rs | 228 +++++++++------- .../src/query/generated_as_exec.rs | 2 +- .../src/query/metadata_table.rs | 2 +- .../src/query/model_table.rs | 4 +- .../src/query/normal_table.rs | 2 +- crates/modelardb_storage/src/test.rs | 2 +- 34 files changed, 521 insertions(+), 386 deletions(-) diff --git a/crates/modelardb_client/src/error.rs b/crates/modelardb_client/src/error.rs index d80c0a825..96e72fdcc 100644 --- 
a/crates/modelardb_client/src/error.rs +++ b/crates/modelardb_client/src/error.rs @@ -23,8 +23,8 @@ use std::result::Result as StdResult; use arrow::error::ArrowError; use object_store::Error as ObjectStoreError; use rustyline::error::ReadlineError as RustyLineError; -use tonic::transport::Error as TonicTransportError; use tonic::Status as TonicStatusError; +use tonic::transport::Error as TonicTransportError; /// Result type used throughout `modelardb_client`. pub type Result = StdResult; diff --git a/crates/modelardb_client/src/helper.rs b/crates/modelardb_client/src/helper.rs index f15d2a4fb..49ccf2683 100644 --- a/crates/modelardb_client/src/helper.rs +++ b/crates/modelardb_client/src/helper.rs @@ -19,13 +19,13 @@ use std::result::Result; +use rustyline::Context; +use rustyline::Helper; use rustyline::completion::{self, Completer}; use rustyline::error::ReadlineError; use rustyline::highlight::Highlighter; use rustyline::hint::Hinter; use rustyline::validate::Validator; -use rustyline::Context; -use rustyline::Helper; /// Provides tab-completion for the client's read-eval-print loop. 
pub struct ClientHelper { diff --git a/crates/modelardb_client/src/main.rs b/crates/modelardb_client/src/main.rs index f2e6c1786..fec58c286 100644 --- a/crates/modelardb_client/src/main.rs +++ b/crates/modelardb_client/src/main.rs @@ -31,13 +31,13 @@ use arrow::datatypes::{Schema, SchemaRef, ToByteSlice}; use arrow::ipc::convert; use arrow::util::pretty; use arrow_flight::flight_service_client::FlightServiceClient; -use arrow_flight::{utils, Action, Criteria, FlightData, FlightDescriptor, Ticket}; +use arrow_flight::{Action, Criteria, FlightData, FlightDescriptor, Ticket, utils}; use bytes::Bytes; use object_store::local::LocalFileSystem; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; -use rustyline::history::FileHistory; use rustyline::Editor; +use rustyline::history::FileHistory; use tonic::transport::Channel; use tonic::{Request, Streaming}; diff --git a/crates/modelardb_common/src/remote.rs b/crates/modelardb_common/src/remote.rs index 3224513eb..1e0450499 100644 --- a/crates/modelardb_common/src/remote.rs +++ b/crates/modelardb_common/src/remote.rs @@ -21,7 +21,7 @@ use std::error::Error; use arrow::array::ArrayRef; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; -use arrow_flight::{utils, FlightData, FlightDescriptor}; +use arrow_flight::{FlightData, FlightDescriptor, utils}; use tonic::Status; /// Return the table stored as the first element in [`FlightDescriptor.path`], otherwise a diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 1783da55c..1a524ff19 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -25,7 +25,7 @@ use modelardb_types::types::{ErrorBound, TimestampArray, ValueArray}; use crate::error::{ModelarDbCompressionError, Result}; use crate::models::gorilla::Gorilla; -use crate::models::{self, timestamps, GORILLA_ID}; +use crate::models::{self, GORILLA_ID, 
timestamps}; use crate::types::{CompressedSegmentBatchBuilder, CompressedSegmentBuilder, ModelBuilder}; /// Maximum number of residuals that can be stored as part of a compressed segment. The number of @@ -266,7 +266,7 @@ mod tests { use modelardb_types::schemas::COMPRESSED_SCHEMA; use modelardb_types::types::{TimestampBuilder, ValueBuilder}; - use crate::{models, MODEL_TYPE_NAMES}; + use crate::{MODEL_TYPE_NAMES, models}; const TAG_VALUE: &str = "tag"; const ADD_NOISE_RANGE: Option> = Some(1.0..1.05); @@ -531,8 +531,8 @@ mod tests { } #[test] - fn test_try_compress_regular_random_linear_constant_time_series_within_absolute_error_bound_zero( - ) { + fn test_try_compress_regular_random_linear_constant_time_series_within_absolute_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), false, @@ -542,8 +542,8 @@ mod tests { } #[test] - fn test_try_compress_regular_random_linear_constant_time_series_within_relative_error_bound_zero( - ) { + fn test_try_compress_regular_random_linear_constant_time_series_within_relative_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), false, @@ -553,8 +553,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_random_linear_constant_time_series_within_absolute_error_bound_zero( - ) { + fn test_try_compress_irregular_random_linear_constant_time_series_within_absolute_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), true, @@ -564,8 +564,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_random_linear_constant_time_series_within_relative_error_bound_zero( - ) { + fn test_try_compress_irregular_random_linear_constant_time_series_within_relative_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), true, @@ -575,8 +575,8 @@ mod 
tests { } #[test] - fn test_try_compress_regular_constant_linear_random_time_series_within_absolute_error_bound_zero( - ) { + fn test_try_compress_regular_constant_linear_random_time_series_within_absolute_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), false, @@ -586,8 +586,8 @@ mod tests { } #[test] - fn test_try_compress_regular_constant_linear_random_time_series_within_relative_error_bound_zero( - ) { + fn test_try_compress_regular_constant_linear_random_time_series_within_relative_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), false, @@ -597,8 +597,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_constant_linear_random_time_series_within_absolute_error_bound_zero( - ) { + fn test_try_compress_irregular_constant_linear_random_time_series_within_absolute_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), true, @@ -608,8 +608,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_constant_linear_random_time_series_within_relative_error_bound_zero( - ) { + fn test_try_compress_irregular_constant_linear_random_time_series_within_relative_error_bound_zero() + { generate_compress_and_assert_known_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), true, @@ -704,8 +704,8 @@ mod tests { } #[test] - fn test_try_compress_regular_synthetic_time_series_without_noise_within_absolute_error_bound_zero( - ) { + fn test_try_compress_regular_synthetic_time_series_without_noise_within_absolute_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), false, @@ -714,8 +714,8 @@ mod tests { } #[test] - fn test_try_compress_regular_synthetic_time_series_without_noise_within_relative_error_bound_zero( - ) { + fn 
test_try_compress_regular_synthetic_time_series_without_noise_within_relative_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), false, @@ -724,8 +724,8 @@ mod tests { } #[test] - fn test_try_compress_regular_synthetic_time_series_without_noise_within_absolute_error_bound_five( - ) { + fn test_try_compress_regular_synthetic_time_series_without_noise_within_absolute_error_bound_five() + { generate_compress_and_assert_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_FIVE).unwrap(), false, @@ -734,8 +734,8 @@ mod tests { } #[test] - fn test_try_compress_regular_synthetic_time_series_without_noise_within_relative_error_bound_five( - ) { + fn test_try_compress_regular_synthetic_time_series_without_noise_within_relative_error_bound_five() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_FIVE).unwrap(), false, @@ -784,8 +784,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_without_noise_within_absolute_error_bound_zero( - ) { + fn test_try_compress_irregular_synthetic_time_series_without_noise_within_absolute_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), true, @@ -794,8 +794,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_without_noise_within_relative_error_bound_zero( - ) { + fn test_try_compress_irregular_synthetic_time_series_without_noise_within_relative_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), true, @@ -804,8 +804,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_without_noise_within_absolute_error_bound_five( - ) { + fn test_try_compress_irregular_synthetic_time_series_without_noise_within_absolute_error_bound_five() + { generate_compress_and_assert_time_series( 
ErrorBound::try_new_absolute(ERROR_BOUND_FIVE).unwrap(), true, @@ -814,8 +814,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_without_noise_within_relative_error_bound_five( - ) { + fn test_try_compress_irregular_synthetic_time_series_without_noise_within_relative_error_bound_five() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_FIVE).unwrap(), true, @@ -824,8 +824,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_with_noise_within_absolute_error_bound_zero( - ) { + fn test_try_compress_irregular_synthetic_time_series_with_noise_within_absolute_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), true, @@ -834,8 +834,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_with_noise_within_relative_error_bound_zero( - ) { + fn test_try_compress_irregular_synthetic_time_series_with_noise_within_relative_error_bound_zero() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), true, @@ -844,8 +844,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_with_noise_within_absolute_error_bound_five( - ) { + fn test_try_compress_irregular_synthetic_time_series_with_noise_within_absolute_error_bound_five() + { generate_compress_and_assert_time_series( ErrorBound::try_new_absolute(ERROR_BOUND_FIVE).unwrap(), true, @@ -854,8 +854,8 @@ mod tests { } #[test] - fn test_try_compress_irregular_synthetic_time_series_with_noise_within_relative_error_bound_five( - ) { + fn test_try_compress_irregular_synthetic_time_series_with_noise_within_relative_error_bound_five() + { generate_compress_and_assert_time_series( ErrorBound::try_new_relative(ERROR_BOUND_FIVE).unwrap(), true, diff --git a/crates/modelardb_compression/src/models/gorilla.rs b/crates/modelardb_compression/src/models/gorilla.rs index 
cadb6614e..587adb917 100644 --- a/crates/modelardb_compression/src/models/gorilla.rs +++ b/crates/modelardb_compression/src/models/gorilla.rs @@ -25,8 +25,8 @@ use modelardb_types::types::{Timestamp, Value, ValueBuilder}; use crate::models; -use crate::models::bits::{BitReader, BitVecBuilder}; use crate::models::ErrorBound; +use crate::models::bits::{BitReader, BitVecBuilder}; /// The state the Gorilla model type needs while compressing the values of a /// time series segment. @@ -519,9 +519,11 @@ mod tests { let values_array = value_builder.finish(); assert!(values.len() == timestamps.len() && values.len() == values_array.len()); - assert!(timestamps - .windows(2) - .all(|window| window[1] - window[0] == 1)); + assert!( + timestamps + .windows(2) + .all(|window| window[1] - window[0] == 1) + ); assert!(slice_of_value_equal(values_array.values(), values)); } diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index 34802adf0..74aebb2cf 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -19,7 +19,7 @@ use std::debug_assert; use std::sync::Arc; use arrow::array::{ - ArrayBuilder, ArrayRef, BinaryBuilder, Float32Builder, StringArray, UInt16Array, UInt8Builder, + ArrayBuilder, ArrayRef, BinaryBuilder, Float32Builder, StringArray, UInt8Builder, UInt16Array, }; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; @@ -30,8 +30,8 @@ use modelardb_types::types::{ use crate::models::gorilla::Gorilla; use crate::models::pmc_mean::PMCMean; use crate::models::swing::Swing; -use crate::models::{timestamps, VALUE_SIZE_IN_BYTES}; use crate::models::{PMC_MEAN_ID, SWING_ID}; +use crate::models::{VALUE_SIZE_IN_BYTES, timestamps}; /// A model being built from an uncompressed segment using the potentially lossy model types in /// [`models`]. 
Each of the potentially lossy model types is used to fit models to the data points, diff --git a/crates/modelardb_manager/src/cluster.rs b/crates/modelardb_manager/src/cluster.rs index b5f80b0b7..c4b9eaaea 100644 --- a/crates/modelardb_manager/src/cluster.rs +++ b/crates/modelardb_manager/src/cluster.rs @@ -20,13 +20,13 @@ use std::collections::VecDeque; use arrow::record_batch::RecordBatch; use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::{Action, Ticket}; -use futures::stream::FuturesUnordered; use futures::StreamExt; +use futures::stream::FuturesUnordered; use log::info; use modelardb_types::schemas::TABLE_METADATA_SCHEMA; use modelardb_types::types::ServerMode; -use tonic::metadata::{Ascii, MetadataValue}; use tonic::Request; +use tonic::metadata::{Ascii, MetadataValue}; use crate::error::{ModelarDbManagerError, Result}; @@ -288,10 +288,12 @@ mod test { #[tokio::test] async fn test_remove_node_invalid_url() { let mut cluster = Cluster::new(); - assert!(cluster - .remove_node("invalid_url", &Uuid::new_v4().to_string().parse().unwrap()) - .await - .is_err()); + assert!( + cluster + .remove_node("invalid_url", &Uuid::new_v4().to_string().parse().unwrap()) + .await + .is_err() + ); } #[test] diff --git a/crates/modelardb_manager/src/error.rs b/crates/modelardb_manager/src/error.rs index 48036d029..16990450f 100644 --- a/crates/modelardb_manager/src/error.rs +++ b/crates/modelardb_manager/src/error.rs @@ -23,8 +23,8 @@ use std::result::Result as StdResult; use deltalake::errors::DeltaTableError; use modelardb_common::error::ModelarDbCommonError; use modelardb_storage::error::ModelarDbStorageError; -use tonic::transport::Error as TonicTransportError; use tonic::Status as TonicStatusError; +use tonic::transport::Error as TonicTransportError; /// Result type used throughout `modelardb_manager`. 
pub type Result = StdResult; diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index f45a0ed28..a3529fb9c 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -21,9 +21,9 @@ use std::sync::Arc; use arrow::array::{Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; +use deltalake::DeltaTableError; use deltalake::datafusion::logical_expr::{col, lit}; use deltalake::datafusion::prelude::SessionContext; -use deltalake::DeltaTableError; use modelardb_storage::delta_lake::DeltaLake; use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_storage::{register_metadata_table, sql_and_concat}; @@ -205,17 +205,21 @@ mod tests { let (_temp_dir, metadata_manager) = create_metadata_manager().await; // Verify that the tables were created, registered, and has the expected columns. - assert!(metadata_manager - .session_context - .sql("SELECT key FROM manager_metadata") - .await - .is_ok()); + assert!( + metadata_manager + .session_context + .sql("SELECT key FROM manager_metadata") + .await + .is_ok() + ); - assert!(metadata_manager - .session_context - .sql("SELECT url, mode FROM nodes") - .await - .is_ok()); + assert!( + metadata_manager + .session_context + .sql("SELECT url, mode FROM nodes") + .await + .is_ok() + ); } #[tokio::test] diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 10c55c415..fd0d5b32e 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -33,7 +33,7 @@ use arrow_flight::{ HandshakeRequest, HandshakeResponse, PollInfo, PutResult, Result as FlightResult, SchemaAsIpc, SchemaResult, Ticket, }; -use futures::{stream, Stream}; +use futures::{Stream, stream}; use modelardb_common::arguments; use modelardb_common::remote; use modelardb_common::remote::{error_to_status_internal, error_to_status_invalid_argument}; @@ -47,9 +47,9 
@@ use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; use tracing::info; +use crate::Context; use crate::cluster::Node; use crate::error::{ModelarDbManagerError, Result}; -use crate::Context; /// Start an Apache Arrow Flight server on 0.0.0.0:`port`. pub fn start_apache_arrow_flight_server( diff --git a/crates/modelardb_server/src/configuration.rs b/crates/modelardb_server/src/configuration.rs index 402f12767..39a6de00b 100644 --- a/crates/modelardb_server/src/configuration.rs +++ b/crates/modelardb_server/src/configuration.rs @@ -21,9 +21,9 @@ use std::sync::Arc; use tokio::sync::RwLock; +use crate::ClusterMode; use crate::error::Result; use crate::storage::StorageEngine; -use crate::ClusterMode; /// Manages the system's configuration and provides functionality for updating the configuration. #[derive(Clone)] diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 857e438ac..59c2f7b0d 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -29,8 +29,8 @@ use tracing::info; use crate::configuration::ConfigurationManager; use crate::error::{ModelarDbServerError, Result}; -use crate::storage::data_sinks::{ModelTableDataSink, NormalTableDataSink}; use crate::storage::StorageEngine; +use crate::storage::data_sinks::{ModelTableDataSink, NormalTableDataSink}; use crate::{ClusterMode, DataFolders}; /// Provides access to the system's configuration and components. @@ -445,15 +445,19 @@ mod tests { .unwrap(); // Both a normal table and a model table should be created. 
- assert!(context - .check_if_table_exists(test::NORMAL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::NORMAL_TABLE_NAME) + .await + .is_err() + ); - assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -476,19 +480,23 @@ mod tests { assert!(folder_path.exists()); // The normal table should be saved to the metadata Delta Lake. - assert!(context - .data_folders - .local_data_folder - .table_metadata_manager - .is_normal_table(test::NORMAL_TABLE_NAME) - .await - .unwrap()); + assert!( + context + .data_folders + .local_data_folder + .table_metadata_manager + .is_normal_table(test::NORMAL_TABLE_NAME) + .await + .unwrap() + ); // The normal table should be registered in the Apache DataFusion catalog. - assert!(context - .check_if_table_exists(test::NORMAL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::NORMAL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -496,15 +504,19 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .create_normal_table(test::NORMAL_TABLE_NAME, &test::normal_table_schema()) - .await - .is_ok()); + assert!( + context + .create_normal_table(test::NORMAL_TABLE_NAME, &test::normal_table_schema()) + .await + .is_ok() + ); - assert!(context - .create_normal_table(test::NORMAL_TABLE_NAME, &test::normal_table_schema()) - .await - .is_err()); + assert!( + context + .create_normal_table(test::NORMAL_TABLE_NAME, &test::normal_table_schema()) + .await + .is_err() + ); } #[tokio::test] @@ -532,10 +544,12 @@ mod tests { ); // The model table should be registered in the Apache DataFusion catalog. 
- assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -543,15 +557,19 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .create_model_table(&test::model_table_metadata()) - .await - .is_ok()); + assert!( + context + .create_model_table(&test::model_table_metadata()) + .await + .is_ok() + ); - assert!(context - .create_model_table(&test::model_table_metadata()) - .await - .is_err()); + assert!( + context + .create_model_table(&test::model_table_metadata()) + .await + .is_err() + ); } #[tokio::test] @@ -604,27 +622,33 @@ mod tests { .await .unwrap(); - assert!(context - .check_if_table_exists(test::NORMAL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::NORMAL_TABLE_NAME) + .await + .is_err() + ); context.drop_table(test::NORMAL_TABLE_NAME).await.unwrap(); // The normal table should be deregistered from the Apache DataFusion session context. - assert!(context - .check_if_table_exists(test::NORMAL_TABLE_NAME) - .await - .is_ok()); + assert!( + context + .check_if_table_exists(test::NORMAL_TABLE_NAME) + .await + .is_ok() + ); // The normal table should be deleted from the metadata Delta Lake. - assert!(!context - .data_folders - .local_data_folder - .table_metadata_manager - .is_normal_table(test::NORMAL_TABLE_NAME) - .await - .unwrap()); + assert!( + !context + .data_folders + .local_data_folder + .table_metadata_manager + .is_normal_table(test::NORMAL_TABLE_NAME) + .await + .unwrap() + ); // The normal table should be deleted from the Delta Lake. 
assert!(!temp_dir.path().join("tables").exists()); @@ -640,27 +664,33 @@ mod tests { .await .unwrap(); - assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_err() + ); context.drop_table(test::MODEL_TABLE_NAME).await.unwrap(); // The model table should be deregistered from the Apache DataFusion session context. - assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_ok()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_ok() + ); // The model table should be deleted from the metadata Delta Lake. - assert!(!context - .data_folders - .local_data_folder - .table_metadata_manager - .is_model_table(test::MODEL_TABLE_NAME) - .await - .unwrap()); + assert!( + !context + .data_folders + .local_data_folder + .table_metadata_manager + .is_model_table(test::MODEL_TABLE_NAME) + .await + .unwrap() + ); // The model table should be deleted from the Delta Lake. assert!(!temp_dir.path().join("tables").exists()); @@ -710,11 +740,13 @@ mod tests { .unwrap(); // The normal table should not be deleted from the metadata Delta Lake. - assert!(local_data_folder - .table_metadata_manager - .is_normal_table(test::NORMAL_TABLE_NAME) - .await - .unwrap()); + assert!( + local_data_folder + .table_metadata_manager + .is_normal_table(test::NORMAL_TABLE_NAME) + .await + .unwrap() + ); // The normal table data should be deleted from the Delta Lake. delta_table.load().await.unwrap(); @@ -755,11 +787,13 @@ mod tests { .unwrap(); // The model table should not be deleted from the metadata Delta Lake. - assert!(local_data_folder - .table_metadata_manager - .is_model_table(test::MODEL_TABLE_NAME) - .await - .unwrap()); + assert!( + local_data_folder + .table_metadata_manager + .is_model_table(test::MODEL_TABLE_NAME) + .await + .unwrap() + ); // The model table data should be deleted from the Delta Lake. 
delta_table.load().await.unwrap(); @@ -771,10 +805,12 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .truncate_table(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .truncate_table(test::MODEL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -806,11 +842,13 @@ mod tests { .await .unwrap(); - assert!(context - .model_table_metadata_from_default_database_schema(test::NORMAL_TABLE_NAME) - .await - .unwrap() - .is_none()); + assert!( + context + .model_table_metadata_from_default_database_schema(test::NORMAL_TABLE_NAME) + .await + .unwrap() + .is_none() + ); } #[tokio::test] @@ -818,10 +856,12 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .model_table_metadata_from_default_database_schema(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .model_table_metadata_from_default_database_schema(test::MODEL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -834,10 +874,12 @@ mod tests { .await .unwrap(); - assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_err()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_err() + ); } #[tokio::test] @@ -845,10 +887,12 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .check_if_table_exists(test::MODEL_TABLE_NAME) - .await - .is_ok()); + assert!( + context + .check_if_table_exists(test::MODEL_TABLE_NAME) + .await + .is_ok() + ); } #[tokio::test] @@ -874,10 +918,12 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; - assert!(context - .schema_of_table_in_default_database_schema(test::MODEL_TABLE_NAME) - .await - .is_err()) + assert!( + context + .schema_of_table_in_default_database_schema(test::MODEL_TABLE_NAME) + .await + 
.is_err() + ) } /// Create a simple [`Context`] that uses `temp_dir` as the local data folder and query data folder. diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index e59fbbeae..0b22499a7 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -22,10 +22,10 @@ use modelardb_storage::delta_lake::DeltaLake; use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_types::types::ServerMode; -use crate::error::ModelarDbServerError; -use crate::manager::Manager; use crate::ClusterMode; use crate::Result; +use crate::error::ModelarDbServerError; +use crate::manager::Manager; /// Folder for storing metadata and data in Apache Parquet files. #[derive(Clone)] @@ -163,9 +163,11 @@ mod tests { // Tests for try_from_command_line_arguments(). #[tokio::test] async fn test_try_from_empty_command_line_arguments() { - assert!(DataFolders::try_from_command_line_arguments(&[]) - .await - .is_err()); + assert!( + DataFolders::try_from_command_line_arguments(&[]) + .await + .is_err() + ); } #[tokio::test] diff --git a/crates/modelardb_server/src/error.rs b/crates/modelardb_server/src/error.rs index 69e82fdbb..b0acb0f5f 100644 --- a/crates/modelardb_server/src/error.rs +++ b/crates/modelardb_server/src/error.rs @@ -28,8 +28,8 @@ use deltalake::errors::DeltaTableError; use modelardb_common::error::ModelarDbCommonError; use modelardb_storage::error::ModelarDbStorageError; use object_store::Error as ObjectStoreError; -use tonic::transport::Error as TonicTransportError; use tonic::Status as TonicStatusError; +use tonic::transport::Error as TonicTransportError; /// Result type used throughout `modelardb_server`. 
pub type Result = StdResult; diff --git a/crates/modelardb_server/src/manager.rs b/crates/modelardb_server/src/manager.rs index c3baeb652..df5b4abe1 100644 --- a/crates/modelardb_server/src/manager.rs +++ b/crates/modelardb_server/src/manager.rs @@ -24,13 +24,13 @@ use arrow_flight::{Action, Result as FlightResult}; use modelardb_common::arguments; use modelardb_types::types::ServerMode; use tokio::sync::RwLock; +use tonic::Request; use tonic::metadata::MetadataMap; use tonic::transport::Channel; -use tonic::Request; +use crate::PORT; use crate::context::Context; use crate::error::{ModelarDbServerError, Result}; -use crate::PORT; /// Manages metadata related to the manager and provides functionality for interacting with the manager. #[derive(Clone, Debug)] diff --git a/crates/modelardb_server/src/remote.rs b/crates/modelardb_server/src/remote.rs index 51d61ab0c..33de90ad8 100644 --- a/crates/modelardb_server/src/remote.rs +++ b/crates/modelardb_server/src/remote.rs @@ -27,9 +27,9 @@ use std::sync::Arc; use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::flight_service_server::{FlightService, FlightServiceServer}; use arrow_flight::{ - utils, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PollInfo, PutResult, Result as FlightResult, SchemaAsIpc, - SchemaResult, Ticket, + SchemaResult, Ticket, utils, }; use datafusion::arrow::array::{ArrayRef, StringArray, UInt64Array}; use datafusion::arrow::datatypes::SchemaRef; @@ -40,8 +40,8 @@ use datafusion::execution::RecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{EmptyRecordBatchStream, SendableRecordBatchStream}; use deltalake::arrow::datatypes::Schema; -use futures::stream::{self, BoxStream, SelectAll}; use futures::StreamExt; +use futures::stream::{self, BoxStream, SelectAll}; use 
modelardb_common::remote::{error_to_status_internal, error_to_status_invalid_argument}; use modelardb_common::{arguments, remote}; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; @@ -57,9 +57,9 @@ use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; use tracing::{debug, error, info}; +use crate::ClusterMode; use crate::context::Context; use crate::error::{ModelarDbServerError, Result}; -use crate::ClusterMode; /// Start an Apache Arrow Flight server on 0.0.0.0:`port` that passes `context` to the methods that /// process the requests through [`FlightServiceHandler`]. diff --git a/crates/modelardb_server/src/storage/data_sinks.rs b/crates/modelardb_server/src/storage/data_sinks.rs index b0723eeff..57a98d601 100644 --- a/crates/modelardb_server/src/storage/data_sinks.rs +++ b/crates/modelardb_server/src/storage/data_sinks.rs @@ -21,8 +21,8 @@ use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::sync::Arc; use async_trait::async_trait; -use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{Field, Schema}; +use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::TaskContext; use datafusion::physical_plan::insert::DataSink; diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 8b64b2cd3..3db5ebf19 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -321,10 +321,12 @@ mod tests { let (_normal_table_files_size, model_table_files_size) = write_batches_to_tables(&local_data_folder, 1).await; - assert!(data_transfer - .increase_table_size(test::MODEL_TABLE_NAME, model_table_files_size) - .await - .is_ok()); + assert!( + data_transfer + .increase_table_size(test::MODEL_TABLE_NAME, model_table_files_size) + .await + .is_ok() + ); assert_eq!( 
*data_transfer @@ -370,9 +372,11 @@ mod tests { create_data_transfer_component(local_data_folder.clone()).await; data_transfer.mark_table_as_dropped(test::MODEL_TABLE_NAME); - assert!(data_transfer - .dropped_tables - .contains(test::MODEL_TABLE_NAME)); + assert!( + data_transfer + .dropped_tables + .contains(test::MODEL_TABLE_NAME) + ); } #[tokio::test] @@ -398,9 +402,11 @@ mod tests { // The table should be removed from the in-memory tracking of compressed files and removed // from the dropped tables. - assert!(!data_transfer - .table_size_in_bytes - .contains_key(test::MODEL_TABLE_NAME)); + assert!( + !data_transfer + .table_size_in_bytes + .contains_key(test::MODEL_TABLE_NAME) + ); assert!(data_transfer.dropped_tables.is_empty()); } diff --git a/crates/modelardb_server/src/storage/types.rs b/crates/modelardb_server/src/storage/types.rs index 329cfedfc..04ea7d177 100644 --- a/crates/modelardb_server/src/storage/types.rs +++ b/crates/modelardb_server/src/storage/types.rs @@ -492,8 +492,10 @@ mod tests { test::COMPRESSED_RESERVED_MEMORY_IN_BYTES as isize ); - assert!(!memory_pool - .try_reserve_compressed_memory(2 * test::COMPRESSED_RESERVED_MEMORY_IN_BYTES)); + assert!( + !memory_pool + .try_reserve_compressed_memory(2 * test::COMPRESSED_RESERVED_MEMORY_IN_BYTES) + ); assert_eq!( memory_pool.remaining_compressed_memory_in_bytes(), diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 5da1a6c63..4ad3f7e18 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -27,8 +27,8 @@ use datafusion::arrow::compute; use datafusion::arrow::record_batch::RecordBatch; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; use modelardb_types::types::{Timestamp, TimestampArray, TimestampBuilder, Value, ValueBuilder}; -use object_store::path::Path; use 
object_store::ObjectStore; +use object_store::path::Path; use tracing::debug; use crate::error::Result; diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index d451bc497..31f67a8ac 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -19,8 +19,8 @@ use std::hash::{DefaultHasher, Hasher}; use std::io::{Error as IOError, ErrorKind as IOErrorKind}; use std::mem; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use dashmap::DashMap; use futures::StreamExt; @@ -33,6 +33,7 @@ use tracing::{debug, error, warn}; use crate::context::Context; use crate::data_folders::DataFolder; use crate::error::Result; +use crate::storage::UNCOMPRESSED_DATA_FOLDER; use crate::storage::compressed_data_buffer::CompressedSegmentBatch; use crate::storage::types::Channels; use crate::storage::types::MemoryPool; @@ -41,7 +42,6 @@ use crate::storage::uncompressed_data_buffer::{ self, IngestedDataBuffer, UncompressedDataBuffer, UncompressedInMemoryDataBuffer, UncompressedOnDiskDataBuffer, }; -use crate::storage::UNCOMPRESSED_DATA_FOLDER; /// Stores uncompressed data points temporarily in an in-memory buffer that spills to Apache Parquet /// files. When an uncompressed data buffer is finished the data is made available for compression. 
@@ -663,7 +663,7 @@ mod tests { use modelardb_types::types::{TimestampBuilder, ValueBuilder}; use object_store::local::LocalFileSystem; use tempfile::TempDir; - use tokio::time::{sleep, Duration}; + use tokio::time::{Duration, sleep}; use crate::storage::UNCOMPRESSED_DATA_BUFFER_CAPACITY; use crate::{ClusterMode, DataFolders}; @@ -806,9 +806,11 @@ mod tests { insert_data_points(1, &mut data_manager, &model_table_metadata, TAG_VALUE).await; - assert!(data_manager - .uncompressed_in_memory_data_buffers - .contains_key(&TAG_HASH)); + assert!( + data_manager + .uncompressed_in_memory_data_buffers + .contains_key(&TAG_HASH) + ); assert_eq!( data_manager .uncompressed_in_memory_data_buffers @@ -832,9 +834,11 @@ mod tests { assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); - assert!(data_manager - .uncompressed_in_memory_data_buffers - .contains_key(&TAG_HASH)); + assert!( + data_manager + .uncompressed_in_memory_data_buffers + .contains_key(&TAG_HASH) + ); assert_eq!( data_manager .uncompressed_in_memory_data_buffers @@ -862,9 +866,11 @@ mod tests { assert_eq!(data_manager.uncompressed_in_memory_data_buffers.len(), 1); assert_eq!(data_manager.uncompressed_on_disk_data_buffers.len(), 0); - assert!(data_manager - .uncompressed_in_memory_data_buffers - .contains_key(&TAG_HASH)); + assert!( + data_manager + .uncompressed_in_memory_data_buffers + .contains_key(&TAG_HASH) + ); assert_eq!( data_manager .uncompressed_in_memory_data_buffers @@ -950,11 +956,13 @@ mod tests { ) .await; - assert!(data_manager - .channels - .uncompressed_data_receiver - .try_recv() - .is_ok()); + assert!( + data_manager + .channels + .uncompressed_data_receiver + .try_recv() + .is_ok() + ); } #[tokio::test] @@ -970,17 +978,21 @@ mod tests { ) .await; - assert!(data_manager - .channels - .uncompressed_data_receiver - .try_recv() - .is_ok()); + assert!( + data_manager + .channels + .uncompressed_data_receiver + 
.try_recv() + .is_ok() + ); - assert!(data_manager - .channels - .uncompressed_data_receiver - .try_recv() - .is_ok()); + assert!( + data_manager + .channels + .uncompressed_data_receiver + .try_recv() + .is_ok() + ); } #[tokio::test] @@ -988,11 +1000,13 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let (data_manager, _model_table_metadata) = create_managers(&temp_dir).await; - assert!(data_manager - .channels - .uncompressed_data_receiver - .try_recv() - .is_err()); + assert!( + data_manager + .channels + .uncompressed_data_receiver + .try_recv() + .is_err() + ); } #[tokio::test] diff --git a/crates/modelardb_server/tests/integration_test.rs b/crates/modelardb_server/tests/integration_test.rs index 10504e064..e123651cb 100644 --- a/crates/modelardb_server/tests/integration_test.rs +++ b/crates/modelardb_server/tests/integration_test.rs @@ -23,13 +23,13 @@ use std::ops::Range; use std::process::{Child, Command, Stdio}; use std::str; use std::string::String; -use std::sync::atomic::{AtomicU16, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU16, Ordering}; use std::thread; use std::time::Duration; use arrow_flight::flight_service_client::FlightServiceClient; -use arrow_flight::{utils, Action, Criteria, FlightData, FlightDescriptor, PutResult, Ticket}; +use arrow_flight::{Action, Criteria, FlightData, FlightDescriptor, PutResult, Ticket, utils}; use bytes::{Buf, Bytes}; use datafusion::arrow::array::{Array, Float64Array, StringArray, UInt64Array}; use datafusion::arrow::compute; @@ -38,7 +38,7 @@ use datafusion::arrow::ipc::convert; use datafusion::arrow::ipc::reader::StreamReader; use datafusion::arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; use datafusion::arrow::record_batch::RecordBatch; -use futures::{stream, StreamExt}; +use futures::{StreamExt, stream}; use modelardb_common::test::data_generation; use modelardb_types::types::ErrorBound; use sysinfo::{Pid, ProcessesToUpdate, System}; @@ -452,7 +452,7 @@ 
impl TestContext { let schema_result = self .client .get_schema(Request::new(FlightDescriptor::new_path(vec![ - table_name.to_owned() + table_name.to_owned(), ]))) .await .unwrap() @@ -949,9 +949,11 @@ fn test_cannot_ingest_invalid_time_series() { test_context.create_table(TABLE_NAME, TableType::ModelTable); - assert!(test_context - .send_time_series_to_server(flight_data) - .is_err()); + assert!( + test_context + .send_time_series_to_server(flight_data) + .is_err() + ); test_context.flush_data_to_disk(); diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index cebb04e0d..2b810176a 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -32,15 +32,15 @@ use deltalake::{DeltaOps, DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_common::arguments; use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; +use object_store::ObjectStore; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; use object_store::path::Path; -use object_store::ObjectStore; use url::Url; use crate::error::{ModelarDbStorageError, Result}; use crate::metadata::model_table_metadata::ModelTableMetadata; -use crate::{apache_parquet_writer_properties, METADATA_FOLDER, TABLE_FOLDER}; +use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties}; /// Functionality for managing Delta Lake tables in a local folder or an object store. 
pub struct DeltaLake { diff --git a/crates/modelardb_storage/src/error.rs b/crates/modelardb_storage/src/error.rs index 5e6c6a781..24668da90 100644 --- a/crates/modelardb_storage/src/error.rs +++ b/crates/modelardb_storage/src/error.rs @@ -26,8 +26,8 @@ use datafusion::parquet::errors::ParquetError; use deltalake::errors::DeltaTableError; use modelardb_common::error::ModelarDbCommonError; use modelardb_types::error::ModelarDbTypesError; -use object_store::path::Error as ObjectStorePathError; use object_store::Error as ObjectStoreError; +use object_store::path::Error as ObjectStorePathError; use sqlparser::parser::ParserError; /// Result type used throughout `modelardb_storage`. diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 8cc17d118..be4fb70ee 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -40,8 +40,8 @@ use arrow_flight::{IpcMessage, SchemaAsIpc}; use bytes::{Buf, Bytes}; use datafusion::catalog::TableProvider; use datafusion::common::{DFSchema, ToDFSchema}; -use datafusion::execution::session_state::SessionStateBuilder; use datafusion::execution::SendableRecordBatchStream; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::parquet::arrow::async_reader::{ AsyncFileReader, ParquetObjectReader, ParquetRecordBatchStream, }; @@ -57,8 +57,8 @@ use deltalake::DeltaTable; use futures::StreamExt; use modelardb_types::schemas::TABLE_METADATA_SCHEMA; use modelardb_types::types::ErrorBound; -use object_store::path::Path; use object_store::ObjectStore; +use object_store::path::Path; use sqlparser::ast::Statement; use crate::error::{ModelarDbStorageError, Result}; diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 556c33748..8155c32f7 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ 
b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -414,8 +414,8 @@ mod test { assert!(result.is_err()); } - fn model_table_schema_error_bounds_and_generated_columns( - ) -> (Arc, Vec, Vec>) { + fn model_table_schema_error_bounds_and_generated_columns() + -> (Arc, Vec, Vec>) { ( Arc::new(Schema::new(vec![ Field::new("location", DataType::Utf8, false), diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index ec48af975..83064150a 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -24,7 +24,7 @@ use arrow::array::{Array, BinaryArray, BooleanArray, Float32Array, Int16Array, S use arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::{DFSchema, ToDFSchema}; use datafusion::logical_expr::lit; -use datafusion::prelude::{col, SessionContext}; +use datafusion::prelude::{SessionContext, col}; use modelardb_common::test::ERROR_BOUND_ZERO; use modelardb_types::types::ErrorBound; @@ -572,17 +572,21 @@ mod tests { .unwrap(); // Verify that the tables were created, registered, and has the expected columns. 
- assert!(metadata_manager - .session_context - .sql("SELECT table_name FROM normal_table_metadata") - .await - .is_ok()); + assert!( + metadata_manager + .session_context + .sql("SELECT table_name FROM normal_table_metadata") + .await + .is_ok() + ); - assert!(metadata_manager - .session_context - .sql("SELECT table_name, query_schema FROM model_table_metadata") - .await - .is_ok()); + assert!( + metadata_manager + .session_context + .sql("SELECT table_name, query_schema FROM model_table_metadata") + .await + .is_ok() + ); assert!(metadata_manager .session_context @@ -595,37 +599,45 @@ mod tests { #[tokio::test] async fn test_normal_table_is_normal_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - assert!(metadata_manager - .is_normal_table("normal_table_1") - .await - .unwrap()); + assert!( + metadata_manager + .is_normal_table("normal_table_1") + .await + .unwrap() + ); } #[tokio::test] async fn test_model_table_is_not_normal_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - assert!(!metadata_manager - .is_normal_table(test::MODEL_TABLE_NAME) - .await - .unwrap()); + assert!( + !metadata_manager + .is_normal_table(test::MODEL_TABLE_NAME) + .await + .unwrap() + ); } #[tokio::test] async fn test_model_table_is_model_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_model_table().await; - assert!(metadata_manager - .is_model_table(test::MODEL_TABLE_NAME) - .await - .unwrap()); + assert!( + metadata_manager + .is_model_table(test::MODEL_TABLE_NAME) + .await + .unwrap() + ); } #[tokio::test] async fn test_normal_table_is_not_model_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - assert!(!metadata_manager - .is_model_table("normal_table_1") - .await - .unwrap()); + assert!( + !metadata_manager + .is_model_table("normal_table_1") + .await + .unwrap() + ); } #[tokio::test] @@ -693,10 
+705,9 @@ mod tests { ); assert_eq!( **batch.column(1), - BinaryArray::from_vec(vec![&try_convert_schema_to_bytes( - &test::model_table_metadata().query_schema - ) - .unwrap()]) + BinaryArray::from_vec(vec![ + &try_convert_schema_to_bytes(&test::model_table_metadata().query_schema).unwrap() + ]) ); // Check that a row has been added to the model_table_field_columns table for each field column. @@ -771,10 +782,12 @@ mod tests { async fn test_drop_table_metadata_for_missing_table() { let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - assert!(metadata_manager - .drop_table_metadata("missing_table") - .await - .is_err()); + assert!( + metadata_manager + .drop_table_metadata("missing_table") + .await + .is_err() + ); } async fn create_metadata_manager_and_save_normal_tables() -> (TempDir, TableMetadataManager) { diff --git a/crates/modelardb_storage/src/parser.rs b/crates/modelardb_storage/src/parser.rs index 8d8eb8d8d..2731c2812 100644 --- a/crates/modelardb_storage/src/parser.rs +++ b/crates/modelardb_storage/src/parser.rs @@ -31,8 +31,8 @@ use datafusion::execution::context::ExecutionProps; use datafusion::functions; use datafusion::logical_expr::{AggregateUDF, Expr as DFExpr, ScalarUDF, TableSource, WindowUDF}; use datafusion::physical_expr::planner; -use datafusion::sql::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion::sql::TableReference; +use datafusion::sql::planner::{ContextProvider, PlannerContext, SqlToRel}; use modelardb_types::functions::normalize_name; // Fully imported to not conflict. 
use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound}; use sqlparser::ast::{ @@ -42,7 +42,7 @@ use sqlparser::ast::{ TruncateTableTarget, Value, }; use sqlparser::dialect::{Dialect, GenericDialect}; -use sqlparser::keywords::{Keyword, ALL_KEYWORDS}; +use sqlparser::keywords::{ALL_KEYWORDS, Keyword}; use sqlparser::parser::{Parser, ParserError}; use sqlparser::tokenizer::{Span, Token}; @@ -268,7 +268,7 @@ impl ModelarDbDialect { column_type => { return Err(ParserError::ParserError(format!( "Expected TIMESTAMP, FIELD, or TAG, found: {column_type}." - ))) + ))); } }; @@ -870,7 +870,7 @@ fn column_defs_to_model_table_query_schema( "{option} is not supported in model tables." )), None, - )) + )); } } } @@ -884,7 +884,7 @@ fn column_defs_to_model_table_query_schema( "{data_type} is not supported in model tables." )), None, - )) + )); } }; @@ -1161,60 +1161,74 @@ mod tests { #[test] fn test_tokenize_and_parse_create_model_table_without_create() { - assert!(tokenize_and_parse_sql_statement( - "MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_create_model_space() { - assert!(tokenize_and_parse_sql_statement( - "CREATEMODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATEMODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_model() { // Tracks if sqlparser at some point can parse fields/tags in a TABLE. 
- assert!(tokenize_and_parse_sql_statement( - "CREATE TABLE table_name(timestamp TIMESTAMP, field FIELD, field_one FIELD(10.5), + assert!( + tokenize_and_parse_sql_statement( + "CREATE TABLE table_name(timestamp TIMESTAMP, field FIELD, field_one FIELD(10.5), field_two FIELD(1%), tag TAG)", - ) - .is_err()); + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_model_table_space() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODELTABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODELTABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_table_name() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE(timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE(timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_table_table_name_space() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLEtable_name(timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLEtable_name(timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_start_parentheses() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name timestamp TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name timestamp TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] @@ -1227,59 +1241,73 @@ mod tests { #[test] fn test_tokenize_and_parse_create_model_table_with_sql_types() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp 
TIMESTAMP, field REAL, tag VARCHAR)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field REAL, tag VARCHAR)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_column_name() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(TIMESTAMP, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(TIMESTAMP, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_with_generated_timestamps() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP AS (37), field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP AS (37), field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_with_generated_tags() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG AS (37))", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG AS (37))", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_with_generated_fields_without_parentheses() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS 37, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS 37, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_with_generated_fields_without_start_parentheses() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS 37), tag TAG)", - ) - .is_err()); + assert!( + 
tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS 37), tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_with_generated_fields_without_end_parentheses() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS (37, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD AS (37, tag TAG)", + ) + .is_err() + ); } #[test] @@ -1302,26 +1330,32 @@ mod tests { #[test] fn test_tokenize_and_parse_create_model_table_without_column_type() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp, field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp, field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_comma() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP field FIELD, tag TAG)", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP field FIELD, tag TAG)", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_create_model_table_without_end_parentheses() { - assert!(tokenize_and_parse_sql_statement( - "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "CREATE MODEL TABLE table_name(timestamp TIMESTAMP, field FIELD, tag TAG", + ) + .is_err() + ); } #[test] @@ -1456,28 +1490,34 @@ mod tests { #[test] fn test_tokenize_and_parse_settings_with_click_house_dialect() { - assert!(Parser::parse_sql( - &ClickHouseDialect {}, - "SELECT * FROM table_name SETTINGS convert_query_to_cnf = true" + assert!( + Parser::parse_sql( + &ClickHouseDialect {}, + "SELECT * FROM table_name SETTINGS 
convert_query_to_cnf = true" + ) + .is_ok() ) - .is_ok()) } #[test] fn test_tokenize_and_parse_settings_with_modelardb_dialect() { - assert!(Parser::parse_sql( - &ModelarDbDialect::new(), - "SELECT * FROM table_name SETTINGS convert_query_to_cnf = true" + assert!( + Parser::parse_sql( + &ModelarDbDialect::new(), + "SELECT * FROM table_name SETTINGS convert_query_to_cnf = true" + ) + .is_err() ) - .is_err()) } #[test] fn test_tokenize_and_parse_include_one_address_select() { - assert!(tokenize_and_parse_sql_statement( - "INCLUDE 'grpc://192.168.1.2:9999' SELECT * FROM table_name", - ) - .is_ok()); + assert!( + tokenize_and_parse_sql_statement( + "INCLUDE 'grpc://192.168.1.2:9999' SELECT * FROM table_name", + ) + .is_ok() + ); } #[test] @@ -1490,18 +1530,20 @@ mod tests { #[test] fn test_tokenize_and_parse_include_one_double_quoted_address_select() { - assert!(tokenize_and_parse_sql_statement( - "INCLUDE \"grpc://192.168.1.2:9999\" SELECT * FROM table_name", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement( + "INCLUDE \"grpc://192.168.1.2:9999\" SELECT * FROM table_name", + ) + .is_err() + ); } #[test] fn test_tokenize_and_parse_one_address_select() { - assert!(tokenize_and_parse_sql_statement( - "'grpc://192.168.1.2:9999' SELECT * FROM table_name", - ) - .is_err()); + assert!( + tokenize_and_parse_sql_statement("'grpc://192.168.1.2:9999' SELECT * FROM table_name",) + .is_err() + ); } #[test] diff --git a/crates/modelardb_storage/src/query/generated_as_exec.rs b/crates/modelardb_storage/src/query/generated_as_exec.rs index df6ab5157..0d5d1591b 100644 --- a/crates/modelardb_storage/src/query/generated_as_exec.rs +++ b/crates/modelardb_storage/src/query/generated_as_exec.rs @@ -36,8 +36,8 @@ use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use futures::stream::Stream; use futures::StreamExt; +use 
futures::stream::Stream; use modelardb_types::types::{TimestampArray, ValueArray}; /// A column the [`GeneratedAsExec`] must add to each of the [`RecordBatches`](RecordBatch) using diff --git a/crates/modelardb_storage/src/query/metadata_table.rs b/crates/modelardb_storage/src/query/metadata_table.rs index 89e11b79a..fee2571b6 100644 --- a/crates/modelardb_storage/src/query/metadata_table.rs +++ b/crates/modelardb_storage/src/query/metadata_table.rs @@ -24,7 +24,7 @@ use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::logical_expr::Expr; use datafusion::physical_plan::ExecutionPlan; -use deltalake::{arrow::datatypes::SchemaRef, DeltaTable}; +use deltalake::{DeltaTable, arrow::datatypes::SchemaRef}; use tonic::async_trait; /// A queryable representation of a metadata table. [`MetadataTable`] wraps the [`TableProvider`] of diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 9eddfeac3..4eeb9c2b0 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -38,10 +38,10 @@ use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::ExecutionProps; use datafusion::logical_expr::dml::InsertOp; -use datafusion::logical_expr::{self, utils, BinaryExpr, Expr, Operator}; +use datafusion::logical_expr::{self, BinaryExpr, Expr, Operator, utils}; use datafusion::physical_expr::expressions::Column; use datafusion::physical_expr::{ - planner, LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement, + LexOrdering, LexRequirement, PhysicalSortExpr, PhysicalSortRequirement, planner, }; use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr}; diff --git 
a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index f8af954ed..191714a24 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -29,7 +29,7 @@ use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown}; use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, Statistics}; -use deltalake::{arrow::datatypes::SchemaRef, DeltaTable}; +use deltalake::{DeltaTable, arrow::datatypes::SchemaRef}; use tonic::async_trait; /// A queryable representation of a normal table. [`NormalTable`] wraps the [`TableProvider`] diff --git a/crates/modelardb_storage/src/test.rs b/crates/modelardb_storage/src/test.rs index c6df86aed..25d2eb368 100644 --- a/crates/modelardb_storage/src/test.rs +++ b/crates/modelardb_storage/src/test.rs @@ -17,7 +17,7 @@ use std::sync::Arc; -use arrow::array::{BinaryArray, Float32Array, RecordBatch, StringArray, UInt16Array, UInt8Array}; +use arrow::array::{BinaryArray, Float32Array, RecordBatch, StringArray, UInt8Array, UInt16Array}; use arrow::compute::concat_batches; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use modelardb_common::test::{ERROR_BOUND_FIVE, ERROR_BOUND_ONE, ERROR_BOUND_ZERO}; From 950a9a56023b466b6439fcc6dd62f5efda6d7148 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 27 Feb 2025 22:14:38 +0100 Subject: [PATCH 61/69] Change QuerySchema to GridSchema to match schema name --- .../src/storage/uncompressed_data_manager.rs | 4 ++-- crates/modelardb_storage/src/parser.rs | 4 ++-- crates/modelardb_storage/src/query/model_table.rs | 4 ++-- crates/modelardb_types/src/schemas.rs | 8 ++++---- crates/modelardb_types/src/types.rs | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git 
a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 31f67a8ac..c6993dd80 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -1023,13 +1023,13 @@ mod tests { // message inserted would block the thread until the data messages have been processed. let number_of_buffers = reserved_memory / uncompressed_data_buffer::compute_memory_size(number_of_fields); - for tag_hash in 0..number_of_buffers { + for tag_value in 0..number_of_buffers { // Allocate many buffers that are never finished. insert_data_points( 1, &mut data_manager, &model_table_metadata.clone(), - &tag_hash.to_string(), + &tag_value.to_string(), ) .await; } diff --git a/crates/modelardb_storage/src/parser.rs b/crates/modelardb_storage/src/parser.rs index 2731c2812..144c4ea9b 100644 --- a/crates/modelardb_storage/src/parser.rs +++ b/crates/modelardb_storage/src/parser.rs @@ -1184,8 +1184,8 @@ mod tests { // Tracks if sqlparser at some point can parse fields/tags in a TABLE. assert!( tokenize_and_parse_sql_statement( - "CREATE TABLE table_name(timestamp TIMESTAMP, field FIELD, field_one FIELD(10.5), - field_two FIELD(1%), tag TAG)", + "CREATE TABLE table_name(timestamp TIMESTAMP, field FIELD, + field_one FIELD(10.5), field_two FIELD(1%), tag TAG)", ) .is_err() ); diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 4eeb9c2b0..e3d40e8fe 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -217,10 +217,10 @@ fn query_order_and_requirement( let tag_column_name = model_table_metadata.schema.field(*index).name(); // unwrap() is safe as the tag columns are always present in the schema. 
- let segment_index = schema.index_of(tag_column_name).unwrap(); + let schema_index = schema.index_of(tag_column_name).unwrap(); physical_sort_exprs.push(PhysicalSortExpr { - expr: Arc::new(Column::new(tag_column_name, segment_index)), + expr: Arc::new(Column::new(tag_column_name, schema_index)), options: sort_options, }); } diff --git a/crates/modelardb_types/src/schemas.rs b/crates/modelardb_types/src/schemas.rs index 7963bdc3e..c819a9185 100644 --- a/crates/modelardb_types/src/schemas.rs +++ b/crates/modelardb_types/src/schemas.rs @@ -21,8 +21,8 @@ use std::sync::LazyLock; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; use crate::types::{ - ArrowTimestamp, ArrowValue, CompressedSchema, ConfigurationSchema, QueryCompressedSchema, - QuerySchema, TableMetadataSchema, + ArrowTimestamp, ArrowValue, CompressedSchema, ConfigurationSchema, GridSchema, + QueryCompressedSchema, TableMetadataSchema, }; /// Name of the column used to partition the compressed segments. @@ -65,8 +65,8 @@ pub static COMPRESSED_METADATA_SIZE_IN_BYTES: LazyLock = LazyLock::new(|| }); /// [`RecordBatch`](arrow::record_batch::RecordBatch) [`Schema`] used internally during query processing. 
-pub static GRID_SCHEMA: LazyLock = LazyLock::new(|| { - QuerySchema(Arc::new(Schema::new(vec![ +pub static GRID_SCHEMA: LazyLock = LazyLock::new(|| { + GridSchema(Arc::new(Schema::new(vec![ Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), Field::new("value", ArrowValue::DATA_TYPE, false), ]))) diff --git a/crates/modelardb_types/src/types.rs b/crates/modelardb_types/src/types.rs index 851060886..141676212 100644 --- a/crates/modelardb_types/src/types.rs +++ b/crates/modelardb_types/src/types.rs @@ -48,7 +48,7 @@ pub struct CompressedSchema(pub arrow::datatypes::SchemaRef); pub struct QueryCompressedSchema(pub arrow::datatypes::SchemaRef); #[derive(Clone)] -pub struct QuerySchema(pub arrow::datatypes::SchemaRef); +pub struct GridSchema(pub arrow::datatypes::SchemaRef); #[derive(Clone)] pub struct ConfigurationSchema(pub arrow::datatypes::SchemaRef); From c777e3dd1765cde205c2668f09138152d97d9fd2 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Thu, 27 Feb 2025 22:30:40 +0100 Subject: [PATCH 62/69] Fix cargo doc issue --- crates/modelardb_compression/src/compression.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 1a524ff19..a443e8e59 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -41,7 +41,7 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// Assumes `uncompressed_timestamps` and `uncompressed_values` are sorted according to /// `uncompressed_timestamps`. Returns [`ModelarDbCompressionError`] if `uncompressed_timestamps` /// and `uncompressed_values` have different lengths, otherwise the resulting compressed segments -/// are returned as a [`RecordBatch`] with the [`compressed_schema`] schema. +/// are returned as a [`RecordBatch`] with the `compressed_schema` schema. 
pub fn try_compress( compressed_schema: Arc, tag_values: Vec, From 118c6ce63c96e8d44fabc0d186dbbbbb9dde0cdc Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Fri, 28 Feb 2025 18:05:43 +0100 Subject: [PATCH 63/69] Update based on comments from @chrthomsen --- crates/modelardb_server/src/storage/data_sinks.rs | 4 ++-- .../src/storage/uncompressed_data_manager.rs | 7 +++---- crates/modelardb_storage/src/query/sorted_join_exec.rs | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/crates/modelardb_server/src/storage/data_sinks.rs b/crates/modelardb_server/src/storage/data_sinks.rs index 57a98d601..86743b361 100644 --- a/crates/modelardb_server/src/storage/data_sinks.rs +++ b/crates/modelardb_server/src/storage/data_sinks.rs @@ -153,8 +153,8 @@ impl DataSink for ModelTableDataSink { let record_batch = record_batch?.project(&self.model_table_metadata.query_schema_to_schema)?; - // Manually ensure the fields are not nullable. It is not possible to insert null values - // into model tables but the schema of the record batch may contain nullable fields. + // Ensure the fields are not nullable. It is not possible to insert null values into + // model tables but the schema of the record batch may contain nullable fields. 
let mut fields: Vec = Vec::with_capacity(record_batch.schema().fields.len()); for field in record_batch.schema().fields() { fields.push(Field::new(field.name(), field.data_type().clone(), false)); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index c6993dd80..d38d01eae 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -176,8 +176,7 @@ impl UncompressedDataManager { let (timestamp_column_array, field_column_arrays, tag_column_arrays) = model_table_metadata.column_arrays(&data_points)?; - // For each data point, compute a hash from the tags and pass the fields to the storage - // engine so they can be added to the appropriate UncompressedDataBuffer. + // For each data point, insert the timestamp and values into the corresponding UncompressedDataBuffer. for (index, timestamp) in timestamp_column_array.iter().enumerate() { let tag_values: Vec = tag_column_arrays .iter() @@ -635,8 +634,8 @@ impl UncompressedDataManager { } } -/// Calculate a unique hash for a specific combination of `table_name` and `tag_values`. The hash -/// can be used to identify a specific multivariate time series during ingestion. +/// Calculate a hash for a combination of `table_name` and `tag_values`. The hash can be used to +/// identify a specific multivariate time series during ingestion. 
fn calculate_tag_hash(table_name: &str, tag_values: &[String]) -> u64 { let mut hash_data = tag_values.to_vec(); hash_data.push(table_name.to_string()); diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index ffb9a255f..c2ff0ae88 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -49,7 +49,7 @@ pub(crate) enum SortedJoinColumnType { Tag(String), } -/// An execution plan that join arrays of data points sorted by tag columns and `timestamp` from +/// An execution plan that joins arrays of data points sorted by tag columns and `timestamp` from /// multiple execution plans. It is `pub(crate)` so the additional rules added to Apache /// DataFusion's physical optimizer can pattern match on it. #[derive(Debug)] From 548b0ae571ba194b12dd5f1bf844dbe45b9f84c6 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 12:22:29 +0100 Subject: [PATCH 64/69] Change order of arguments in try_compress() --- .../modelardb_compression/src/compression.rs | 45 ++++++++++--------- .../modelardb_compression/src/models/swing.rs | 6 +-- crates/modelardb_compression/src/types.rs | 6 +-- .../src/storage/uncompressed_data_buffer.rs | 4 +- .../src/storage/uncompressed_data_manager.rs | 6 +-- .../tests/integration_test.rs | 4 +- 6 files changed, 37 insertions(+), 34 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index a443e8e59..0812c7b9c 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -39,16 +39,19 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// regular and delta-of-deltas followed by a variable length binary encoding if irregular. /// `uncompressed_values` is compressed within `error_bound` using the model types in `models`. 
/// Assumes `uncompressed_timestamps` and `uncompressed_values` are sorted according to -/// `uncompressed_timestamps`. Returns [`ModelarDbCompressionError`] if `uncompressed_timestamps` -/// and `uncompressed_values` have different lengths, otherwise the resulting compressed segments -/// are returned as a [`RecordBatch`] with the `compressed_schema` schema. +/// `uncompressed_timestamps`. The resulting compressed segments have the schema in `compressed_schema` +/// with the tag columns populated by the values in `tag_values` and the field column index populated +/// by `field_column_index`. Returns [`ModelarDbCompressionError`] if `uncompressed_timestamps` and +/// `uncompressed_values` have different lengths or if `compressed_schema` is not a valid schema for +/// compressed segments, otherwise the resulting compressed segments are returned as a +/// [`RecordBatch`] with the `compressed_schema` schema. pub fn try_compress( + uncompressed_timestamps: &TimestampArray, + uncompressed_values: &ValueArray, + error_bound: ErrorBound, compressed_schema: Arc, tag_values: Vec, field_column_index: u16, - error_bound: ErrorBound, - uncompressed_timestamps: &TimestampArray, - uncompressed_values: &ValueArray, ) -> Result { // The uncompressed data must be passed as arrays instead of a RecordBatch as a TimestampArray // and a ValueArray is the only supported input. 
However, as a result it is necessary to verify @@ -276,12 +279,12 @@ mod tests { #[test] fn test_try_compress_empty_time_series_within_absolute_error_bound_zero() { let compressed_record_batch = try_compress( + &TimestampBuilder::new().finish(), + &ValueBuilder::new().finish(), + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), compressed_schema(), vec![TAG_VALUE.to_owned()], 0, - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - &TimestampBuilder::new().finish(), - &ValueBuilder::new().finish(), ) .unwrap(); assert_eq!(0, compressed_record_batch.num_rows()); @@ -290,12 +293,12 @@ mod tests { #[test] fn test_try_compress_empty_time_series_within_relative_error_bound_zero() { let compressed_record_batch = try_compress( + &TimestampBuilder::new().finish(), + &ValueBuilder::new().finish(), + ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), compressed_schema(), vec![TAG_VALUE.to_owned()], 0, - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - &TimestampBuilder::new().finish(), - &ValueBuilder::new().finish(), ) .unwrap(); assert_eq!(0, compressed_record_batch.num_rows()); @@ -512,12 +515,12 @@ mod tests { data_generation::generate_values(uncompressed_timestamps.values(), values_structure); let compressed_record_batch = try_compress( + &uncompressed_timestamps, + &uncompressed_values, + error_bound, compressed_schema(), vec![TAG_VALUE.to_owned()], 0, - error_bound, - &uncompressed_timestamps, - &uncompressed_values, ) .unwrap(); @@ -662,12 +665,12 @@ mod tests { assert_eq!(uncompressed_timestamps.len(), uncompressed_values.len()); let compressed_record_batch = try_compress( + &uncompressed_timestamps, + &uncompressed_values, + error_bound, compressed_schema(), vec![TAG_VALUE.to_owned()], 0, - error_bound, - &uncompressed_timestamps, - &uncompressed_values, ) .unwrap(); @@ -878,12 +881,12 @@ mod tests { ); let compressed_record_batch = try_compress( + &uncompressed_timestamps, + &uncompressed_values, + error_bound, compressed_schema(), 
vec![TAG_VALUE.to_owned()], 0, - error_bound, - &uncompressed_timestamps, - &uncompressed_values, ) .unwrap(); diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 270d4e497..c7dab2407 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -850,12 +850,12 @@ mod tests { let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); let segments = crate::try_compress( + ×tamps, + &values, + error_bound, compressed_schema, vec!["tag".to_owned()], 0, - error_bound, - ×tamps, - &values, ) .unwrap(); diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index 74aebb2cf..30b74e3a7 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -15,7 +15,7 @@ //! The types used throughout the crate. -use std::debug_assert; +use std::{debug_assert, iter}; use std::sync::Arc; use arrow::array::{ @@ -481,7 +481,7 @@ impl CompressedSegmentBatchBuilder { /// Return [`RecordBatch`] of compressed segments and consume the builder. 
pub(crate) fn finish(mut self) -> RecordBatch { let batch_length = self.model_type_ids.len(); - let field_column_array: UInt16Array = std::iter::repeat(self.field_column_index) + let field_column_array: UInt16Array = iter::repeat(self.field_column_index) .take(batch_length) .collect(); @@ -499,7 +499,7 @@ impl CompressedSegmentBatchBuilder { ]; for tag_value in &self.tag_values { - let tag_array: StringArray = std::iter::repeat(Some(tag_value)) + let tag_array: StringArray = iter::repeat(Some(tag_value)) .take(batch_length) .collect(); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 4ad3f7e18..9d3fb8b3c 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -19,7 +19,7 @@ //! support for storing uncompressed data points in Apache Parquet files on disk. use std::fmt::{Debug, Formatter, Result as FmtResult}; -use std::mem; +use std::{iter, mem}; use std::sync::Arc; use datafusion::arrow::array::{Array, ArrayBuilder, StringArray}; @@ -173,7 +173,7 @@ impl UncompressedInMemoryDataBuffer { } else if self.model_table_metadata.is_tag(column_index) { // The tag value is the same for each data point so it is not sorted. 
let tag_value = self.tag_values[tag_column_index].clone(); - let tag_array: StringArray = std::iter::repeat(Some(tag_value)) + let tag_array: StringArray = iter::repeat(Some(tag_value)) .take(buffer_length) .collect(); columns.push(Arc::new(tag_array)); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index d38d01eae..9ed131928 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -588,12 +588,12 @@ impl UncompressedDataManager { // unwrap() is safe as uncompressed_timestamps and uncompressed_values have the same length. modelardb_compression::try_compress( + uncompressed_timestamps, + uncompressed_values, + error_bound, model_table_metadata.compressed_schema.clone(), tag_values.clone(), *field_column_index as u16, - error_bound, - uncompressed_timestamps, - uncompressed_values, ) .unwrap() }) diff --git a/crates/modelardb_server/tests/integration_test.rs b/crates/modelardb_server/tests/integration_test.rs index e123651cb..209dfeb6f 100644 --- a/crates/modelardb_server/tests/integration_test.rs +++ b/crates/modelardb_server/tests/integration_test.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use std::error::Error; use std::io::Read; -use std::iter::repeat; +use std::iter; use std::ops::Range; use std::process::{Child, Command, Stdio}; use std::str; @@ -331,7 +331,7 @@ impl TestContext { if let Some(tag) = maybe_tag { fields.push(Field::new("tag", DataType::Utf8, false)); columns.push(Arc::new(StringArray::from_iter_values( - repeat(tag).take(time_series_len), + iter::repeat(tag).take(time_series_len), ))); } From 3d30ede5097b52eb63671e2737920bf0262c393f Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 14:43:34 +0100 Subject: [PATCH 65/69] Update method for calculating tag hash --- 
crates/modelardb_compression/src/types.rs | 30 ++++++++----------- crates/modelardb_manager/src/remote.rs | 6 ++-- .../src/storage/compressed_data_manager.rs | 5 ++-- .../src/storage/data_sinks.rs | 21 ++++++------- .../src/storage/uncompressed_data_buffer.rs | 2 +- .../src/storage/uncompressed_data_manager.rs | 17 ++++++----- 6 files changed, 38 insertions(+), 43 deletions(-) diff --git a/crates/modelardb_compression/src/types.rs b/crates/modelardb_compression/src/types.rs index 30b74e3a7..4b4bde0a7 100644 --- a/crates/modelardb_compression/src/types.rs +++ b/crates/modelardb_compression/src/types.rs @@ -15,8 +15,8 @@ //! The types used throughout the crate. -use std::{debug_assert, iter}; use std::sync::Arc; +use std::{debug_assert, iter}; use arrow::array::{ ArrayBuilder, ArrayRef, BinaryBuilder, Float32Builder, StringArray, UInt8Builder, UInt16Array, @@ -485,24 +485,20 @@ impl CompressedSegmentBatchBuilder { .take(batch_length) .collect(); - let mut columns: Vec = vec![ - Arc::new(self.model_type_ids.finish()), - Arc::new(self.start_times.finish()), - Arc::new(self.end_times.finish()), - Arc::new(self.timestamps.finish()), - Arc::new(self.min_values.finish()), - Arc::new(self.max_values.finish()), - Arc::new(self.values.finish()), - Arc::new(self.residuals.finish()), - Arc::new(self.error.finish()), - Arc::new(field_column_array), - ]; + let mut columns: Vec = Vec::with_capacity(self.compressed_schema.fields.len()); + columns.push(Arc::new(self.model_type_ids.finish())); + columns.push(Arc::new(self.start_times.finish())); + columns.push(Arc::new(self.end_times.finish())); + columns.push(Arc::new(self.timestamps.finish())); + columns.push(Arc::new(self.min_values.finish())); + columns.push(Arc::new(self.max_values.finish())); + columns.push(Arc::new(self.values.finish())); + columns.push(Arc::new(self.residuals.finish())); + columns.push(Arc::new(self.error.finish())); + columns.push(Arc::new(field_column_array)); for tag_value in &self.tag_values { - let 
tag_array: StringArray = iter::repeat(Some(tag_value)) - .take(batch_length) - .collect(); - + let tag_array: StringArray = iter::repeat(Some(tag_value)).take(batch_length).collect(); columns.push(Arc::new(tag_array)); } diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index fd0d5b32e..cb10451c0 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -282,9 +282,9 @@ impl FlightServiceHandler { Ok(()) } - /// Truncate the table in the data Delta Lake and in each node controlled by the manager. If the - /// table does not exist or the table cannot be truncated in the remote data folder and in each - /// node, return [`Status`]. + /// Truncate the table in the remote data folder and at each node controlled by the manager. If + /// the table does not exist or the table cannot be truncated in the remote data folder and at + /// each node, return [`Status`]. async fn truncate_cluster_table(&self, table_name: &str) -> StdResult<(), Status> { if self.check_if_table_exists(table_name).await.is_ok() { return Err(Status::invalid_argument(format!( diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index d84163976..6301926d7 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -130,8 +130,9 @@ impl CompressedDataManager { } /// Insert `compressed_segment_batch` into the in-memory [`CompressedDataBuffer`] for the model - /// table. If `compressed_segment_batch` is inserted successfully, return [`Ok`], otherwise - /// return [`ModelarDbServerError`](crate::error::ModelarDbServerError). + /// table. If inserting `compressed_segment_batch` exceeded the reserved memory limit, save + /// compressed data to disk until enough memory is available. 
If compressed data could not be + /// saved to disk, return [`ModelarDbServerError`](crate::error::ModelarDbServerError). async fn insert_compressed_segments( &self, compressed_segment_batch: CompressedSegmentBatch, diff --git a/crates/modelardb_server/src/storage/data_sinks.rs b/crates/modelardb_server/src/storage/data_sinks.rs index 86743b361..b094f5f54 100644 --- a/crates/modelardb_server/src/storage/data_sinks.rs +++ b/crates/modelardb_server/src/storage/data_sinks.rs @@ -21,7 +21,6 @@ use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::sync::Arc; use async_trait::async_trait; -use datafusion::arrow::datatypes::{Field, Schema}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::TaskContext; @@ -106,9 +105,9 @@ impl DisplayAs for NormalTableDataSink { } } -/// [`DataSink`] that writes [`RecordBatches`](datafusion::arrow::record_batch::RecordBatch) -/// containing multivariate time series to [`StorageEngine`]. Assumes the generated columns are -/// included, thus they are dropped without checking the schema. +/// [`DataSink`] that writes [`RecordBatches`](RecordBatch) containing multivariate time series to +/// [`StorageEngine`]. Assumes the generated columns are included, thus they are dropped without +/// checking the schema. pub struct ModelTableDataSink { /// Metadata for the model table inserted data will be written to. model_table_metadata: Arc, @@ -150,18 +149,16 @@ impl DataSink for ModelTableDataSink { let mut data_points_inserted: u64 = 0; while let Some(record_batch) = data.next().await { + // Remove the generated columns from the record batch. The generated columns must be + // part of the inserted data since Apache DataFusion checks it before passing it to + // write_all(). let record_batch = record_batch?.project(&self.model_table_metadata.query_schema_to_schema)?; - // Ensure the fields are not nullable. 
It is not possible to insert null values into - // model tables but the schema of the record batch may contain nullable fields. - let mut fields: Vec = Vec::with_capacity(record_batch.schema().fields.len()); - for field in record_batch.schema().fields() { - fields.push(Field::new(field.name(), field.data_type().clone(), false)); - } - + // Create a new record batch with the schema of the model table to fix the problem where + // the schema of the inserted data has nullable fields. let record_batch = RecordBatch::try_new( - Arc::new(Schema::new(fields)), + self.model_table_metadata.schema.clone(), record_batch.columns().to_vec(), )?; diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 9d3fb8b3c..4e60815b1 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -358,7 +358,7 @@ impl UncompressedOnDiskDataBuffer { let tag_values: Vec = tag_column_arrays .iter() - .map(|array| array.value(0).to_string()) + .map(|array| array.value(0).to_owned()) .collect(); let mut in_memory_buffer = UncompressedInMemoryDataBuffer::new( diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 9ed131928..739631ba7 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -180,7 +180,7 @@ impl UncompressedDataManager { for (index, timestamp) in timestamp_column_array.iter().enumerate() { let tag_values: Vec = tag_column_arrays .iter() - .map(|array| array.value(index).to_string()) + .map(|array| array.value(index).to_owned()) .collect(); let mut values = field_column_arrays.iter().map(|array| array.value(index)); @@ -577,7 +577,7 @@ impl UncompressedDataManager { let tag_values: Vec = tag_column_arrays .iter() - 
.map(|array| array.value(0).to_string()) + .map(|array| array.value(0).to_owned()) .collect(); let compressed_segments = field_column_arrays @@ -634,14 +634,15 @@ impl UncompressedDataManager { } } -/// Calculate a hash for a combination of `table_name` and `tag_values`. The hash can be used to +/// Calculate a hash for a combination of `table_name` and `tag_values`. The hash is used to /// identify a specific multivariate time series during ingestion. fn calculate_tag_hash(table_name: &str, tag_values: &[String]) -> u64 { - let mut hash_data = tag_values.to_vec(); - hash_data.push(table_name.to_string()); - let mut hasher = DefaultHasher::new(); - hasher.write(hash_data.join(";").as_bytes()); + for tag_value in tag_values { + hasher.write(tag_value.as_bytes()); + } + + hasher.write(table_name.as_bytes()); hasher.finish() } @@ -668,7 +669,7 @@ mod tests { use crate::{ClusterMode, DataFolders}; const TAG_VALUE: &str = "tag"; - const TAG_HASH: u64 = 15537859409877038916; + const TAG_HASH: u64 = 10828528714290431980; // Tests for UncompressedDataManager. #[tokio::test] From 03289226d3789986e62c155afc45e5f81ced2895 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 15:15:03 +0100 Subject: [PATCH 66/69] Add limitation on number of model table fields back --- .../src/metadata/model_table_metadata.rs | 28 +++++++++++++------ .../src/metadata/table_metadata_manager.rs | 1 + 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/crates/modelardb_storage/src/metadata/model_table_metadata.rs b/crates/modelardb_storage/src/metadata/model_table_metadata.rs index 8155c32f7..a4348bcab 100644 --- a/crates/modelardb_storage/src/metadata/model_table_metadata.rs +++ b/crates/modelardb_storage/src/metadata/model_table_metadata.rs @@ -36,8 +36,6 @@ use crate::parser::tokenize_and_parse_sql_expression; pub struct ModelTableMetadata { /// Name of the model table. 
pub name: String, - /// Schema of the data that can be written to the model table. - pub schema: Arc, /// Index of the timestamp column in `schema`. pub timestamp_column_index: usize, /// Indices of the field columns in `schema`. @@ -46,13 +44,15 @@ pub struct ModelTableMetadata { pub tag_column_indices: Vec, /// Error bounds of the columns in `schema`. It can only be non-zero for field columns. pub error_bounds: Vec, + /// Expressions to create generated columns in the `query_schema`. Only field columns can be + /// generated by [`Expr`], so [`None`] is stored for timestamp, tag, and stored field columns. + pub generated_columns: Vec>, + /// Schema of the data that can be written to the model table. + pub schema: Arc, /// Schema of the data that can be read from the model table. pub query_schema: Arc, /// Projection that changes `query_schema` to `schema`. pub query_schema_to_schema: Vec, - /// Expressions to create generated columns in the `query_schema`. Only field columns can be - /// generated by [`Expr`], so [`None`] is stored for timestamp, tag, and stored field columns. - pub generated_columns: Vec>, /// Schema of the compressed segments that are stored in the model table. pub compressed_schema: Arc, } @@ -63,6 +63,7 @@ impl ModelTableMetadata { /// * The number of error bounds does not match the number of columns. /// * The number of potentially generated columns does not match the number of columns. /// * A generated column includes another generated column in its expression. + /// * There are more than 32767 columns. /// * The `query_schema` does not include a single timestamp column. /// * The `query_schema` does not include at least one stored field column. pub fn try_new( @@ -97,6 +98,14 @@ impl ModelTableMetadata { } } + // If there are more than 32767 columns, return an error. This limitation is necessary since + // 16 bits are used for the field column index in the compressed segments. 
+ if query_schema.fields.len() > 32767 { + return Err(ModelarDbStorageError::InvalidArgument( + "There cannot be more than 32767 columns in the model table.".to_owned(), + )); + } + // Remove the generated field columns from the query schema and the error bounds as these // columns should never be provided when inserting data points into the model table. let mut fields_without_generated = Vec::with_capacity(query_schema.fields().len()); @@ -145,7 +154,10 @@ impl ModelTableMetadata { compute_indices_of_columns_with_data_type(&schema_without_generated, DataType::Utf8); // Add the tag columns to the base schema for compressed segments. - let mut compressed_schema_fields = COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + let mut compressed_schema_fields = + Vec::with_capacity(COMPRESSED_SCHEMA.0.fields().len() + tag_column_indices.len()); + compressed_schema_fields.extend(COMPRESSED_SCHEMA.0.fields.clone().to_vec()); + for index in &tag_column_indices { compressed_schema_fields.push(Arc::new(schema_without_generated.field(*index).clone())); } @@ -154,14 +166,14 @@ impl ModelTableMetadata { Ok(Self { name, - schema: schema_without_generated, timestamp_column_index: timestamp_column_indices[0], field_column_indices, tag_column_indices, error_bounds: error_bounds_without_generated, + generated_columns, + schema: schema_without_generated, query_schema, query_schema_to_schema: field_indices_without_generated, - generated_columns, compressed_schema, }) } diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 83064150a..ad7ea426d 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -322,6 +322,7 @@ impl TableMetadataManager { (0.0, false) }; + // query_schema_index is simply cast as a model table contains at most 32767 columns. 
self.delta_lake .write_columns_to_metadata_table( "model_table_field_columns", From 26afb727ee8d5ed3cacf63cae592771b8dc27e34 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 17:24:16 +0100 Subject: [PATCH 67/69] Rename Apache Arrow DataFusion to Apache DataFusion --- README.md | 2 +- crates/modelardb_storage/src/delta_lake.rs | 2 +- .../src/optimizer/model_simple_aggregates.rs | 8 +++---- .../src/query/generated_as_exec.rs | 6 ++--- .../modelardb_storage/src/query/grid_exec.rs | 20 ++++++++-------- .../src/query/metadata_table.rs | 4 ++-- .../src/query/model_table.rs | 23 +++++++++++++------ .../src/query/normal_table.rs | 2 +- .../src/query/sorted_join_exec.rs | 8 +++---- 9 files changed, 43 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 9e64e9894..92d06ac90 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ModelarDB is designed to be cross-platform and is currently automatically tested through [GitHub Actions](https://github.com/ModelarData/ModelarDB-RS/actions). It is also known to work on FreeBSD which is [currently not supported by GitHub Actions](https://github.com/actions/runner/issues/385). It is implemented in [Rust](https://www.rust-lang.org/) and uses [Apache Arrow Flight](https://github.com/apache/arrow-rs/tree/master/arrow-flight) -for communicating with clients, [Apache Arrow DataFusion](https://github.com/apache/arrow-datafusion) as its query +for communicating with clients, [Apache DataFusion](https://github.com/apache/datafusion) as its query engine, [Apache Arrow](https://github.com/apache/arrow-rs) as its in-memory data format, and [Apache Parquet](https://github.com/apache/arrow-rs/tree/master/parquet) as its on-disk data format. 
diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 2b810176a..74de81d33 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -410,7 +410,7 @@ impl DeltaLake { let table = self.metadata_delta_table(table_name).await?; // TableProvider::schema(&table) is used instead of table.schema() because table.schema() - // returns the Delta Lake schema instead of the Apache Arrow DataFusion schema. + // returns the Delta Lake schema instead of the Apache DataFusion schema. let record_batch = RecordBatch::try_new(TableProvider::schema(&table), columns)?; self.write_record_batches_to_table( diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 2273a7914..8ef73dfd3 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -219,8 +219,8 @@ fn rewrite_aggregates_to_use_segments( && aggregate_exec.filter_expr().iter().all(Option::is_none) && aggregate_exec.group_expr().is_empty() { - // Remove RepartitionExec if added by Apache Arrow DataFusion. Both AggregateExec - // and RepartitionExec can only have one child, so it is not necessary to check it. + // Remove RepartitionExec if added by Apache DataFusion. Both AggregateExec and + // RepartitionExec can only have one child, so it is not necessary to check it. let maybe_repartition_exec = &aggregate_exec_children[0]; let aggregate_exec_input = if let Some(repartition_exec) = maybe_repartition_exec .as_any() @@ -688,8 +688,8 @@ mod tests { #[tokio::test] async fn test_rewrite_aggregates_on_one_column_without_predicates() { - // Apache Arrow DataFusion 30 creates two input columns to AggregateExec when both SUM and - // AVG is computed in the same query, so for now, multiple queries are used for the test. 
+ // Apache DataFusion 30 creates two input columns to AggregateExec when both SUM and AVG is + // computed in the same query, so for now, multiple queries are used for the test. let query_no_avg = &format!( "SELECT COUNT(field_1), MIN(field_1), MAX(field_1), SUM(field_1) FROM {}", test::MODEL_TABLE_NAME diff --git a/crates/modelardb_storage/src/query/generated_as_exec.rs b/crates/modelardb_storage/src/query/generated_as_exec.rs index 0d5d1591b..c6c83d2a2 100644 --- a/crates/modelardb_storage/src/query/generated_as_exec.rs +++ b/crates/modelardb_storage/src/query/generated_as_exec.rs @@ -13,9 +13,9 @@ * limitations under the License. */ -//! Implementation of the Apache Arrow DataFusion execution plan [`GeneratedAsExec`] and its -//! corresponding stream [`GeneratedAsStream`] which computes generated columns and adds them to the -//! result. Generated columns can be computed from other columns and constant values. +//! Implementation of the Apache DataFusion execution plan [`GeneratedAsExec`] and its corresponding +//! stream [`GeneratedAsStream`] which computes generated columns and adds them to the result. +//! Generated columns can be computed from other columns and constant values. use std::any::Any; use std::fmt::{Formatter, Result as FmtResult}; diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index 40ba58262..99cd43504 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -13,7 +13,7 @@ * limitations under the License. */ -//! Implementation of the Apache Arrow DataFusion execution plan [`GridExec`] and its corresponding +//! Implementation of the Apache DataFusion execution plan [`GridExec`] and its corresponding //! stream [`GridStream`] which reconstructs the data points for a specific column from the //! compressed segments containing metadata and models. 
@@ -188,8 +188,8 @@ impl ExecutionPlan for GridExec { } /// Specify that [`GridExec`] requires one partition for each input as it assumes that the - /// sort order are the same for its input and Apache Arrow DataFusion only guarantees the - /// sort order within each partition rather than the input's global sort order. + /// sort order are the same for its input and Apache DataFusion only guarantees the sort order + /// within each partition rather than the input's global sort order. fn required_input_distribution(&self) -> Vec { vec![Distribution::SinglePartition] } @@ -286,7 +286,8 @@ impl GridStream { _error_array ); - let mut tag_arrays = vec![]; + let mut tag_arrays = + Vec::with_capacity(batch.num_columns() - QUERY_COMPRESSED_SCHEMA.0.fields().len()); for tag_index in QUERY_COMPRESSED_SCHEMA.0.fields().len()..batch.num_columns() { tag_arrays.push(modelardb_types::array!(batch, tag_index, StringArray)); } @@ -299,7 +300,7 @@ impl GridStream { let mut timestamp_builder = TimestampBuilder::with_capacity(current_rows + new_rows); let mut value_builder = ValueBuilder::with_capacity(current_rows + new_rows); - let mut tag_builders = vec![]; + let mut tag_builders = Vec::with_capacity(tag_arrays.len()); for _ in 0..tag_arrays.len() { tag_builders.push(StringBuilder::with_capacity( current_rows + new_rows, @@ -320,6 +321,8 @@ impl GridStream { for (index, tag_builder) in tag_builders.iter_mut().enumerate() { let tag_array = modelardb_types::array!(current_batch, index + 2, StringArray); + + // Append each value individually since StringBuilder does not have an append_slice() method. 
for i in self.current_batch_offset..current_batch.num_rows() { tag_builder.append_value(tag_array.value(i)); } @@ -359,10 +362,9 @@ impl GridStream { ); } - let mut columns: Vec = vec![ - Arc::new(timestamp_builder.finish()), - Arc::new(value_builder.finish()), - ]; + let mut columns: Vec = Vec::with_capacity(tag_builders.len() + 2); + columns.push(Arc::new(timestamp_builder.finish())); + columns.push(Arc::new(value_builder.finish())); for mut tag_builder in tag_builders { columns.push(Arc::new(tag_builder.finish())); diff --git a/crates/modelardb_storage/src/query/metadata_table.rs b/crates/modelardb_storage/src/query/metadata_table.rs index fee2571b6..e8725b9a7 100644 --- a/crates/modelardb_storage/src/query/metadata_table.rs +++ b/crates/modelardb_storage/src/query/metadata_table.rs @@ -29,8 +29,8 @@ use tonic::async_trait; /// A queryable representation of a metadata table. [`MetadataTable`] wraps the [`TableProvider`] of /// [`DeltaTable`] and passes most methods calls directly to it. Thus, it can be registered with -/// Apache Arrow DataFusion. The only difference from [`DeltaTable`] is that `delta_table` is -/// updated to the latest snapshot when accessed. +/// Apache DataFusion. The only difference from [`DeltaTable`] is that `delta_table` is updated to +/// the latest snapshot when accessed. #[derive(Debug)] pub(crate) struct MetadataTable { /// Access to the Delta Lake table. diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index e3d40e8fe..c8c29c100 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -76,7 +76,7 @@ pub(crate) struct ModelTable { query_order_segment: LexOrdering, /// The sort order that [`GridExec`] requires for the segments it receives as its input. query_requirement_segment: LexRequirement, - /// Schema used internally during query processing. 
+ /// Schema used to reconstruct the data points from each field column in the compressed segments. grid_schema: Arc, /// The sort order [`GridExec`] guarantees for the data points it produces. It is guaranteed by /// [`GridExec`] because it receives segments sorted by `query_order_segment` from [`ParquetExec`] @@ -108,7 +108,11 @@ impl ModelTable { }; // Add the tag columns to the base schema for queryable compressed segments. - let mut query_compressed_schema_fields = QUERY_COMPRESSED_SCHEMA.0.fields.clone().to_vec(); + let mut query_compressed_schema_fields = Vec::with_capacity( + QUERY_COMPRESSED_SCHEMA.0.fields.len() + model_table_metadata.tag_column_indices.len(), + ); + + query_compressed_schema_fields.extend(QUERY_COMPRESSED_SCHEMA.0.fields.clone().to_vec()); for index in &model_table_metadata.tag_column_indices { query_compressed_schema_fields .push(Arc::new(model_table_metadata.schema.field(*index).clone())); @@ -123,7 +127,11 @@ impl ModelTable { ); // Add the tag columns to the base schema for data points. - let mut grid_schema_fields = GRID_SCHEMA.0.fields.clone().to_vec(); + let mut grid_schema_fields = Vec::with_capacity( + GRID_SCHEMA.0.fields.len() + model_table_metadata.tag_column_indices.len(), + ); + + grid_schema_fields.extend(GRID_SCHEMA.0.fields.clone().to_vec()); for index in &model_table_metadata.tag_column_indices { grid_schema_fields.push(Arc::new(model_table_metadata.schema.field(*index).clone())); } @@ -212,7 +220,8 @@ fn query_order_and_requirement( nulls_first: false, }; - let mut physical_sort_exprs = vec![]; + let mut physical_sort_exprs = + Vec::with_capacity(model_table_metadata.tag_column_indices.len() + 1); for index in &model_table_metadata.tag_column_indices { let tag_column_name = model_table_metadata.schema.field(*index).name(); @@ -361,7 +370,7 @@ fn new_binary_expr(left: Expr, op: Operator, right: Expr) -> Expr { } /// Convert `maybe_expr` to a [`PhysicalExpr`] with the types in `query_schema` if possible. 
-fn maybe_convert_logical_expr_to_physical_expr( +fn try_convert_logical_expr_to_physical_expr( maybe_expr: Option<&Expr>, query_schema: SchemaRef, ) -> DataFusionResult>> { @@ -586,12 +595,12 @@ impl TableProvider for ModelTable { let (maybe_rewritten_parquet_filters, maybe_rewritten_grid_filters) = rewrite_and_combine_filters(schema, filters); - let maybe_physical_parquet_filters = maybe_convert_logical_expr_to_physical_expr( + let maybe_physical_parquet_filters = try_convert_logical_expr_to_physical_expr( maybe_rewritten_parquet_filters.as_ref(), self.query_compressed_schema.clone(), )?; - let maybe_physical_grid_filters = maybe_convert_logical_expr_to_physical_expr( + let maybe_physical_grid_filters = try_convert_logical_expr_to_physical_expr( maybe_rewritten_grid_filters.as_ref(), self.grid_schema.clone(), )?; diff --git a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index 191714a24..fdfd1b8be 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -34,7 +34,7 @@ use tonic::async_trait; /// A queryable representation of a normal table. [`NormalTable`] wraps the [`TableProvider`] /// [`DeltaTable`] and passes most methods calls directly to it. Thus, it can be registered with -/// Apache Arrow DataFusion. [`DeltaTable`] is extended in two ways, `delta_table` is updated to the +/// Apache DataFusion. [`DeltaTable`] is extended in two ways, `delta_table` is updated to the /// latest snapshot when accessed and support for inserting has been added. #[derive(Debug)] pub(crate) struct NormalTable { diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index c2ff0ae88..0367f0c33 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -13,8 +13,8 @@ * limitations under the License. */ -//! 
Implementation of the Apache Arrow DataFusion execution plan [`SortedJoinExec`] and its -//! corresponding stream [`SortedJoinStream`] which joins multiple sorted array produced by +//! Implementation of the Apache DataFusion execution plan [`SortedJoinExec`] and its corresponding +//! stream [`SortedJoinStream`] which joins multiple sorted array produced by //! [`GridExecs`](crate::query::grid_exec::GridExec) streams and combines them with the time series //! tags retrieved from the [`TableMetadataManager`](metadata::table_metadata_manager::TableMetadataManager) //! to create the complete results containing a timestamp column, one or more field columns, and zero @@ -171,8 +171,8 @@ impl ExecutionPlan for SortedJoinExec { } /// Specify that [`SortedJoinStream`] requires one partition for each input as it assumes that - /// the sort order is the same for all inputs and Apache Arrow DataFusion only guarantees the - /// sort order within each partition rather than the inputs' global sort order. + /// the sort order is the same for all inputs and Apache DataFusion only guarantees the sort + /// order within each partition rather than the inputs' global sort order. 
fn required_input_distribution(&self) -> Vec { vec![Distribution::SinglePartition; self.inputs.len()] } From a9eaeb844785664312086622f2b79621080d51b1 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 17:44:51 +0100 Subject: [PATCH 68/69] Use Arc instead of SchemaRef --- crates/modelardb_client/src/main.rs | 6 +++--- crates/modelardb_common/src/remote.rs | 5 +++-- crates/modelardb_server/src/context.rs | 4 ++-- crates/modelardb_server/src/remote.rs | 3 +-- crates/modelardb_storage/src/lib.rs | 4 ++-- .../src/query/generated_as_exec.rs | 14 +++++++------- .../modelardb_storage/src/query/grid_exec.rs | 11 +++++------ .../src/query/metadata_table.rs | 5 +++-- .../src/query/model_table.rs | 19 +++++++------------ .../src/query/normal_table.rs | 5 +++-- .../src/query/sorted_join_exec.rs | 14 +++++++------- crates/modelardb_types/src/types.rs | 13 ++++++++----- 12 files changed, 51 insertions(+), 52 deletions(-) diff --git a/crates/modelardb_client/src/main.rs b/crates/modelardb_client/src/main.rs index fec58c286..7cd07166b 100644 --- a/crates/modelardb_client/src/main.rs +++ b/crates/modelardb_client/src/main.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use std::time::Instant; use arrow::array::ArrayRef; -use arrow::datatypes::{Schema, SchemaRef, ToByteSlice}; +use arrow::datatypes::{Schema, ToByteSlice}; use arrow::ipc::convert; use arrow::util::pretty; use arrow_flight::flight_service_client::FlightServiceClient; @@ -387,7 +387,7 @@ async fn execute_query_and_print_result( /// Returns [`ModelarDbClientError`] if the batches in the result set could not be printed. async fn print_batches_with_confirmation( mut stream: Streaming, - schema: SchemaRef, + schema: Arc, dictionaries_by_id: &HashMap, ) -> Result<()> { let mut user_input = String::new(); @@ -424,7 +424,7 @@ async fn print_batches_with_confirmation( /// batches in the result set could not be printed. 
async fn print_batches_without_confirmation( mut stream: Streaming, - schema: SchemaRef, + schema: Arc, dictionaries_by_id: &HashMap, ) -> Result<()> { while let Some(flight_data) = stream.message().await? { diff --git a/crates/modelardb_common/src/remote.rs b/crates/modelardb_common/src/remote.rs index 1e0450499..ab2a621e3 100644 --- a/crates/modelardb_common/src/remote.rs +++ b/crates/modelardb_common/src/remote.rs @@ -17,9 +17,10 @@ use std::collections::HashMap; use std::error::Error; +use std::sync::Arc; use arrow::array::ArrayRef; -use arrow::datatypes::SchemaRef; +use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use arrow_flight::{FlightData, FlightDescriptor, utils}; use tonic::Status; @@ -39,7 +40,7 @@ pub fn table_name_from_flight_descriptor( /// could not be converted, [`Status`] is returned. pub fn flight_data_to_record_batch( flight_data: &FlightData, - schema: &SchemaRef, + schema: &Arc, dictionaries_by_id: &HashMap, ) -> Result { debug_assert_eq!(flight_data.flight_descriptor, None); diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 59c2f7b0d..3f4aec0ad 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -18,7 +18,7 @@ use std::sync::Arc; -use datafusion::arrow::datatypes::{Schema, SchemaRef}; +use datafusion::arrow::datatypes::Schema; use datafusion::catalog::SchemaProvider; use datafusion::prelude::SessionContext; use modelardb_storage::metadata::model_table_metadata::ModelTableMetadata; @@ -391,7 +391,7 @@ impl Context { pub async fn schema_of_table_in_default_database_schema( &self, table_name: &str, - ) -> Result { + ) -> Result> { let database_schema = self.default_database_schema()?; let table = database_schema.table(table_name).await?.ok_or_else(|| { diff --git a/crates/modelardb_server/src/remote.rs b/crates/modelardb_server/src/remote.rs index 33de90ad8..0d5cccfe4 100644 --- a/crates/modelardb_server/src/remote.rs +++ 
b/crates/modelardb_server/src/remote.rs @@ -32,7 +32,6 @@ use arrow_flight::{ SchemaResult, Ticket, utils, }; use datafusion::arrow::array::{ArrayRef, StringArray, UInt64Array}; -use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::DataFusionError; @@ -273,7 +272,7 @@ impl FlightServiceHandler { async fn ingest_into_normal_table( &self, table_name: &str, - schema: &SchemaRef, + schema: &Arc, flight_data_stream: &mut Streaming, ) -> StdResult<(), Status> { // Retrieve the data until the request does not contain any more data. diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index be4fb70ee..9a4779bcb 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -33,7 +33,7 @@ use arrow::array::{ }; use arrow::compute; use arrow::compute::concat_batches; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Schema}; use arrow::ipc::reader::StreamReader; use arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; use arrow_flight::{IpcMessage, SchemaAsIpc}; @@ -300,7 +300,7 @@ pub fn try_convert_record_batch_to_bytes(record_batch: &RecordBatch) -> Result, - schema: &SchemaRef, + schema: &Arc, ) -> Result { let bytes: Bytes = record_batch_bytes.into(); let reader = StreamReader::try_new(bytes.reader(), None)?; diff --git a/crates/modelardb_storage/src/query/generated_as_exec.rs b/crates/modelardb_storage/src/query/generated_as_exec.rs index c6c83d2a2..1ee6b4ad9 100644 --- a/crates/modelardb_storage/src/query/generated_as_exec.rs +++ b/crates/modelardb_storage/src/query/generated_as_exec.rs @@ -23,8 +23,8 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; +use arrow::datatypes::Schema; use datafusion::arrow::array::StringArray; -use 
datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::arrow::temporal_conversions; use datafusion::error::{DataFusionError, Result as DataFusionResult}; @@ -63,7 +63,7 @@ impl ColumnToGenerate { #[derive(Debug)] pub(super) struct GeneratedAsExec { /// Schema of the execution plan. - schema: SchemaRef, + schema: Arc, /// Columns to generate and the index they should be at. columns_to_generate: Vec, /// Execution plan to read batches of segments from. @@ -76,7 +76,7 @@ pub(super) struct GeneratedAsExec { impl GeneratedAsExec { pub(super) fn new( - schema: SchemaRef, + schema: Arc, columns_to_generate: Vec, input: Arc, ) -> Arc { @@ -113,7 +113,7 @@ impl ExecutionPlan for GeneratedAsExec { } /// Return the schema of the plan. - fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } @@ -184,7 +184,7 @@ impl DisplayAs for GeneratedAsExec { /// adds them to the batch, and then returns the result. struct GeneratedAsStream { /// Schema of the stream. - schema: SchemaRef, + schema: Arc, /// Columns to generate and the index they should be at. columns_to_generate: Vec, /// Stream to read batches of rows from. @@ -195,7 +195,7 @@ struct GeneratedAsStream { impl GeneratedAsStream { fn new( - schema: SchemaRef, + schema: Arc, columns_to_generate: Vec, input: SendableRecordBatchStream, baseline_metrics: BaselineMetrics, @@ -321,7 +321,7 @@ impl Stream for GeneratedAsStream { impl RecordBatchStream for GeneratedAsStream { /// Return the schema of the stream. 
- fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } } diff --git a/crates/modelardb_storage/src/query/grid_exec.rs b/crates/modelardb_storage/src/query/grid_exec.rs index 99cd43504..114c879ef 100644 --- a/crates/modelardb_storage/src/query/grid_exec.rs +++ b/crates/modelardb_storage/src/query/grid_exec.rs @@ -31,7 +31,6 @@ use datafusion::arrow::array::{ Array, ArrayBuilder, ArrayRef, BinaryArray, Float32Array, UInt8Array, }; use datafusion::arrow::compute::filter_record_batch; -use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::cast::as_boolean_array; use datafusion::error::{DataFusionError, Result as DataFusionResult}; @@ -56,7 +55,7 @@ use modelardb_types::types::{TimestampArray, TimestampBuilder, ValueArray, Value #[derive(Debug, Clone)] pub(crate) struct GridExec { /// Schema of the execution plan. - schema: SchemaRef, + schema: Arc, /// Predicate to filter data points by. maybe_predicate: Option>, /// Number of data points requested by the query. @@ -123,7 +122,7 @@ impl ExecutionPlan for GridExec { } /// Return the schema of the plan. - fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } @@ -218,7 +217,7 @@ impl DisplayAs for GridExec { /// points from the metadata and models in the segments, and returns batches of data points. struct GridStream { /// Schema of the stream. - schema: SchemaRef, + schema: Arc, /// Predicate to filter data points by. maybe_predicate: Option>, /// Stream to read batches of compressed segments from. @@ -235,7 +234,7 @@ struct GridStream { impl GridStream { fn new( - schema: SchemaRef, + schema: Arc, maybe_predicate: Option>, limit: Option, input: SendableRecordBatchStream, @@ -429,7 +428,7 @@ impl Stream for GridStream { impl RecordBatchStream for GridStream { /// Return the schema of the stream. 
- fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } } diff --git a/crates/modelardb_storage/src/query/metadata_table.rs b/crates/modelardb_storage/src/query/metadata_table.rs index e8725b9a7..5a814070f 100644 --- a/crates/modelardb_storage/src/query/metadata_table.rs +++ b/crates/modelardb_storage/src/query/metadata_table.rs @@ -19,12 +19,13 @@ use std::{any::Any, sync::Arc}; +use arrow::datatypes::Schema; use datafusion::catalog::Session; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::logical_expr::Expr; use datafusion::physical_plan::ExecutionPlan; -use deltalake::{DeltaTable, arrow::datatypes::SchemaRef}; +use deltalake::DeltaTable; use tonic::async_trait; /// A queryable representation of a metadata table. [`MetadataTable`] wraps the [`TableProvider`] of @@ -51,7 +52,7 @@ impl TableProvider for MetadataTable { } /// Return the query schema of the metadata table registered with Apache DataFusion. 
- fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { TableProvider::schema(&self.delta_table) } diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index c8c29c100..205928c23 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -26,9 +26,7 @@ use std::sync::Arc; use arrow::compute::SortOptions; use arrow::datatypes::DataType::Utf8; use async_trait::async_trait; -use datafusion::arrow::datatypes::{ - ArrowPrimitiveType, DataType, Field, Schema, SchemaRef, TimeUnit, -}; +use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema, TimeUnit}; use datafusion::catalog::Session; use datafusion::common::{Statistics, ToDFSchema}; use datafusion::datasource::listing::PartitionedFile; @@ -255,10 +253,7 @@ fn query_order_and_requirement( /// to a filter that is written in terms of the schema used for compressed segments by the storage /// engine and a filter that is written in terms of the schema used for univariate time series by /// [`GridExec`] for its output. If the filters cannot be rewritten an empty [`None`] is returned. -fn rewrite_and_combine_filters( - schema: &SchemaRef, - filters: &[Expr], -) -> (Option, Option) { +fn rewrite_and_combine_filters(schema: &Schema, filters: &[Expr]) -> (Option, Option) { let rewritten_filters = filters .iter() .filter_map(|filter| rewrite_filter(schema, filter)); @@ -279,7 +274,7 @@ fn rewrite_and_combine_filters( /// that is written in terms of the schema used for compressed segments by the storage engine and a /// filter that is written in terms of the schema used for univariate time series by [`GridExec`]. /// If the filter cannot be rewritten, [`None`] is returned. 
-fn rewrite_filter(query_schema: &SchemaRef, filter: &Expr) -> Option<(Expr, Expr)> { +fn rewrite_filter(query_schema: &Schema, filter: &Expr) -> Option<(Expr, Expr)> { match filter { Expr::BinaryExpr(BinaryExpr { left, op, right }) => { if let Expr::Column(column) = &**left { @@ -372,7 +367,7 @@ fn new_binary_expr(left: Expr, op: Operator, right: Expr) -> Expr { /// Convert `maybe_expr` to a [`PhysicalExpr`] with the types in `query_schema` if possible. fn try_convert_logical_expr_to_physical_expr( maybe_expr: Option<&Expr>, - query_schema: SchemaRef, + query_schema: Arc, ) -> DataFusionResult>> { // Option.map() is not used so errors can be returned with ?. if let Some(maybe_expr) = maybe_expr { @@ -388,7 +383,7 @@ fn try_convert_logical_expr_to_physical_expr( /// Convert `expr` to a [`PhysicalExpr`] with the types in `query_schema`. fn convert_logical_expr_to_physical_expr( expr: &Expr, - query_schema: SchemaRef, + query_schema: Arc, ) -> DataFusionResult> { let df_query_schema = query_schema.clone().to_dfschema()?; planner::create_physical_expr(expr, &df_query_schema, &ExecutionProps::new()) @@ -402,7 +397,7 @@ fn new_apache_parquet_exec( partition_filters: &[PartitionFilter], maybe_limit: Option, maybe_parquet_filters: &Option>, - file_schema: SchemaRef, + file_schema: Arc, output_ordering: Vec, ) -> DataFusionResult> { // Collect the LogicalFiles into a Vec so they can be sorted the same for all field columns. @@ -486,7 +481,7 @@ impl TableProvider for ModelTable { } /// Return the query schema of the model table registered with Apache DataFusion. 
- fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.model_table_metadata.query_schema.clone() } diff --git a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index fdfd1b8be..31aa5d6cc 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -21,6 +21,7 @@ use std::borrow::Cow; use std::{any::Any, sync::Arc}; +use arrow::datatypes::Schema; use datafusion::catalog::Session; use datafusion::common::Constraints; use datafusion::datasource::{TableProvider, TableType}; @@ -29,7 +30,7 @@ use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown}; use datafusion::physical_plan::insert::{DataSink, DataSinkExec}; use datafusion::physical_plan::{ExecutionPlan, Statistics}; -use deltalake::{DeltaTable, arrow::datatypes::SchemaRef}; +use deltalake::DeltaTable; use tonic::async_trait; /// A queryable representation of a normal table. [`NormalTable`] wraps the [`TableProvider`] @@ -61,7 +62,7 @@ impl TableProvider for NormalTable { } /// Return the query schema of the normal table registered with Apache DataFusion. 
- fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { TableProvider::schema(&self.delta_table) } diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index 0367f0c33..11bc8e3fe 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -26,8 +26,8 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context as StdTaskContext, Poll}; +use arrow::datatypes::Schema; use datafusion::arrow::array::ArrayRef; -use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::execution::context::TaskContext; @@ -55,7 +55,7 @@ pub(crate) enum SortedJoinColumnType { #[derive(Debug)] pub(crate) struct SortedJoinExec { /// Schema of the execution plan. - schema: SchemaRef, + schema: Arc, /// Order of columns to return. return_order: Vec, /// Execution plans to read batches of data points from. @@ -70,7 +70,7 @@ pub(crate) struct SortedJoinExec { impl SortedJoinExec { pub(crate) fn new( - schema: SchemaRef, + schema: Arc, return_order: Vec, inputs: Vec>, query_requirement_data_point: LexRequirement, @@ -108,7 +108,7 @@ impl ExecutionPlan for SortedJoinExec { } /// Return the schema of the plan. - fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } @@ -199,7 +199,7 @@ impl DisplayAs for SortedJoinExec { struct SortedJoinStream { /// Schema of the stream. - schema: SchemaRef, + schema: Arc, /// Order of columns to return. return_order: Vec, /// Streams to read batches of data points from. 
@@ -212,7 +212,7 @@ struct SortedJoinStream { impl SortedJoinStream { fn new( - schema: SchemaRef, + schema: Arc, return_order: Vec, inputs: Vec, baseline_metrics: BaselineMetrics, @@ -334,7 +334,7 @@ impl Stream for SortedJoinStream { impl RecordBatchStream for SortedJoinStream { /// Return the schema of the stream. - fn schema(&self) -> SchemaRef { + fn schema(&self) -> Arc { self.schema.clone() } } diff --git a/crates/modelardb_types/src/types.rs b/crates/modelardb_types/src/types.rs index 141676212..093a36031 100644 --- a/crates/modelardb_types/src/types.rs +++ b/crates/modelardb_types/src/types.rs @@ -21,6 +21,9 @@ use std::fmt; use std::str::FromStr; +use std::sync::Arc; + +use arrow::datatypes::Schema; use crate::error::{ModelarDbTypesError, Result}; @@ -42,19 +45,19 @@ pub type ValueArray = arrow::array::PrimitiveArray; // Types used for the schema of compressed data, the configuration, and table metadata. #[derive(Clone)] -pub struct CompressedSchema(pub arrow::datatypes::SchemaRef); +pub struct CompressedSchema(pub Arc); #[derive(Clone)] -pub struct QueryCompressedSchema(pub arrow::datatypes::SchemaRef); +pub struct QueryCompressedSchema(pub Arc); #[derive(Clone)] -pub struct GridSchema(pub arrow::datatypes::SchemaRef); +pub struct GridSchema(pub Arc); #[derive(Clone)] -pub struct ConfigurationSchema(pub arrow::datatypes::SchemaRef); +pub struct ConfigurationSchema(pub Arc); #[derive(Clone)] -pub struct TableMetadataSchema(pub arrow::datatypes::SchemaRef); +pub struct TableMetadataSchema(pub Arc); /// Absolute or relative per-value error bound. 
#[derive(Debug, Copy, Clone, PartialEq)] From 6bd5d893d9e21b2ec48e8e2a45c0a171dc313866 Mon Sep 17 00:00:00 2001 From: CGodiksen <36046286+CGodiksen@users.noreply.github.com> Date: Wed, 5 Mar 2025 18:06:00 +0100 Subject: [PATCH 69/69] Update based on comments from @skejserjensen --- .../src/storage/uncompressed_data_buffer.rs | 7 +++---- crates/modelardb_storage/src/query/model_table.rs | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs index 4e60815b1..6fc614ff1 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_buffer.rs @@ -19,8 +19,8 @@ //! support for storing uncompressed data points in Apache Parquet files on disk. use std::fmt::{Debug, Formatter, Result as FmtResult}; -use std::{iter, mem}; use std::sync::Arc; +use std::{iter, mem}; use datafusion::arrow::array::{Array, ArrayBuilder, StringArray}; use datafusion::arrow::compute; @@ -173,9 +173,8 @@ impl UncompressedInMemoryDataBuffer { } else if self.model_table_metadata.is_tag(column_index) { // The tag value is the same for each data point so it is not sorted. let tag_value = self.tag_values[tag_column_index].clone(); - let tag_array: StringArray = iter::repeat(Some(tag_value)) - .take(buffer_length) - .collect(); + let tag_array: StringArray = + iter::repeat(Some(tag_value)).take(buffer_length).collect(); columns.push(Arc::new(tag_array)); tag_column_index += 1; diff --git a/crates/modelardb_storage/src/query/model_table.rs b/crates/modelardb_storage/src/query/model_table.rs index 205928c23..5cb60ab63 100644 --- a/crates/modelardb_storage/src/query/model_table.rs +++ b/crates/modelardb_storage/src/query/model_table.rs @@ -91,7 +91,7 @@ impl ModelTable { data_sink: Arc, ) -> Arc { // Compute the index of the first stored field column in the model table's query schema. 
It - // is used for queries without fields as uids, timestamps, and values are stored together. + // is used for queries without fields as tags, timestamps, and values are stored together. let fallback_field_column = { model_table_metadata .query_schema @@ -407,7 +407,6 @@ fn new_apache_parquet_exec( .collect::, DeltaTableError>>() .map_err(|error| DataFusionError::Plan(error.to_string()))?; - // TODO: prune the Apache Parquet files using metadata and maybe_parquet_filters if possible. logical_files.sort_by_key(|logical_file| logical_file.modification_time()); // Create the data source operator. Assumes the ObjectStore exists. @@ -416,8 +415,6 @@ fn new_apache_parquet_exec( .map(|logical_file| logical_file_to_partitioned_file(logical_file)) .collect::>>()?; - // TODO: give the optimizer more info for timestamps and values through statistics, e.g, min - // can be computed using only the metadata Delta Lake due to the aggregate_statistics rule. let log_store = delta_table.log_store(); let file_scan_config = FileScanConfig { object_store_url: log_store.object_store_url(),