From d96c93078630f67176ed79d59dfea0ef6fbd70ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 23 Apr 2025 20:32:32 +0000 Subject: [PATCH 01/31] Make delta_lake argument of TMM --- .../src/operations/data_folder.rs | 45 +++--- crates/modelardb_manager/src/metadata.rs | 7 +- crates/modelardb_server/src/data_folders.rs | 8 +- .../src/metadata/table_metadata_manager.rs | 131 ++---------------- 4 files changed, 37 insertions(+), 154 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 091f751a9..399e7f41e 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -110,7 +110,7 @@ impl DisplayAs for DataFolderDataSink { /// Provides access to modelardb_embedded's components. pub struct DataFolder { /// Delta Lake for storing metadata and data in Apache Parquet files. - delta_lake: DeltaLake, + delta_lake: Arc, /// Metadata manager for providing access to metadata related to tables. It is stored in an /// [`Arc`] because it is shared with each of the time series tables for use in query planning. table_metadata_manager: Arc, @@ -122,8 +122,10 @@ impl DataFolder { /// Creates a [`DataFolder`] that manages data in memory and returns it. If the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_memory() -> Result { - let delta_lake = DeltaLake::new_in_memory(); - let table_metadata_manager = Arc::new(TableMetadataManager::try_new_in_memory().await?); + let delta_lake = Arc::new(DeltaLake::new_in_memory()); + + let table_metadata_manager = + Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await } @@ -132,10 +134,10 @@ impl DataFolder { /// returns it. If the folder does not exist and could not be created or the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_local(data_folder_path: &StdPath) -> Result { - let delta_lake = DeltaLake::try_from_local_path(data_folder_path)?; + let delta_lake = Arc::new(DeltaLake::try_from_local_path(data_folder_path)?); let table_metadata_manager = - Arc::new(TableMetadataManager::try_from_path(data_folder_path).await?); + Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await } @@ -156,22 +158,15 @@ impl DataFolder { deltalake::aws::register_handlers(None); // Construct data folder. 
- let delta_lake = DeltaLake::try_from_s3_configuration( + let delta_lake = Arc::new(DeltaLake::try_from_s3_configuration( endpoint.clone(), bucket_name.clone(), access_key_id.clone(), secret_access_key.clone(), - )?; - - let table_metadata_manager = Arc::new( - TableMetadataManager::try_from_s3_configuration( - endpoint, - bucket_name, - access_key_id, - secret_access_key, - ) - .await?, - ); + )?); + + let table_metadata_manager = + Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await } @@ -184,20 +179,14 @@ impl DataFolder { access_key: String, container_name: String, ) -> Result { - let delta_lake = DeltaLake::try_from_azure_configuration( + let delta_lake = Arc::new(DeltaLake::try_from_azure_configuration( account_name.clone(), access_key.clone(), container_name.clone(), - )?; + )?); - let table_metadata_manager = Arc::new( - TableMetadataManager::try_from_azure_configuration( - account_name, - access_key, - container_name, - ) - .await?, - ); + let table_metadata_manager = + Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await } @@ -206,7 +195,7 @@ impl DataFolder { /// [`SessionContext`], and return it. If the tables could not be registered, /// [`ModelarDbEmbeddedError`] is returned. async fn try_new_and_register_tables( - delta_lake: DeltaLake, + delta_lake: Arc, table_metadata_manager: Arc, ) -> Result { // Construct data folder. diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index ed057c280..6783382a7 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -37,7 +37,7 @@ use crate::error::Result; /// and persisting edges. The data that needs to be persisted is stored in the metadata Delta Lake. pub struct MetadataManager { /// Delta Lake with functionality to read and write to and from the manager metadata tables. - delta_lake: DeltaLake, + delta_lake: Arc, /// Metadata manager used to interface with the subset of the manager metadata Delta Lake /// related to normal tables and time series tables. pub(crate) table_metadata_manager: TableMetadataManager, @@ -330,12 +330,13 @@ mod tests { async fn create_metadata_manager() -> (TempDir, MetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); - let table_metadata_manager = TableMetadataManager::try_from_path(temp_dir.path()) + let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); + let table_metadata_manager = TableMetadataManager::try_new(delta_lake.clone()) .await .unwrap(); let metadata_manager = MetadataManager { - delta_lake: DeltaLake::try_from_local_path(temp_dir.path()).unwrap(), + delta_lake, table_metadata_manager, session_context: Arc::new(SessionContext::new()), }; diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index 24adf1a50..19d69975c 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -43,8 +43,8 @@ impl DataFolder { /// could not be parsed, if the folder does not exist and could not be created, or if the /// metadata tables could not be created, [`ModelarDbServerError`] is returned. 
pub async fn try_from_local_url(local_url: &str) -> Result { - let delta_lake = DeltaLake::try_from_local_url(local_url)?; - let table_metadata_manager = TableMetadataManager::try_from_local_url(local_url).await?; + let delta_lake = Arc::new(DeltaLake::try_from_local_url(local_url)?); + let table_metadata_manager = TableMetadataManager::try_new(delta_lake.clone()).await?; if local_url.starts_with("memory://") { warn!( @@ -54,7 +54,7 @@ impl DataFolder { }; Ok(Self { - delta_lake: Arc::new(delta_lake), + delta_lake, table_metadata_manager: Arc::new(table_metadata_manager), }) } @@ -72,7 +72,7 @@ impl DataFolder { TableMetadataManager::try_from_storage_configuration(storage_configuration).await?; Ok(Self { - delta_lake: Arc::new(remote_delta_lake), + delta_lake, table_metadata_manager: Arc::new(remote_table_metadata_manager), }) } diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs index 7bde72bf6..cbb1660d0 100644 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs @@ -17,7 +17,6 @@ //! and the manager metadata Delta Lake. Note that the entire server metadata Delta Lake can be accessed //! through this metadata manager, while it only supports a subset of the manager metadata Delta Lake. -use std::path::Path as StdPath; use std::sync::Arc; use arrow::array::{Array, BinaryArray, BooleanArray, Float32Array, Int16Array, StringArray}; @@ -44,118 +43,17 @@ enum TableType { /// tables. The data that needs to be persisted is stored in the metadata Delta Lake. pub struct TableMetadataManager { /// Delta Lake with functionality to read and write to and from the metadata tables. - delta_lake: DeltaLake, + delta_lake: Arc, /// Session context used to query the metadata Delta Lake tables using Apache DataFusion. session_context: Arc, } impl TableMetadataManager { - /// Create a new [`TableMetadataManager`] that saves the metadata to an object store given by - /// `local_url` and initialize the metadata tables. If `local_url` could not be parsed or the + /// Create a new [`TableMetadataManager`] that saves the metadata to `delta_lake` If the /// metadata tables could not be created, return [`ModelarDbStorageError`]. - pub async fn try_from_local_url(local_url: &str) -> Result { + pub async fn try_new(delta_lake: Arc) -> Result { let table_metadata_manager = Self { - delta_lake: DeltaLake::try_from_local_url(local_url)?, - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// Create a new [`TableMetadataManager`] that saves the metadata to an in-memory Delta Lake and - /// initialize the metadata tables. If the metadata tables could not be created, return - /// [`ModelarDbStorageError`]. - pub async fn try_new_in_memory() -> Result { - let table_metadata_manager = Self { - delta_lake: DeltaLake::new_in_memory(), - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// Create a new [`TableMetadataManager`] that saves the metadata to `folder_path` and - /// initialize the metadata tables. If the metadata tables could not be created, return - /// [`ModelarDbStorageError`]. 
- pub async fn try_from_path(folder_path: &StdPath) -> Result { - let table_metadata_manager = Self { - delta_lake: DeltaLake::try_from_local_path(folder_path)?, - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// Create a new [`TableMetadataManager`] that saves the metadata to a remote object store given - /// by `storage_configuration` and initialize the metadata tables. If a connection could not be - /// made or the metadata tables could not be created, return [`ModelarDbStorageError`]. - pub async fn try_from_storage_configuration( - storage_configuration: protocol::manager_metadata::StorageConfiguration, - ) -> Result { - let table_metadata_manager = Self { - delta_lake: DeltaLake::try_remote_from_storage_configuration(storage_configuration)?, - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// Create a new [`TableMetadataManager`] that saves the metadata to a remote S3-compatible - /// object store and initialize the metadata tables. If the connection cannot be made or the - /// metadata tables could not be created, return [`ModelarDbStorageError`]. - pub async fn try_from_s3_configuration( - endpoint: String, - bucket_name: String, - access_key_id: String, - secret_access_key: String, - ) -> Result { - let table_metadata_manager = Self { - delta_lake: DeltaLake::try_from_s3_configuration( - endpoint, - bucket_name, - access_key_id, - secret_access_key, - )?, - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// Create a new [`TableMetadataManager`] that saves the metadata to a remote Azure-compatible - /// object store and initialize the metadata tables. If the connection cannot be made or the - /// metadata tables could not be created, return [`ModelarDbStorageError`]. - pub async fn try_from_azure_configuration( - account_name: String, - access_key: String, - container_name: String, - ) -> Result { - let table_metadata_manager = Self { - delta_lake: DeltaLake::try_from_azure_configuration( - account_name, - access_key, - container_name, - )?, + delta_lake, session_context: Arc::new(SessionContext::new()), }; @@ -619,11 +517,9 @@ mod tests { #[tokio::test] async fn test_create_metadata_delta_lake_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path()) - .await - .unwrap(); + let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); + let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); - // Verify that the tables were created, registered, and has the expected columns. 
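        // A caller-side sketch of the consolidation above: after this patch every
        // DataFolder constructor path (in-memory, local path, S3, and Azure) first builds
        // a DeltaLake and then hands it to TableMetadataManager::try_new. Minimal example
        // for a local path; the variable names are illustrative:
        //
        //     let delta_lake = Arc::new(DeltaLake::try_from_local_path(data_folder_path)?);
        //     let table_metadata_manager =
        //         Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?);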
assert!( metadata_manager .session_context @@ -850,9 +746,8 @@ mod tests { async fn create_metadata_manager_and_save_normal_tables() -> (TempDir, TableMetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); - let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path()) - .await - .unwrap(); + let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); + let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); metadata_manager .save_normal_table_metadata("normal_table_1") @@ -937,9 +832,8 @@ mod tests { #[tokio::test] async fn test_generated_columns() { let temp_dir = tempfile::tempdir().unwrap(); - let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path()) - .await - .unwrap(); + let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); + let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); let query_schema = Arc::new(Schema::new(vec![ Field::new("generated_column_1", ArrowValue::DATA_TYPE, false), @@ -1000,9 +894,8 @@ mod tests { async fn create_metadata_manager_and_save_time_series_table() -> (TempDir, TableMetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); - let metadata_manager = TableMetadataManager::try_from_path(temp_dir.path()) - .await - .unwrap(); + let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); + let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); // Save a time series table to the metadata Delta Lake. let time_series_table_metadata = table::time_series_table_metadata(); From ae9be084700918952217e778143ed02796a3c36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 24 Apr 2025 11:57:42 +0000 Subject: [PATCH 02/31] Remove TableMetadataManager --- .../src/operations/data_folder.rs | 79 +- crates/modelardb_manager/src/main.rs | 2 +- crates/modelardb_manager/src/metadata.rs | 16 +- crates/modelardb_manager/src/remote.rs | 27 +- crates/modelardb_server/src/context.rs | 88 +- crates/modelardb_server/src/data_folders.rs | 23 +- .../src/storage/compressed_data_manager.rs | 2 +- .../src/storage/data_transfer.rs | 9 +- .../src/storage/uncompressed_data_manager.rs | 2 +- crates/modelardb_storage/src/delta_lake.rs | 831 +++++++++++++++++- crates/modelardb_storage/src/lib.rs | 2 +- .../src/optimizer/model_simple_aggregates.rs | 4 +- crates/modelardb_storage/src/parser.rs | 1 + .../src/query/time_series_table.rs | 1 + .../src/time_series_table_metadata.rs | 559 ++++++++++++ 15 files changed, 1481 insertions(+), 165 deletions(-) create mode 100644 crates/modelardb_storage/src/time_series_table_metadata.rs diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 399e7f41e..d6c43b45c 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -110,10 +110,7 @@ impl DisplayAs for DataFolderDataSink { /// Provides access to modelardb_embedded's components. pub struct DataFolder { /// Delta Lake for storing metadata and data in Apache Parquet files. - delta_lake: Arc, - /// Metadata manager for providing access to metadata related to tables. It is stored in an - /// [`Arc`] because it is shared with each of the time series tables for use in query planning. - table_metadata_manager: Arc, + delta_lake: DeltaLake, /// Context providing access to a specific session of Apache DataFusion. 
session_context: SessionContext, } @@ -122,24 +119,16 @@ impl DataFolder { /// Creates a [`DataFolder`] that manages data in memory and returns it. If the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_memory() -> Result { - let delta_lake = Arc::new(DeltaLake::new_in_memory()); - - let table_metadata_manager = - Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); - - Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await + let delta_lake = DeltaLake::new_in_memory().await?; + Self::try_new_and_register_tables(delta_lake).await } /// Creates a [`DataFolder`] that manages data in the local folder at `data_folder_path` and /// returns it. If the folder does not exist and could not be created or the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_local(data_folder_path: &StdPath) -> Result { - let delta_lake = Arc::new(DeltaLake::try_from_local_path(data_folder_path)?); - - let table_metadata_manager = - Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); - - Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await + let delta_lake = DeltaLake::try_from_local_path(data_folder_path).await?; + Self::try_new_and_register_tables(delta_lake).await } /// Creates a [`DataFolder`] that manages data in an object store with an S3-compatible API and @@ -158,17 +147,15 @@ impl DataFolder { deltalake::aws::register_handlers(None); // Construct data folder. - let delta_lake = Arc::new(DeltaLake::try_from_s3_configuration( + let delta_lake = DeltaLake::try_from_s3_configuration( endpoint.clone(), bucket_name.clone(), access_key_id.clone(), secret_access_key.clone(), - )?); - - let table_metadata_manager = - Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); + ) + .await?; - Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await + Self::try_new_and_register_tables(delta_lake).await } /// Creates a [`DataFolder`] that manages data in an object store with an Azure-compatible API @@ -179,42 +166,32 @@ impl DataFolder { access_key: String, container_name: String, ) -> Result { - let delta_lake = Arc::new(DeltaLake::try_from_azure_configuration( + let delta_lake = DeltaLake::try_from_azure_configuration( account_name.clone(), access_key.clone(), container_name.clone(), - )?); - - let table_metadata_manager = - Arc::new(TableMetadataManager::try_new(delta_lake.clone()).await?); + ) + .await?; - Self::try_new_and_register_tables(delta_lake, table_metadata_manager).await + Self::try_new_and_register_tables(delta_lake).await } /// Create a [`DataFolder`], register all normal tables and time series tables in it with its /// [`SessionContext`], and return it. If the tables could not be registered, /// [`ModelarDbEmbeddedError`] is returned. - async fn try_new_and_register_tables( - delta_lake: Arc, - table_metadata_manager: Arc, - ) -> Result { + async fn try_new_and_register_tables(delta_lake: DeltaLake) -> Result { // Construct data folder. let session_context = modelardb_storage::create_session_context(); let data_folder = DataFolder { delta_lake, - table_metadata_manager, session_context, }; // Register normal tables. let data_sink = Arc::new(DataFolderDataSink::new()); - for normal_table_name in data_folder - .table_metadata_manager - .normal_table_names() - .await? - { + for normal_table_name in data_folder.delta_lake.normal_table_names().await? 
{ let delta_table = data_folder .delta_lake .delta_table(&normal_table_name) @@ -229,11 +206,7 @@ impl DataFolder { } // Register time series tables. - for metadata in data_folder - .table_metadata_manager - .time_series_table_metadata() - .await? - { + for metadata in data_folder.delta_lake.time_series_table_metadata().await? { let delta_table = data_folder.delta_lake.delta_table(&metadata.name).await?; modelardb_storage::register_time_series_table( @@ -382,7 +355,7 @@ impl DataFolder { /// table does not exist or the table is not a normal table, return [`None`]. async fn normal_table_schema(&self, table_name: &str) -> Option { if self - .table_metadata_manager + .delta_lake .is_normal_table(table_name) .await .is_ok_and(|is_normal_table| is_normal_table) @@ -429,7 +402,7 @@ impl Operations for DataFolder { .create_normal_table(table_name, &schema) .await?; - self.table_metadata_manager + self.delta_lake .save_normal_table_metadata(table_name) .await?; @@ -455,7 +428,7 @@ impl Operations for DataFolder { .create_time_series_table(&time_series_table_metadata) .await?; - self.table_metadata_manager + self.delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await?; @@ -476,7 +449,7 @@ impl Operations for DataFolder { /// Returns the name of all the tables. If the table names could not be retrieved from the /// metadata Delta Lake, [`ModelarDbEmbeddedError`] is returned. async fn tables(&mut self) -> Result> { - self.table_metadata_manager + self.delta_lake .table_names() .await .map_err(|error| error.into()) @@ -830,9 +803,7 @@ impl Operations for DataFolder { self.session_context.deregister_table(table_name)?; // Delete the table metadata from the metadata Delta Lake. - self.table_metadata_manager - .drop_table_metadata(table_name) - .await?; + self.delta_lake.drop_table_metadata(table_name).await?; // Drop the table from the Delta Lake. self.delta_lake.drop_table(table_name).await?; @@ -2408,7 +2379,7 @@ mod tests { // Verify that the normal table was dropped from the metadata Delta Lake. assert!( !data_folder - .table_metadata_manager + .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -2448,7 +2419,7 @@ mod tests { // Verify that the time series table was dropped from the metadata Delta Lake. assert!( !data_folder - .table_metadata_manager + .delta_lake .is_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -2683,7 +2654,7 @@ mod tests { // Verify that the normal table exists in the metadata Delta Lake. assert!( data_folder - .table_metadata_manager + .delta_lake .is_normal_table(table_name) .await .unwrap() @@ -2897,7 +2868,7 @@ mod tests { // Verify that the time series table exists in the metadata Delta Lake with the correct schema. let time_series_table_metadata = data_folder - .table_metadata_manager + .delta_lake .time_series_table_metadata_for_time_series_table(table_name) .await .unwrap(); diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index 7b3b8a678..669a297e6 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -48,7 +48,7 @@ pub struct RemoteDataFolder { /// Remote object store for storing data and metadata in Apache Parquet files. delta_lake: Arc, /// Manager for the access to the metadata Delta Lake. 
- metadata_manager: Arc, + pub(crate) metadata_manager: MetadataManager, } impl RemoteDataFolder { diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 6783382a7..3488a0fd0 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -25,7 +25,6 @@ use deltalake::DeltaTableError; use deltalake::datafusion::logical_expr::{col, lit}; use deltalake::datafusion::prelude::SessionContext; use modelardb_storage::delta_lake::DeltaLake; -use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_storage::{register_metadata_table, sql_and_concat}; use modelardb_types::flight::protocol; use modelardb_types::types::{Node, ServerMode}; @@ -36,13 +35,10 @@ use crate::error::Result; /// Stores the metadata required for reading from and writing to the normal tables and time series tables /// and persisting edges. The data that needs to be persisted is stored in the metadata Delta Lake. pub struct MetadataManager { - /// Delta Lake with functionality to read and write to and from the manager metadata tables. - delta_lake: Arc, - /// Metadata manager used to interface with the subset of the manager metadata Delta Lake - /// related to normal tables and time series tables. - pub(crate) table_metadata_manager: TableMetadataManager, + /// Delta Lake with functionality to read and write to and from the manager's metadata tables. + pub(crate) delta_lake: DeltaLake, /// Session context used to query the manager metadata Delta Lake tables using Apache DataFusion. - session_context: Arc, + session_context: SessionContext, } impl MetadataManager { @@ -330,15 +326,13 @@ mod tests { async fn create_metadata_manager() -> (TempDir, MetadataManager) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); - let table_metadata_manager = TableMetadataManager::try_new(delta_lake.clone()) + let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) .await .unwrap(); let metadata_manager = MetadataManager { delta_lake, - table_metadata_manager, - session_context: Arc::new(SessionContext::new()), + session_context: SessionContext::new(), }; metadata_manager diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 0619ebc87..914f2aa67 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -95,13 +95,9 @@ impl FlightServiceHandler { /// Return the schema of the table with the name `table_name`. If the table does not exist or /// the schema cannot be retrieved, return [`Status`]. async fn table_schema(&self, table_name: &str) -> StdResult, Status> { - let table_metadata_manager = &self - .context - .remote_data_folder - .metadata_manager - .table_metadata_manager; + let delta_lake = &self.context.remote_data_folder.metadata_manager.delta_lake; - if table_metadata_manager + if delta_lake .is_normal_table(table_name) .await .map_err(error_to_status_internal)? @@ -109,6 +105,7 @@ impl FlightServiceHandler { let delta_table = self .context .remote_data_folder + .metadata_manager .delta_lake .delta_table(table_name) .await @@ -121,12 +118,12 @@ impl FlightServiceHandler { .map_err(error_to_status_internal)?; Ok(Arc::new(schema)) - } else if table_metadata_manager + } else if delta_lake .is_time_series_table(table_name) .await .map_err(error_to_status_internal)? 
{ - let time_series_table_metadata = table_metadata_manager + let time_series_table_metadata = delta_lake .time_series_table_metadata_for_time_series_table(table_name) .await .map_err(error_to_status_internal)?; @@ -146,7 +143,7 @@ impl FlightServiceHandler { .context .remote_data_folder .metadata_manager - .table_metadata_manager + .delta_lake .table_names() .await .map_err(error_to_status_internal)?; @@ -173,6 +170,7 @@ impl FlightServiceHandler { // Create an empty Delta Lake table. self.context .remote_data_folder + .metadata_manager .delta_lake .create_normal_table(table_name, schema) .await @@ -182,7 +180,7 @@ impl FlightServiceHandler { self.context .remote_data_folder .metadata_manager - .table_metadata_manager + .delta_lake .save_normal_table_metadata(table_name) .await .map_err(error_to_status_internal)?; @@ -220,6 +218,7 @@ impl FlightServiceHandler { // Create an empty Delta Lake table. self.context .remote_data_folder + .metadata_manager .delta_lake .create_time_series_table(&time_series_table_metadata) .await @@ -229,7 +228,7 @@ impl FlightServiceHandler { self.context .remote_data_folder .metadata_manager - .table_metadata_manager + .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .map_err(error_to_status_internal)?; @@ -271,7 +270,7 @@ impl FlightServiceHandler { self.context .remote_data_folder .metadata_manager - .table_metadata_manager + .delta_lake .drop_table_metadata(table_name) .await .map_err(error_to_status_internal)?; @@ -279,6 +278,7 @@ impl FlightServiceHandler { // Drop the table from the remote data folder data Delta lake. self.context .remote_data_folder + .metadata_manager .delta_lake .drop_table(table_name) .await @@ -309,6 +309,7 @@ impl FlightServiceHandler { // Truncate the table in the remote data folder data Delta lake. self.context .remote_data_folder + .metadata_manager .delta_lake .truncate_table(table_name) .await @@ -397,7 +398,7 @@ impl FlightService for FlightServiceHandler { .context .remote_data_folder .metadata_manager - .table_metadata_manager + .delta_lake .table_names() .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 387bcceab..df4de9d79 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -94,7 +94,7 @@ impl Context { // Persist the new normal table to the Delta Lake. self.data_folders .local_data_folder - .table_metadata_manager + .delta_lake .save_normal_table_metadata(table_name) .await?; @@ -138,7 +138,7 @@ impl Context { // Persist the new time series table to the metadata Delta Lake. self.data_folders .local_data_folder - .table_metadata_manager + .delta_lake .save_time_series_table_metadata(time_series_table_metadata) .await?; @@ -159,7 +159,7 @@ impl Context { let table_names = self .data_folders .local_data_folder - .table_metadata_manager + .delta_lake .normal_table_names() .await?; @@ -208,7 +208,7 @@ impl Context { let time_series_table_metadata = self .data_folders .local_data_folder - .table_metadata_manager + .delta_lake .time_series_table_metadata() .await?; @@ -273,7 +273,7 @@ impl Context { // Drop the table metadata from the metadata Delta Lake. 
self.data_folders
             .local_data_folder
-            .table_metadata_manager
+            .delta_lake
             .drop_table_metadata(table_name)
             .await?;
 
@@ -503,7 +503,7 @@ mod tests {
         let time_series_table_metadata = context
             .data_folders
             .local_data_folder
-            .table_metadata_manager
+            .delta_lake
             .time_series_table_metadata()
             .await
             .unwrap();
@@ -605,14 +605,14 @@ mod tests {
 
         // The normal table should be deleted from the metadata Delta Lake.
         assert!(
-            !context
-                .data_folders
-                .local_data_folder
-                .table_metadata_manager
-                .is_normal_table(NORMAL_TABLE_NAME)
-                .await
-                .unwrap()
-        );
+            !context
+                .data_folders
+                .local_data_folder
+                .delta_lake
+                .is_normal_table(test::NORMAL_TABLE_NAME)
+                .await
+                .unwrap()
+        );
 
         // The normal table should be deleted from the Delta Lake.
         assert!(!temp_dir.path().join("tables").exists());
@@ -642,14 +642,14 @@ mod tests {
 
         // The time series table should be deleted from the metadata Delta Lake.
         assert!(
-            !context
-                .data_folders
-                .local_data_folder
-                .table_metadata_manager
-                .is_time_series_table(TIME_SERIES_TABLE_NAME)
-                .await
-                .unwrap()
-        );
+            !context
+                .data_folders
+                .local_data_folder
+                .delta_lake
+                .is_time_series_table(test::TIME_SERIES_TABLE_NAME)
+                .await
+                .unwrap()
+        );
 
         // The time series table should be deleted from the Delta Lake.
         assert!(!temp_dir.path().join("tables").exists());
@@ -686,12 +686,12 @@ mod tests {
 
         // The normal table should not be deleted from the metadata Delta Lake.
         assert!(
-            local_data_folder
-                .table_metadata_manager
-                .is_normal_table(NORMAL_TABLE_NAME)
-                .await
-                .unwrap()
-        );
+            local_data_folder
+                .delta_lake
+                .is_normal_table(test::NORMAL_TABLE_NAME)
+                .await
+                .unwrap()
+        );
 
         // The normal table data should be deleted from the Delta Lake.
         delta_table.load().await.unwrap();
@@ -719,12 +719,12 @@ mod tests {
 
         // The time series table should not be deleted from the metadata Delta Lake.
         assert!(
-            local_data_folder
-                .table_metadata_manager
-                .is_time_series_table(TIME_SERIES_TABLE_NAME)
-                .await
-                .unwrap()
-        );
+            local_data_folder
+                .delta_lake
+                .is_time_series_table(test::TIME_SERIES_TABLE_NAME)
+                .await
+                .unwrap()
+        );
 
         // The time series table data should be deleted from the Delta Lake.
         delta_table.load().await.unwrap();
diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs
index 19d69975c..fa1a6bd64 100644
--- a/crates/modelardb_server/src/data_folders.rs
+++ b/crates/modelardb_server/src/data_folders.rs
@@ -33,18 +33,15 @@ pub struct DataFolder {
     /// Delta Lake for storing metadata and data in Apache Parquet files.
     pub delta_lake: Arc<DeltaLake>,
-    /// Metadata manager for providing access to metadata related to tables.
-    pub table_metadata_manager: Arc<TableMetadataManager>,
 }
 
 impl DataFolder {
-    /// Return a [`DataFolder`] with a local [`DeltaLake`] and [`TableMetadataManager`] created from
-    /// `local_url`. If `local_url` is a folder that does not exist, it is created.
If `local_url` - /// could not be parsed, if the folder does not exist and could not be created, or if the - /// metadata tables could not be created, [`ModelarDbServerError`] is returned. + /// Return a [`DataFolder`] with a local [`DeltaLake`] created from `local_url`. If `local_url` + /// is a folder that does not exist, it is created. If `local_url` could not be parsed, if the + /// folder does not exist and could not be created, or if the metadata tables could not be + /// created, [`ModelarDbServerError`] is returned. pub async fn try_from_local_url(local_url: &str) -> Result { - let delta_lake = Arc::new(DeltaLake::try_from_local_url(local_url)?); - let table_metadata_manager = TableMetadataManager::try_new(delta_lake.clone()).await?; + let delta_lake = Arc::new(DeltaLake::try_from_local_url(local_url).await?); if local_url.starts_with("memory://") { warn!( @@ -53,10 +50,7 @@ impl DataFolder { ); }; - Ok(Self { - delta_lake, - table_metadata_manager: Arc::new(table_metadata_manager), - }) + Ok(Self { delta_lake }) } /// Return a [`DataFolder`] created from `storage_configuration`. If a connection could @@ -71,10 +65,7 @@ impl DataFolder { let remote_table_metadata_manager = TableMetadataManager::try_from_storage_configuration(storage_configuration).await?; - Ok(Self { - delta_lake, - table_metadata_manager: Arc::new(remote_table_metadata_manager), - }) + Ok(Self { delta_lake }) } } diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index 56f430640..f2c86dd9f 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -565,7 +565,7 @@ mod tests { let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .table_metadata_manager + .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index c96c0c468..66241993d 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -62,10 +62,7 @@ impl DataTransfer { remote_data_folder: DataFolder, transfer_batch_size_in_bytes: Option, ) -> Result { - let table_names = local_data_folder - .table_metadata_manager - .table_names() - .await?; + let table_names = local_data_folder.delta_lake.table_names().await?; // The size of tables is computed manually as datafusion_table_statistics() is not exact. let table_size_in_bytes = DashMap::with_capacity(table_names.len()); @@ -256,7 +253,7 @@ impl DataTransfer { // Write the data to the remote Delta Lake. if self .local_data_folder - .table_metadata_manager + .delta_lake .is_time_series_table(table_name) .await? 
{ @@ -505,7 +502,7 @@ mod tests { .unwrap(); local_data_folder - .table_metadata_manager + .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 9a3400f61..8491a7c72 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -1287,7 +1287,7 @@ mod tests { let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .table_metadata_manager + .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index c80893e11..f2ac77b29 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -20,13 +20,18 @@ use std::fs; use std::path::Path as StdPath; use std::sync::Arc; -use arrow::array::{ArrayRef, RecordBatch}; +use arrow::array::{ + ArrayRef, BinaryArray, BooleanArray, Float32Array, Int16Array, RecordBatch, StringArray, +}; use arrow::datatypes::{DataType, Field, Schema}; use chrono::TimeDelta; use dashmap::DashMap; use datafusion::catalog::TableProvider; +use datafusion::common::{DFSchema, ToDFSchema}; +use datafusion::logical_expr::lit; use datafusion::parquet::file::properties::WriterProperties; use datafusion::parquet::format::SortingColumn; +use datafusion::prelude::{SessionContext, col}; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; use deltalake::kernel::{Action, Add, StructField}; @@ -47,7 +52,17 @@ use url::Url; use uuid::Uuid; use crate::error::{ModelarDbStorageError, Result}; -use crate::{METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties}; +use crate::time_series_table_metadata::{GeneratedColumn, TimeSeriesTableMetadata}; +use crate::{ + METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, register_metadata_table, + sql_and_concat, try_convert_bytes_to_schema, try_convert_schema_to_bytes, +}; + +/// Types of tables supported by ModelarDB. +enum TableType { + NormalTable, + TimeSeriesTable, +} /// Functionality for managing Delta Lake tables in a local folder or an object store. pub struct DeltaLake { @@ -59,18 +74,21 @@ pub struct DeltaLake { object_store: Arc, /// Cache of Delta tables to avoid opening the same table multiple times. delta_table_cache: DashMap, + /// Session context used to query the tables using Apache DataFusion. + session_context: Arc, } impl DeltaLake { /// Create a new [`DeltaLake`] that manages the Delta tables at `local_url`. If `local_url` has /// the schema `file` or no schema, the Delta tables are managed in a local data folder. If /// `local_url` has the schema `memory`, the Delta tables are managed in memory. Return - /// [`ModelarDbStorageError`] if `local_url` cannot be parsed. - pub fn try_from_local_url(local_url: &str) -> Result { + /// [`ModelarDbStorageError`] if `local_url` cannot be parsed or the metadata tables cannot be + /// created. 
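    // A minimal usage sketch of the three URL forms this constructor accepts, matching
    // the match arms below; the concrete paths are illustrative rather than taken from
    // the patch:
    //
    //     DeltaLake::try_from_local_url("/tmp/modelardb").await?;         // no scheme
    //     DeltaLake::try_from_local_url("file:///tmp/modelardb").await?;  // "file" scheme
    //     DeltaLake::try_from_local_url("memory://modelardb").await?;     // in-memory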
+ pub async fn try_from_local_url(local_url: &str) -> Result { match local_url.split_once("://") { - None => Self::try_from_local_path(StdPath::new(local_url)), - Some(("file", local_path)) => Self::try_from_local_path(StdPath::new(local_path)), - Some(("memory", _)) => Ok(Self::new_in_memory()), + None => Self::try_from_local_path(StdPath::new(local_url)).await, + Some(("file", local_path)) => Self::try_from_local_path(StdPath::new(local_path)).await, + Some(("memory", _)) => Self::new_in_memory().await, _ => Err(ModelarDbStorageError::InvalidArgument(format!( "{local_url} is not a valid local URL." ))), @@ -84,12 +102,18 @@ impl DeltaLake { storage_options: HashMap::new(), object_store: Arc::new(InMemory::new()), delta_table_cache: DashMap::new(), - } + session_context: Arc::new(SessionContext::new()), + }; + + delta_lake.create_and_register_metadata_tables().await?; + + Ok(delta_lake) } /// Create a new [`DeltaLake`] that manages the Delta tables in `data_folder_path`. Returns a - /// [`ModelarDbStorageError`] if `data_folder_path` does not exist and could not be created. - pub fn try_from_local_path(data_folder_path: &StdPath) -> Result { + /// [`ModelarDbStorageError`] if `data_folder_path` does not exist and could not be created or + /// the metadata tables cannot be created. + pub async fn try_from_local_path(data_folder_path: &StdPath) -> Result { // Ensure the directories in the path exists as LocalFileSystem otherwise returns an error. fs::create_dir_all(data_folder_path) .map_err(|error| DeltaTableError::generic(error.to_string()))?; @@ -104,12 +128,17 @@ impl DeltaLake { .ok_or_else(|| DeltaTableError::generic("Local data folder path is not UTF-8."))? .to_owned(); - Ok(Self { + let delta_lake = Self { location, storage_options: HashMap::new(), object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - }) + session_context: Arc::new(SessionContext::new()), + }; + + delta_lake.create_and_register_metadata_tables().await?; + + Ok(delta_lake) } /// Create a new [`DeltaLake`] that manages Delta tables in the remote object store given by @@ -144,8 +173,8 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an /// S3-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object store - /// could not be made. - pub fn try_from_s3_configuration( + /// could not be made or the metadata tables cannot be created. + pub async fn try_from_s3_configuration( endpoint: String, bucket_name: String, access_key_id: String, @@ -179,18 +208,23 @@ impl DeltaLake { ) .build()?; - Ok(DeltaLake { + let delta_lake = DeltaLake { location, storage_options, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - }) + session_context: Arc::new(SessionContext::new()), + }; + + delta_lake.create_and_register_metadata_tables().await?; + + Ok(delta_lake) } /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an /// Azure-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object - /// store could not be made. - pub fn try_from_azure_configuration( + /// store could not be made or the metadata tables cannot be created. 
+ pub async fn try_from_azure_configuration( account_name: String, access_key: String, container_name: String, @@ -207,12 +241,81 @@ impl DeltaLake { .map_err(|error| ModelarDbStorageError::InvalidArgument(error.to_string()))?; let (object_store, _path) = object_store::parse_url_opts(&url, &storage_options)?; - Ok(DeltaLake { + let delta_lake = DeltaLake { location, storage_options, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - }) + session_context: Arc::new(SessionContext::new()), + }; + + delta_lake.create_and_register_metadata_tables().await?; + + Ok(delta_lake) + } + + /// If they do not already exist, create the tables in the metadata Delta Lake for normal table + /// and time series table metadata and register them with the Apache DataFusion session context. + /// * The `normal_table_metadata` table contains the metadata for normal tables. + /// * The `time_series_table_metadata` table contains the main metadata for time series tables. + /// * The `time_series_table_field_columns` table contains the name, index, error bound value, + /// whether error bound is relative, and generation expression of the field columns in each + /// time series table. + /// + /// If the tables exist or were created, return [`Ok`], otherwise return + /// [`ModelarDbStorageError`]. + async fn create_and_register_metadata_tables(&self) -> Result<()> { + // Create and register the normal_table_metadata table if it does not exist. + let delta_table = self + .create_metadata_table( + "normal_table_metadata", + &Schema::new(vec![Field::new("table_name", DataType::Utf8, false)]), + ) + .await?; + + register_metadata_table(&self.session_context, "normal_table_metadata", delta_table)?; + + // Create and register the time_series_table_metadata table if it does not exist. + let delta_table = self + .create_metadata_table( + "time_series_table_metadata", + &Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("query_schema", DataType::Binary, false), + ]), + ) + .await?; + + register_metadata_table( + &self.session_context, + "time_series_table_metadata", + delta_table, + )?; + + // Create and register the time_series_table_field_columns table if it does not exist. Note + // that column_index will only use a maximum of 10 bits. generated_column_expr is NULL if + // the fields are stored as segments. + let delta_table = self + .create_metadata_table( + "time_series_table_field_columns", + &Schema::new(vec![ + Field::new("table_name", DataType::Utf8, false), + Field::new("column_name", DataType::Utf8, false), + Field::new("column_index", DataType::Int16, false), + Field::new("error_bound_value", DataType::Float32, false), + Field::new("error_bound_is_relative", DataType::Boolean, false), + Field::new("generated_column_expr", DataType::Utf8, true), + ]), + ) + .await?; + + register_metadata_table( + &self.session_context, + "time_series_table_field_columns", + delta_table, + )?; + + Ok(()) } /// Return an [`ObjectStore`] to access the root of the Delta Lake. @@ -279,6 +382,63 @@ impl DeltaLake { } } + /// Return `true` if the table with `table_name` is a normal table, otherwise return `false`. + pub async fn is_normal_table(&self, table_name: &str) -> Result { + Ok(self + .normal_table_names() + .await? + .contains(&table_name.to_owned())) + } + + /// Return `true` if the table with `table_name` is a time series table, otherwise return `false`. 
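    // To make the layout of the metadata tables created above concrete, a sketch of the
    // rows a single time series table with two stored field columns might produce; the
    // table name, column names, and error bounds are illustrative:
    //
    //     normal_table_metadata:           (no row, the table is not a normal table)
    //     time_series_table_metadata:      ("wind_turbine", <query_schema serialized to bytes>)
    //     time_series_table_field_columns: ("wind_turbine", "wind_speed", 1, 5.0, true, NULL)
    //                                      ("wind_turbine", "power",      2, 1.0, false, NULL)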
+ pub async fn is_time_series_table(&self, table_name: &str) -> Result { + Ok(self + .time_series_table_names() + .await? + .contains(&table_name.to_owned())) + } + + /// Return the name of each table currently in the metadata Delta Lake. If the table names + /// cannot be retrieved, [`ModelarDbStorageError`] is returned. + pub async fn table_names(&self) -> Result> { + let normal_table_names = self.normal_table_names().await?; + let time_series_table_names = self.time_series_table_names().await?; + + let mut table_names = normal_table_names; + table_names.extend(time_series_table_names); + + Ok(table_names) + } + + /// Return the name of each normal table currently in the metadata Delta Lake. Note that this + /// does not include time series tables. If the normal table names cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. + pub async fn normal_table_names(&self) -> Result> { + self.table_names_of_type(TableType::NormalTable).await + } + + /// Return the name of each time series table currently in the metadata Delta Lake. Note that + /// this does not include normal tables. If the time series table names cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. + pub async fn time_series_table_names(&self) -> Result> { + self.table_names_of_type(TableType::TimeSeriesTable).await + } + + /// Return the name of tables of `table_type`. Returns [`ModelarDbStorageError`] if the table + /// names cannot be retrieved. + async fn table_names_of_type(&self, table_type: TableType) -> Result> { + let table_type = match table_type { + TableType::NormalTable => "normal_table", + TableType::TimeSeriesTable => "time_series_table", + }; + + let sql = format!("SELECT table_name FROM {table_type}_metadata"); + let batch = sql_and_concat(&self.session_context, &sql).await?; + + let table_names = modelardb_types::array!(batch, 0, StringArray); + Ok(table_names.iter().flatten().map(str::to_owned).collect()) + } + /// Return a [`DeltaTableWriter`] for writing to the time series table with `delta_table` in the /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be /// established or the table does not exist. @@ -335,6 +495,11 @@ impl DeltaLake { .await } + /// Return the location of the metadata table with `table_name`. + fn location_of_metadata_table(&self, table_name: &str) -> String { + format!("{}/{METADATA_FOLDER}/{table_name}", self.location) + } + /// Create a Delta Lake table for a normal table with `table_name` and `schema` if it does not /// already exist. If the normal table could not be created, e.g., because it already exists, /// [`ModelarDbStorageError`] is returned. @@ -370,6 +535,11 @@ impl DeltaLake { .await } + /// Return the location of the compressed time series or normal table with `table_name`. + fn location_of_compressed_table(&self, table_name: &str) -> String { + format!("{}/{TABLE_FOLDER}/{table_name}", self.location) + } + /// Create a Delta Lake table with `table_name`, `schema`, and `partition_columns` if it does /// not already exist. Returns [`DeltaTable`] if the table could be created and /// [`ModelarDbStorageError`] if it could not. @@ -556,14 +726,214 @@ impl DeltaLake { } } - /// Return the location of the compressed time series or normal table with `table_name`. 
- fn location_of_compressed_table(&self, table_name: &str) -> String { - format!("{}/{TABLE_FOLDER}/{table_name}", self.location) + /// Depending on the type of the table with `table_name`, drop either the normal table + /// metadata or the time series table metadata from the metadata Delta Lake. If the table does + /// not exist or the metadata could not be dropped, [`ModelarDbStorageError`] is returned. + pub async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { + if self.is_normal_table(table_name).await? { + self.drop_normal_table_metadata(table_name).await + } else if self.is_time_series_table(table_name).await? { + self.drop_time_series_table_metadata(table_name).await + } else { + Err(ModelarDbStorageError::InvalidArgument(format!( + "Table with name '{table_name}' does not exist." + ))) + } } - /// Return the location of the metadata table with `table_name`. - fn location_of_metadata_table(&self, table_name: &str) -> String { - format!("{}/{METADATA_FOLDER}/{table_name}", self.location) + /// Drop the metadata for the normal table with `table_name` from the `normal_table_metadata` + /// table in the metadata Delta Lake. If the metadata could not be dropped, + /// [`ModelarDbStorageError`] is returned. + async fn drop_normal_table_metadata(&self, table_name: &str) -> Result<()> { + let delta_ops = self.metadata_delta_ops("normal_table_metadata").await?; + + delta_ops + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + + Ok(()) + } + + /// Drop the metadata for the time series table with `table_name` from the metadata Delta Lake. + /// This includes deleting a row from the `time_series_table_metadata` table and deleting a row + /// from the `time_series_table_field_columns` table for each field column. If the metadata + /// could not be dropped, [`ModelarDbStorageError`] is returned. + async fn drop_time_series_table_metadata(&self, table_name: &str) -> Result<()> { + // Delete the table metadata from the time_series_table_metadata table. + self.metadata_delta_ops("time_series_table_metadata") + .await? + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + + // Delete the column metadata from the time_series_table_field_columns table. + self.metadata_delta_ops("time_series_table_field_columns") + .await? + .delete() + .with_predicate(col("table_name").eq(lit(table_name))) + .await?; + + Ok(()) + } + + /// Return the [`TimeSeriesTableMetadata`] of each time series table currently in the metadata + /// Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. 
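    // A sketch of what the drop helpers above remove, assuming a time series table named
    // "wind_turbine" with two field columns (the name and column count are illustrative):
    //
    //     drop_table_metadata("wind_turbine")
    //         -> deletes one row from time_series_table_metadata
    //         -> deletes both of its rows from time_series_table_field_columns
    //
    // For a normal table, only the matching row in normal_table_metadata is deleted.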
+ pub async fn time_series_table_metadata(&self) -> Result>> { + let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; + let batch = sql_and_concat(&self.session_context, sql).await?; + + let mut time_series_table_metadata: Vec> = vec![]; + let table_name_array = modelardb_types::array!(batch, 0, StringArray); + let query_schema_bytes_array = modelardb_types::array!(batch, 1, BinaryArray); + + for row_index in 0..batch.num_rows() { + let table_name = table_name_array.value(row_index); + let query_schema_bytes = query_schema_bytes_array.value(row_index); + + let metadata = self + .time_series_table_metadata_row_to_time_series_table_metadata( + table_name, + query_schema_bytes, + ) + .await?; + + time_series_table_metadata.push(Arc::new(metadata)) + } + + Ok(time_series_table_metadata) + } + + /// Return the [`TimeSeriesTableMetadata`] for the time series table with `table_name` in the + /// metadata Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, + /// [`ModelarDbStorageError`] is returned. + pub async fn time_series_table_metadata_for_time_series_table( + &self, + table_name: &str, + ) -> Result { + let sql = format!( + "SELECT table_name, query_schema FROM time_series_table_metadata WHERE table_name = '{table_name}'" + ); + let batch = sql_and_concat(&self.session_context, &sql).await?; + + if batch.num_rows() == 0 { + return Err(ModelarDbStorageError::InvalidArgument(format!( + "No metadata for time series table named '{table_name}'." + ))); + } + + let table_name_array = modelardb_types::array!(batch, 0, StringArray); + let query_schema_bytes_array = modelardb_types::array!(batch, 1, BinaryArray); + + let table_name = table_name_array.value(0); + let query_schema_bytes = query_schema_bytes_array.value(0); + + self.time_series_table_metadata_row_to_time_series_table_metadata( + table_name, + query_schema_bytes, + ) + .await + } + + /// Convert a row from the table "time_series_table_metadata" to an instance of + /// [`TimeSeriesTableMetadata`]. Returns [`ModelarDbStorageError`] if a time_series table with + /// `table_name` does not exist or the bytes in `query_schema_bytes` are not a valid schema. + async fn time_series_table_metadata_row_to_time_series_table_metadata( + &self, + table_name: &str, + query_schema_bytes: &[u8], + ) -> Result { + let query_schema = try_convert_bytes_to_schema(query_schema_bytes.into())?; + + let error_bounds = self + .error_bounds(table_name, query_schema.fields().len()) + .await?; + + let df_query_schema = query_schema.clone().to_dfschema()?; + let generated_columns = self.generated_columns(table_name, &df_query_schema).await?; + + TimeSeriesTableMetadata::try_new( + table_name.to_owned(), + Arc::new(query_schema), + error_bounds, + generated_columns, + ) + } + + /// Return the error bounds for the columns in the time series table with `table_name`. If a + /// time series table with `table_name` does not exist, [`ModelarDbStorageError`] is returned. 
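    // A sketch of how the rows read below are mapped back into per-column error bounds,
    // assuming column index 2 was saved with error_bound_value = 5.0 and
    // error_bound_is_relative = false (the values are illustrative):
    //
    //     column_to_error_bound[2] = ErrorBound::try_new_absolute(5.0)?;
    //
    // Columns without a row in time_series_table_field_columns keep the default
    // ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).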
+ async fn error_bounds( + &self, + table_name: &str, + query_schema_columns: usize, + ) -> Result> { + let sql = format!( + "SELECT column_index, error_bound_value, error_bound_is_relative + FROM time_series_table_field_columns + WHERE table_name = '{table_name}' + ORDER BY column_index" + ); + let batch = sql_and_concat(&self.session_context, &sql).await?; + + let mut column_to_error_bound = + vec![ErrorBound::try_new_absolute(ERROR_BOUND_ZERO)?; query_schema_columns]; + + let column_index_array = modelardb_types::array!(batch, 0, Int16Array); + let error_bound_value_array = modelardb_types::array!(batch, 1, Float32Array); + let error_bound_is_relative_array = modelardb_types::array!(batch, 2, BooleanArray); + + for row_index in 0..batch.num_rows() { + let error_bound_index = column_index_array.value(row_index); + let error_bound_value = error_bound_value_array.value(row_index); + let error_bound_is_relative = error_bound_is_relative_array.value(row_index); + + let error_bound = if error_bound_is_relative { + ErrorBound::try_new_relative(error_bound_value) + } else { + ErrorBound::try_new_absolute(error_bound_value) + }?; + + column_to_error_bound[error_bound_index as usize] = error_bound; + } + + Ok(column_to_error_bound) + } + + /// Return the generated columns for the time series table with `table_name` and `df_schema`. If + /// a time series table with `table_name` does not exist, [`ModelarDbStorageError`] is returned. + async fn generated_columns( + &self, + table_name: &str, + df_schema: &DFSchema, + ) -> Result>> { + let sql = format!( + "SELECT column_index, generated_column_expr + FROM time_series_table_field_columns + WHERE table_name = '{table_name}' + ORDER BY column_index" + ); + let batch = sql_and_concat(&self.session_context, &sql).await?; + + let mut generated_columns = vec![None; df_schema.fields().len()]; + + let column_index_array = modelardb_types::array!(batch, 0, Int16Array); + let generated_column_expr_array = modelardb_types::array!(batch, 1, StringArray); + + for row_index in 0..batch.num_rows() { + let generated_column_index = column_index_array.value(row_index); + let generated_column_expr = generated_column_expr_array.value(row_index); + + // If generated_column_expr is null, it is saved as an empty string in the column values. + if !generated_column_expr.is_empty() { + let generated_column = + GeneratedColumn::try_from_sql_expr(generated_column_expr, df_schema)?; + + generated_columns[generated_column_index as usize] = Some(generated_column); + } + } + + Ok(generated_columns) } } @@ -722,3 +1092,412 @@ async fn delete_added_files(object_store: &dyn ObjectStore, added_files: Vec>) + ); + } + + #[tokio::test] + async fn test_drop_normal_table_metadata() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + + delta_lake + .drop_table_metadata("normal_table_2") + .await + .unwrap(); + + // Verify that normal_table_2 was deleted from the normal_table_metadata table. + let sql = "SELECT table_name FROM normal_table_metadata"; + let batch = sql_and_concat(&delta_lake.session_context, sql) + .await + .unwrap(); + + assert_eq!(**batch.column(0), StringArray::from(vec!["normal_table_1"])); + } + + #[tokio::test] + async fn test_drop_time_series_table_metadata() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + + delta_lake + .drop_table_metadata(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap(); + + // Verify that the time series table was deleted from the time_series_table_metadata table. 
+ let sql = "SELECT table_name FROM time_series_table_metadata"; + let batch = sql_and_concat(&delta_lake.session_context, sql) + .await + .unwrap(); + + assert_eq!(batch.num_rows(), 0); + + // Verify that the field columns were deleted from the time_series_table_field_columns table. + let sql = "SELECT table_name FROM time_series_table_field_columns"; + let batch = sql_and_concat(&delta_lake.session_context, sql) + .await + .unwrap(); + + assert_eq!(batch.num_rows(), 0); + } + + #[tokio::test] + async fn test_drop_table_metadata_for_missing_table() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + + assert!( + delta_lake + .drop_table_metadata("missing_table") + .await + .is_err() + ); + } + + async fn create_delta_lake_and_save_normal_tables() -> (TempDir, DeltaLake) { + let temp_dir = tempfile::tempdir().unwrap(); + let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + .await + .unwrap(); + + delta_lake + .save_normal_table_metadata("normal_table_1") + .await + .unwrap(); + + delta_lake + .save_normal_table_metadata("normal_table_2") + .await + .unwrap(); + + (temp_dir, delta_lake) + } + + #[tokio::test] + async fn test_time_series_table_metadata() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + + let time_series_table_metadata = delta_lake.time_series_table_metadata().await.unwrap(); + + assert_eq!( + time_series_table_metadata.first().unwrap().name, + test::time_series_table_metadata().name, + ); + } + + #[tokio::test] + async fn test_time_series_table_metadata_for_existing_time_series_table() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + + let time_series_table_metadata = delta_lake + .time_series_table_metadata_for_time_series_table(test::TIME_SERIES_TABLE_NAME) + .await + .unwrap(); + + assert_eq!( + time_series_table_metadata.name, + test::time_series_table_metadata().name, + ); + } + + #[tokio::test] + async fn test_time_series_table_metadata_for_missing_time_series_table() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + + let time_series_table_metadata = delta_lake + .time_series_table_metadata_for_time_series_table("missing_table") + .await; + + assert!(time_series_table_metadata.is_err()); + } + + #[tokio::test] + async fn test_error_bound() { + let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + + let error_bounds = delta_lake + .error_bounds(test::TIME_SERIES_TABLE_NAME, 4) + .await + .unwrap(); + + let values: Vec = error_bounds + .iter() + .map(|error_bound| match error_bound { + ErrorBound::Absolute(value) => *value, + ErrorBound::Relative(value) => *value, + }) + .collect(); + + assert_eq!(values, &[0.0, 1.0, 5.0, 0.0]); + } + + #[tokio::test] + async fn test_generated_columns() { + let temp_dir = tempfile::tempdir().unwrap(); + let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + .await + .unwrap(); + + let query_schema = Arc::new(Schema::new(vec![ + Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), + Field::new("field_1", ArrowValue::DATA_TYPE, false), + Field::new("field_2", ArrowValue::DATA_TYPE, false), + Field::new("tag", DataType::Utf8, false), + Field::new("generated_column_1", ArrowValue::DATA_TYPE, false), + Field::new("generated_column_2", ArrowValue::DATA_TYPE, false), + ])); + + let error_bounds = vec![ + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(); + query_schema.fields.len() + ]; + + let plus_one_column = 
Some(GeneratedColumn { + expr: col("field_1") + Literal(Int64(Some(1))), + source_columns: vec![1], + original_expr: "field_1 + 1".to_owned(), + }); + + let addition_column = Some(GeneratedColumn { + expr: col("field_1") + col("field_2"), + source_columns: vec![1, 2], + original_expr: "field_1 + field_2".to_owned(), + }); + + let expected_generated_columns = + vec![None, None, None, None, plus_one_column, addition_column]; + + let time_series_table_metadata = TimeSeriesTableMetadata::try_new( + "generated_columns_table".to_owned(), + query_schema, + error_bounds, + expected_generated_columns.clone(), + ) + .unwrap(); + + delta_lake + .save_time_series_table_metadata(&time_series_table_metadata) + .await + .unwrap(); + + let df_schema = time_series_table_metadata + .query_schema + .to_dfschema() + .unwrap(); + let generated_columns = delta_lake + .generated_columns("generated_columns_table", &df_schema) + .await + .unwrap(); + + assert_eq!( + generated_columns[0..generated_columns.len() - 1], + expected_generated_columns[0..expected_generated_columns.len() - 1] + ); + + // Sort the source columns to ensure the order is consistent. + let mut last_generated_column = generated_columns.last().unwrap().clone().unwrap(); + last_generated_column.source_columns.sort(); + + assert_eq!( + &Some(last_generated_column), + expected_generated_columns.last().unwrap() + ); + } + + async fn create_delta_lake_and_save_time_series_table() -> (TempDir, DeltaLake) { + let temp_dir = tempfile::tempdir().unwrap(); + let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + .await + .unwrap(); + + // Save a time series table to the metadata Delta Lake. + let time_series_table_metadata = test::time_series_table_metadata(); + delta_lake + .save_time_series_table_metadata(&time_series_table_metadata) + .await + .unwrap(); + + (temp_dir, delta_lake) + } +} diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 12facfa4b..5380a41d0 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -18,10 +18,10 @@ pub mod delta_lake; pub mod error; -pub mod metadata; mod optimizer; pub mod parser; mod query; +pub mod time_series_table_metadata; use std::result::Result as StdResult; use std::sync::Arc; diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 50d0e52dc..3f386c211 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -777,7 +777,9 @@ mod tests { ) -> Arc { // Setup access to data and metadata in data folder. let data_folder_path = temp_dir.path(); - let delta_lake = DeltaLake::try_from_local_path(data_folder_path).unwrap(); + let delta_lake = DeltaLake::try_from_local_path(data_folder_path) + .await + .unwrap(); // Setup access to Apache DataFusion. 
let mut session_state_builder = SessionStateBuilder::new().with_default_features(); diff --git a/crates/modelardb_storage/src/parser.rs b/crates/modelardb_storage/src/parser.rs index 8e2644231..1ae48d4ba 100644 --- a/crates/modelardb_storage/src/parser.rs +++ b/crates/modelardb_storage/src/parser.rs @@ -50,6 +50,7 @@ use sqlparser::parser::{Parser, ParserError}; use sqlparser::tokenizer::{Span, Token}; use crate::error::{ModelarDbStorageError, Result}; +use crate::time_series_table_metadata::{GeneratedColumn, TimeSeriesTableMetadata}; /// A top-level statement (CREATE, INSERT, SELECT, TRUNCATE, DROP, VACUUM etc.) that has been /// tokenized, parsed, and for which semantic checks have verified that it is compatible with diff --git a/crates/modelardb_storage/src/query/time_series_table.rs b/crates/modelardb_storage/src/query/time_series_table.rs index afa742479..cb0eacf63 100644 --- a/crates/modelardb_storage/src/query/time_series_table.rs +++ b/crates/modelardb_storage/src/query/time_series_table.rs @@ -53,6 +53,7 @@ use modelardb_types::types::{ArrowTimestamp, ArrowValue, TimeSeriesTableMetadata use crate::query::generated_as_exec::{ColumnToGenerate, GeneratedAsExec}; use crate::query::grid_exec::GridExec; use crate::query::sorted_join_exec::{SortedJoinColumnType, SortedJoinExec}; +use crate::time_series_table_metadata::TimeSeriesTableMetadata; /// A queryable representation of a time series table which stores multivariate time series as segments /// containing metadata and models. [`TimeSeriesTable`] implements [`TableProvider`] so it can be diff --git a/crates/modelardb_storage/src/time_series_table_metadata.rs b/crates/modelardb_storage/src/time_series_table_metadata.rs new file mode 100644 index 000000000..f43fe4f73 --- /dev/null +++ b/crates/modelardb_storage/src/time_series_table_metadata.rs @@ -0,0 +1,559 @@ +/* Copyright 2022 The ModelarDB Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Implementation of the type containing the metadata required to read from and +//! write to a time series table. + +use std::result::Result as StdResult; +use std::sync::Arc; + +use arrow::array::StringArray; +use arrow::record_batch::RecordBatch; +use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Schema}; +use datafusion::common::DFSchema; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::expr::Expr; +use modelardb_types::schemas::COMPRESSED_SCHEMA; +use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound, TimestampArray, ValueArray}; + +use crate::error::{ModelarDbStorageError, Result}; +use crate::parser::tokenize_and_parse_sql_expression; + +/// Metadata required to ingest data into a time series table and query a time series table. +#[derive(Debug, Clone)] +pub struct TimeSeriesTableMetadata { + /// Name of the time series table. + pub name: String, + /// Index of the timestamp column in `schema`. + pub timestamp_column_index: usize, + /// Indices of the field columns in `schema`. 
+    pub field_column_indices: Vec<usize>,
+    /// Indices of the tag columns in `schema`.
+    pub tag_column_indices: Vec<usize>,
+    /// Error bounds of the columns in `schema`. They can only be non-zero for field columns.
+    pub error_bounds: Vec<ErrorBound>,
+    /// Expressions to create generated columns in the `query_schema`. Only field columns can be
+    /// generated by [`Expr`], so [`None`] is stored for timestamp, tag, and stored field columns.
+    pub generated_columns: Vec<Option<GeneratedColumn>>,
+    /// Schema of the data that can be written to the time series table.
+    pub schema: Arc<Schema>,
+    /// Schema of the data that can be read from the time series table.
+    pub query_schema: Arc<Schema>,
+    /// Projection that changes `query_schema` to `schema`.
+    pub query_schema_to_schema: Vec<usize>,
+    /// Schema of the compressed segments that are stored in the time series table.
+    pub compressed_schema: Arc<Schema>,
+}
+
+impl TimeSeriesTableMetadata {
+    /// Create metadata for a new time series table. If any of the following conditions are true,
+    /// [`ModelarDbStorageError`] is returned:
+    /// * The number of error bounds does not match the number of columns.
+    /// * The number of potentially generated columns does not match the number of columns.
+    /// * A generated column includes another generated column in its expression.
+    /// * There are more than 32767 columns.
+    /// * The `query_schema` does not include a single timestamp column.
+    /// * The `query_schema` does not include at least one stored field column.
+    pub fn try_new(
+        name: String,
+        query_schema: Arc<Schema>,
+        error_bounds: Vec<ErrorBound>,
+        generated_columns: Vec<Option<GeneratedColumn>>,
+    ) -> Result<Self> {
+        // If an error bound is not defined for each column, return an error.
+        if query_schema.fields().len() != error_bounds.len() {
+            return Err(ModelarDbStorageError::InvalidArgument(
+                "An error bound must be defined for each column.".to_owned(),
+            ));
+        }
+
+        // If a generated column or None is not defined for each column, return an error.
+        if query_schema.fields().len() != generated_columns.len() {
+            return Err(ModelarDbStorageError::InvalidArgument(
+                "A generated column or None must be defined for each column.".to_owned(),
+            ));
+        }
+
+        // If a generated field column depends on other generated field columns, return an error.
+        for generated_column in generated_columns.iter().flatten() {
+            for source_column in &generated_column.source_columns {
+                if generated_columns[*source_column].is_some() {
+                    return Err(ModelarDbStorageError::InvalidArgument(
+                        "A generated field column cannot depend on generated field columns."
+                            .to_owned(),
+                    ));
+                }
+            }
+        }
+
+        // If there are more than 32767 columns, return an error. This limitation is necessary since
+        // 16 bits are used for the field column index in the compressed segments.
+        if query_schema.fields.len() > 32767 {
+            return Err(ModelarDbStorageError::InvalidArgument(
+                "There cannot be more than 32767 columns in the time series table.".to_owned(),
+            ));
+        }
+
+        // Remove the generated field columns from the query schema and the error bounds as these
+        // columns should never be provided when inserting data points into the time series table.
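+        // For example, if `query_schema` is [timestamp, field_1, field_2 GENERATED, tag], the
+        // resulting `schema` is [timestamp, field_1, tag] and `query_schema_to_schema` is [0, 1, 3].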
+ let mut fields_without_generated = Vec::with_capacity(query_schema.fields().len()); + let mut field_indices_without_generated = Vec::with_capacity(query_schema.fields().len()); + let mut error_bounds_without_generated = Vec::with_capacity(error_bounds.len()); + for (index, generated_column) in generated_columns.iter().enumerate() { + if generated_column.is_none() { + fields_without_generated.push(query_schema.fields[index].clone()); + field_indices_without_generated.push(index); + error_bounds_without_generated.push(error_bounds[index]); + } + } + + let schema_without_generated = + if query_schema.fields.len() != fields_without_generated.len() { + Arc::new(Schema::new(fields_without_generated)) + } else { + query_schema.clone() + }; + + // A time series table must only contain one stored timestamp column, one or more stored + // field columns, zero or more generated field columns, and zero or more stored tag columns. + let timestamp_column_indices = compute_indices_of_columns_with_data_type( + &schema_without_generated, + ArrowTimestamp::DATA_TYPE, + ); + + if timestamp_column_indices.len() != 1 { + return Err(ModelarDbStorageError::InvalidArgument( + "There needs to be exactly one timestamp column.".to_owned(), + )); + } + + let field_column_indices = compute_indices_of_columns_with_data_type( + &schema_without_generated, + ArrowValue::DATA_TYPE, + ); + + if field_column_indices.is_empty() { + return Err(ModelarDbStorageError::InvalidArgument( + "There needs to be at least one field column.".to_owned(), + )); + } + + let tag_column_indices = + compute_indices_of_columns_with_data_type(&schema_without_generated, DataType::Utf8); + + // Add the tag columns to the base schema for compressed segments. + let mut compressed_schema_fields = + Vec::with_capacity(COMPRESSED_SCHEMA.0.fields().len() + tag_column_indices.len()); + compressed_schema_fields.extend(COMPRESSED_SCHEMA.0.fields.clone().to_vec()); + + for index in &tag_column_indices { + compressed_schema_fields.push(Arc::new(schema_without_generated.field(*index).clone())); + } + + let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); + + Ok(Self { + name, + timestamp_column_index: timestamp_column_indices[0], + field_column_indices, + tag_column_indices, + error_bounds: error_bounds_without_generated, + generated_columns, + schema: schema_without_generated, + query_schema, + query_schema_to_schema: field_indices_without_generated, + compressed_schema, + }) + } + + /// Return `true` if the column at `index` is the timestamp column. + pub fn is_timestamp(&self, index: usize) -> bool { + index == self.timestamp_column_index + } + + /// Return `true` if the column at `index` is a tag column. + pub fn is_tag(&self, index: usize) -> bool { + self.tag_column_indices.contains(&index) + } + + /// Return the column arrays for the timestamp, field, and tag columns in `record_batch`. If + /// `record_batch` does not contain the required columns, return [`ModelarDbStorageError`]. 
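+    /// The field and tag arrays are returned in the order given by `field_column_indices` and
+    /// `tag_column_indices`.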
+ pub fn column_arrays<'a>( + &self, + record_batch: &'a RecordBatch, + ) -> Result<( + &'a TimestampArray, + Vec<&'a ValueArray>, + Vec<&'a StringArray>, + )> { + if record_batch.schema() != self.schema { + return Err(ModelarDbStorageError::InvalidArgument( + "The record batch does not match the schema of the time series table.".to_owned(), + )); + } + + let timestamp_column_array = + modelardb_types::array!(record_batch, self.timestamp_column_index, TimestampArray); + + let field_column_arrays: Vec<_> = self + .field_column_indices + .iter() + .map(|index| modelardb_types::array!(record_batch, *index, ValueArray)) + .collect(); + + let tag_column_arrays: Vec<_> = self + .tag_column_indices + .iter() + .map(|index| modelardb_types::array!(record_batch, *index, StringArray)) + .collect(); + + Ok(( + timestamp_column_array, + field_column_arrays, + tag_column_arrays, + )) + } +} + +/// Compute the indices of all columns in `schema` with `data_type`. +fn compute_indices_of_columns_with_data_type(schema: &Schema, data_type: DataType) -> Vec { + let fields = schema.fields(); + (0..fields.len()) + .filter(|index| *fields[*index].data_type() == data_type) + .collect() +} + +/// Column that is generated by a [`Expr`] using zero or more stored columns as input. +#[derive(Clone, Debug, PartialEq)] +pub struct GeneratedColumn { + /// Logical expression that computes the values of the column. + pub expr: Expr, + /// Indices of the stored columns used by `expr` to compute the column's values. + pub source_columns: Vec, + /// Original representation of `expr`. It is copied from the SQL statement, so it can be stored + /// in the metadata Delta Lake as `expr` does not implement serialization and deserialization. + pub original_expr: String, +} + +impl GeneratedColumn { + /// Create a [`GeneratedColumn`] from a SQL expression and a [`DFSchema`]. If the SQL expression + /// is not valid or refers to columns that are not in the [`DFSchema`], + /// a [`ModelarDbStorageError`] is returned. + pub fn try_from_sql_expr(sql_expr: &str, df_schema: &DFSchema) -> Result { + let expr = tokenize_and_parse_sql_expression(sql_expr, df_schema)?; + + let source_columns: StdResult, DataFusionError> = expr + .column_refs() + .iter() + .map(|column| df_schema.index_of_column(column)) + .collect(); + + Ok(Self { + expr, + source_columns: source_columns?, + original_expr: sql_expr.to_owned(), + }) + } +} + +#[cfg(test)] +mod test { + use super::*; + + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::ToDFSchema; + use datafusion::logical_expr::col; + use modelardb_common::test::ERROR_BOUND_ZERO; + + use crate::test; + + // Tests for TimeSeriesTableMetadata. 
+ #[test] + fn test_can_create_time_series_table_metadata() { + let (query_schema, error_bounds, generated_columns) = + time_series_table_schema_error_bounds_and_generated_columns(); + let result = TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + query_schema, + error_bounds, + generated_columns, + ); + + assert!(result.is_ok()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_invalid_timestamp_type() { + let schema = Schema::new(vec![ + Field::new("tag", DataType::Utf8, false), + Field::new("timestamp", DataType::UInt8, false), + Field::new("value", ArrowValue::DATA_TYPE, false), + ]); + + let result = create_simple_time_series_table_metadata(schema); + assert!(result.is_err()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_invalid_tag_type() { + let schema = Schema::new(vec![ + Field::new("tag", DataType::UInt8, false), + Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), + Field::new("value", ArrowValue::DATA_TYPE, false), + ]); + + let result = create_simple_time_series_table_metadata(schema); + assert!(result.is_err()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_no_fields() { + let schema = Schema::new(vec![ + Field::new("tag", DataType::Utf8, false), + Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), + ]); + + let result = create_simple_time_series_table_metadata(schema); + assert!(result.is_err()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_invalid_field_type() { + let schema = Schema::new(vec![ + Field::new("tag", DataType::Utf8, false), + Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), + Field::new("value", DataType::UInt8, false), + ]); + + let result = create_simple_time_series_table_metadata(schema); + assert!(result.is_err()); + } + + /// Return metadata for a time series table with one tag column and the timestamp column at index 1. 
+ fn create_simple_time_series_table_metadata( + query_schema: Schema, + ) -> Result { + TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + Arc::new(query_schema), + vec![ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap()], + vec![None], + ) + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_missing_or_too_many_error_bounds() { + let (query_schema, _error_bounds, generated_columns) = + time_series_table_schema_error_bounds_and_generated_columns(); + let result = TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + query_schema, + vec![], + generated_columns, + ); + + assert!(result.is_err()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_missing_or_too_many_generated_columns() { + let (query_schema, error_bounds, _generated_columns) = + time_series_table_schema_error_bounds_and_generated_columns(); + let result = TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + query_schema, + error_bounds, + vec![], + ); + + assert!(result.is_err()); + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_generated_columns_using_generated_columns() + { + let (query_schema, error_bounds, mut generated_columns) = + time_series_table_schema_error_bounds_and_generated_columns(); + + generated_columns[5] = Some(GeneratedColumn { + expr: Expr::Column("".into()), + source_columns: vec![], + original_expr: "".to_owned(), + }); + + generated_columns[6] = Some(GeneratedColumn { + expr: Expr::Column("".into()), + source_columns: vec![5], + original_expr: "".to_owned(), + }); + + let result = TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + query_schema, + error_bounds, + generated_columns, + ); + + assert!(result.is_err()); + } + + fn time_series_table_schema_error_bounds_and_generated_columns() + -> (Arc, Vec, Vec>) { + ( + Arc::new(Schema::new(vec![ + Field::new("location", DataType::Utf8, false), + Field::new("install_year", DataType::Utf8, false), + Field::new("model", DataType::Utf8, false), + Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), + Field::new("power_output", ArrowValue::DATA_TYPE, false), + Field::new("wind_speed", ArrowValue::DATA_TYPE, false), + Field::new("temperature", ArrowValue::DATA_TYPE, false), + ])), + vec![ + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), + ], + vec![None, None, None, None, None, None, None], + ) + } + + #[test] + fn test_cannot_create_time_series_table_metadata_with_too_many_fields() { + // Create 1025 fields that can be used to initialize a schema. 
+ let fields = (0..1025) + .map(|i| Field::new(format!("field_{i}").as_str(), DataType::Float32, false)) + .collect::>(); + + let error_bounds = vec![ + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), + ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), + ]; + + let generated_columns = vec![None, None, None]; + + let result = TimeSeriesTableMetadata::try_new( + test::TIME_SERIES_TABLE_NAME.to_owned(), + Arc::new(Schema::new(fields)), + error_bounds, + generated_columns, + ); + + assert!(result.is_err()); + } + + #[test] + fn test_is_timestamp() { + let time_series_table_metadata = test::time_series_table_metadata(); + + assert!(time_series_table_metadata.is_timestamp(0)); + assert!(!time_series_table_metadata.is_timestamp(1)); + assert!(!time_series_table_metadata.is_timestamp(2)); + assert!(!time_series_table_metadata.is_timestamp(3)); + } + + #[test] + fn test_is_tag() { + let time_series_table_metadata = test::time_series_table_metadata(); + + assert!(!time_series_table_metadata.is_tag(0)); + assert!(!time_series_table_metadata.is_tag(1)); + assert!(!time_series_table_metadata.is_tag(2)); + assert!(time_series_table_metadata.is_tag(3)); + } + + #[test] + fn test_column_arrays() { + let time_series_table_metadata = test::time_series_table_metadata(); + let record_batch = test::uncompressed_time_series_table_record_batch(1); + + let (timestamp_column_array, field_column_arrays, tag_column_arrays) = + time_series_table_metadata + .column_arrays(&record_batch) + .unwrap(); + + assert_eq!( + modelardb_types::array!(record_batch, 0, TimestampArray), + timestamp_column_array + ); + assert_eq!( + modelardb_types::array!(record_batch, 1, ValueArray), + field_column_arrays[0] + ); + assert_eq!( + modelardb_types::array!(record_batch, 2, ValueArray), + field_column_arrays[1] + ); + assert_eq!( + modelardb_types::array!(record_batch, 3, StringArray), + tag_column_arrays[0] + ); + } + + #[test] + fn test_column_arrays_with_invalid_schema() { + let time_series_table_metadata = test::time_series_table_metadata(); + let record_batch = test::normal_table_record_batch(); + + let result = time_series_table_metadata.column_arrays(&record_batch); + + assert_eq!( + result.unwrap_err().to_string(), + "Invalid Argument Error: The record batch does not match the schema of the time series table." + ); + } + + // Tests for GeneratedColumn. + #[test] + fn test_can_create_generated_column() { + let schema = Schema::new(vec![ + Field::new("field_1", ArrowValue::DATA_TYPE, false), + Field::new("field_2", ArrowValue::DATA_TYPE, false), + Field::new("generated_column", ArrowValue::DATA_TYPE, false), + ]); + + let sql_expr = "field_1 + field_2"; + let expected_generated_column = GeneratedColumn { + expr: col("field_1") + col("field_2"), + source_columns: vec![0, 1], + original_expr: sql_expr.to_owned(), + }; + + let df_schema = schema.to_dfschema().unwrap(); + let mut result = GeneratedColumn::try_from_sql_expr(sql_expr, &df_schema).unwrap(); + + // Sort the source columns to ensure the order is consistent. 
+ result.source_columns.sort(); + assert_eq!(expected_generated_column, result); + } + + #[test] + fn test_cannot_create_generated_column_with_invalid_sql_expr() { + let schema = Schema::new(vec![ + Field::new("field_1", ArrowValue::DATA_TYPE, false), + Field::new("generated_column", ArrowValue::DATA_TYPE, false), + ]); + + let df_schema = schema.to_dfschema().unwrap(); + let result = GeneratedColumn::try_from_sql_expr("field_1 + field_2", &df_schema); + + assert!(result.is_err()); + } +} From 10b3314b57cb92c9510263b570b8e1c5a8fb58d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 24 Apr 2025 13:50:10 +0000 Subject: [PATCH 03/31] Move connection_info to DeltaLake --- crates/modelardb_manager/src/main.rs | 4 ++++ crates/modelardb_storage/src/delta_lake.rs | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index 669a297e6..bf0fa308d 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -64,6 +64,10 @@ impl RemoteDataFolder { } } + pub fn new(metadata_manager: MetadataManager) -> Self { + Self { metadata_manager } + } + /// Create a [`RemoteDataFolder`] from `remote_data_folder_str`. If `remote_data_folder_str` /// cannot be parsed or a connection to the object store cannot be created, /// [`ModelarDbManagerError`] is returned. diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index f2ac77b29..7d40a4385 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -70,6 +70,9 @@ pub struct DeltaLake { location: String, /// Storage options required to access Delta Lake. storage_options: HashMap, + /// Connection information saved as bytes to make it possible to transfer the information using + /// Apache Arrow Flight. Only set to [`Some`] by [`try_remote_from_connection_info()`]. + maybe_connection_info: Option>, /// [`ObjectStore`] to access the root of the Delta Lake. object_store: Arc, /// Cache of Delta tables to avoid opening the same table multiple times. @@ -100,6 +103,7 @@ impl DeltaLake { Self { location: "memory:///modelardb".to_owned(), storage_options: HashMap::new(), + maybe_connection_info: None, object_store: Arc::new(InMemory::new()), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), @@ -131,6 +135,7 @@ impl DeltaLake { let delta_lake = Self { location, storage_options: HashMap::new(), + maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), @@ -211,6 +216,7 @@ impl DeltaLake { let delta_lake = DeltaLake { location, storage_options, + maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), @@ -244,6 +250,7 @@ impl DeltaLake { let delta_lake = DeltaLake { location, storage_options, + maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), @@ -318,6 +325,13 @@ impl DeltaLake { Ok(()) } + /// Return connection information saved as bytes to make it possible to transfer the information + /// using Apache Arrow Flight. Only returns [`Some`] if [`DeltaLake] was created by + /// [`try_remote_from_connection_info()`]. 
+ pub fn connection_info(&self) -> &Option> { + &self.maybe_connection_info + } + /// Return an [`ObjectStore`] to access the root of the Delta Lake. pub fn object_store(&self) -> Arc { self.object_store.clone() From 34cad6739ede30c85d3498b78da6ace87cbdd710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 24 Apr 2025 15:36:27 +0000 Subject: [PATCH 04/31] Replace MetadataManager with ManagerMetadata trait --- crates/modelardb_manager/src/main.rs | 67 ++------------- crates/modelardb_manager/src/metadata.rs | 95 ++++++++++------------ crates/modelardb_manager/src/remote.rs | 46 +++-------- crates/modelardb_storage/src/delta_lake.rs | 5 ++ 4 files changed, 69 insertions(+), 144 deletions(-) diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index bf0fa308d..9b175db62 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -32,67 +32,17 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use crate::cluster::Cluster; use crate::error::{ModelarDbManagerError, Result}; -use crate::metadata::MetadataManager; +use crate::metadata::ManagerMetadata; use crate::remote::start_apache_arrow_flight_server; /// The port of the Apache Arrow Flight Server. If the environment variable is not set, 9998 is used. pub static PORT: LazyLock = LazyLock::new(|| env::var("MODELARDBM_PORT").map_or(9998, |value| value.parse().unwrap())); -/// Stores the storage configuration with the remote data folder to ensure that the information -/// is consistent with the remote data folder. -pub struct RemoteDataFolder { - /// Storage configuration encoded as a [`StorageConfiguration`](protocol::manager_metadata::StorageConfiguration) - /// protobuf message to make it possible to transfer the configuration using Apache Arrow Flight. - storage_configuration: protocol::manager_metadata::StorageConfiguration, - /// Remote object store for storing data and metadata in Apache Parquet files. - delta_lake: Arc, - /// Manager for the access to the metadata Delta Lake. - pub(crate) metadata_manager: MetadataManager, -} - -impl RemoteDataFolder { - pub fn new( - storage_configuration: protocol::manager_metadata::StorageConfiguration, - delta_lake: Arc, - metadata_manager: Arc, - ) -> Self { - Self { - storage_configuration, - delta_lake, - metadata_manager, - } - } - - pub fn new(metadata_manager: MetadataManager) -> Self { - Self { metadata_manager } - } - - /// Create a [`RemoteDataFolder`] from `remote_data_folder_str`. If `remote_data_folder_str` - /// cannot be parsed or a connection to the object store cannot be created, - /// [`ModelarDbManagerError`] is returned. - async fn try_new(remote_data_folder_str: &str) -> Result { - let storage_configuration = - modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; - - let delta_lake = - DeltaLake::try_remote_from_storage_configuration(storage_configuration.clone())?; - - let metadata_manager = - MetadataManager::try_from_storage_configuration(storage_configuration.clone()).await?; - - Ok(Self::new( - storage_configuration, - Arc::new(delta_lake), - Arc::new(metadata_manager), - )) - } -} - /// Provides access to the managers components. pub struct Context { - /// Folder for storing metadata and data in Apache Parquet files in a remote object store. - pub remote_data_folder: RemoteDataFolder, + /// Delta Lake for storing metadata and data in Apache Parquet files. 
+ pub remote_delta_lake: DeltaLake, /// Cluster of nodes currently controlled by the manager. pub cluster: RwLock, /// Key used to identify requests coming from the manager. @@ -116,9 +66,9 @@ async fn main() -> Result<()> { _ => print_usage_and_exit_with_error("remote_data_folder"), }; - let remote_data_folder = RemoteDataFolder::try_new(remote_data_folder_str).await?; - - let nodes = remote_data_folder.metadata_manager.nodes().await?; + let connection_info = arguments::argument_to_connection_info(remote_delta_lake_str)?; + let remote_delta_lake = DeltaLake::try_remote_from_connection_info(connection_info).await?; + let nodes = remote_delta_lake.nodes().await?; let mut cluster = Cluster::new(); for node in nodes { @@ -126,8 +76,7 @@ async fn main() -> Result<()> { } // Retrieve and parse the key to a tonic metadata value since it is used in tonic requests. - let key = remote_data_folder - .metadata_manager + let key = remote_delta_lake .manager_key() .await? .to_string() @@ -138,7 +87,7 @@ async fn main() -> Result<()> { // Create the Context. let context = Arc::new(Context { - remote_data_folder, + remote_delta_lake, cluster: RwLock::new(cluster), key, }); diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 3488a0fd0..4b03dafbb 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -23,7 +23,6 @@ use arrow::array::{Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use deltalake::DeltaTableError; use deltalake::datafusion::logical_expr::{col, lit}; -use deltalake::datafusion::prelude::SessionContext; use modelardb_storage::delta_lake::DeltaLake; use modelardb_storage::{register_metadata_table, sql_and_concat}; use modelardb_types::flight::protocol; @@ -34,11 +33,12 @@ use crate::error::Result; /// Stores the metadata required for reading from and writing to the normal tables and time series tables /// and persisting edges. The data that needs to be persisted is stored in the metadata Delta Lake. -pub struct MetadataManager { - /// Delta Lake with functionality to read and write to and from the manager's metadata tables. - pub(crate) delta_lake: DeltaLake, - /// Session context used to query the manager metadata Delta Lake tables using Apache DataFusion. - session_context: SessionContext, +pub trait ManagerMetadata { + async fn create_and_register_manager_metadata_delta_lake_tables(&self) -> Result<()>; + async fn manager_key(&self) -> Result; + async fn save_node(&self, node: Node) -> Result<()>; + async fn remove_node(&self, url: &str) -> Result<()>; + async fn nodes(&self) -> Result>; } impl MetadataManager { @@ -79,18 +79,16 @@ impl MetadataManager { async fn create_and_register_manager_metadata_delta_lake_tables(&self) -> Result<()> { // Create and register the manager_metadata table if it does not exist. let delta_table = self - .delta_lake .create_metadata_table( "manager_metadata", &Schema::new(vec![Field::new("key", DataType::Utf8, false)]), ) .await?; - register_metadata_table(&self.session_context, "manager_metadata", delta_table)?; + register_metadata_table(&self.session_context(), "manager_metadata", delta_table)?; // Create and register the nodes table if it does not exist. 
let delta_table = self - .delta_lake .create_metadata_table( "nodes", &Schema::new(vec![ @@ -100,7 +98,7 @@ impl MetadataManager { ) .await?; - register_metadata_table(&self.session_context, "nodes", delta_table)?; + register_metadata_table(&self.session_context(), "nodes", delta_table)?; Ok(()) } @@ -108,21 +106,20 @@ impl MetadataManager { /// Retrieve the key for the manager from the `manager_metadata` table. If a key does not /// already exist, create one and save it to the Delta Lake. If a key could not be retrieved /// or created, return [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). - pub async fn manager_key(&self) -> Result { + async fn manager_key(&self) -> Result { let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(&self.session_context, sql).await?; + let batch = sql_and_concat(&self.session_context(), sql).await?; let keys = modelardb_types::array!(batch, 0, StringArray); if keys.is_empty() { let manager_key = Uuid::new_v4(); // Add a new row to the manager_metadata table to persist the key. - self.delta_lake - .write_columns_to_metadata_table( - "manager_metadata", - vec![Arc::new(StringArray::from(vec![manager_key.to_string()]))], - ) - .await?; + self.write_columns_to_metadata_table( + "manager_metadata", + vec![Arc::new(StringArray::from(vec![manager_key.to_string()]))], + ) + .await?; Ok(manager_key) } else { @@ -136,16 +133,15 @@ impl MetadataManager { /// Save the node to the metadata Delta Lake and return [`Ok`]. If the node could not be saved, /// return [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). - pub async fn save_node(&self, node: Node) -> Result<()> { - self.delta_lake - .write_columns_to_metadata_table( - "nodes", - vec![ - Arc::new(StringArray::from(vec![node.url])), - Arc::new(StringArray::from(vec![node.mode.to_string()])), - ], - ) - .await?; + async fn save_node(&self, node: Node) -> Result<()> { + self.write_columns_to_metadata_table( + "nodes", + vec![ + Arc::new(StringArray::from(vec![node.url])), + Arc::new(StringArray::from(vec![node.mode.to_string()])), + ], + ) + .await?; Ok(()) } @@ -153,8 +149,8 @@ impl MetadataManager { /// Remove the row in the `nodes` table that corresponds to the node with `url` and return /// [`Ok`]. If the row could not be removed, return /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). - pub async fn remove_node(&self, url: &str) -> Result<()> { - let delta_ops = self.delta_lake.metadata_delta_ops("nodes").await?; + async fn remove_node(&self, url: &str) -> Result<()> { + let delta_ops = self.metadata_delta_ops("nodes").await?; delta_ops .delete() @@ -167,11 +163,11 @@ impl MetadataManager { /// Return the nodes currently controlled by the manager that have been persisted to the /// metadata Delta Lake. If the nodes could not be retrieved, /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError) is returned. - pub async fn nodes(&self) -> Result> { + async fn nodes(&self) -> Result> { let mut nodes: Vec = vec![]; let sql = "SELECT url, mode FROM nodes"; - let batch = sql_and_concat(&self.session_context, sql).await?; + let batch = sql_and_concat(self.session_context(), sql).await?; let url_array = modelardb_types::array!(batch, 0, StringArray); let mode_array = modelardb_types::array!(batch, 1, StringArray); @@ -199,12 +195,12 @@ mod tests { // Tests for MetadataManager. 
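+    // A minimal sketch of how the ManagerMetadata methods fit together; the local path and
+    // node URL below are placeholders rather than values used elsewhere in this patch:
+    //
+    // let delta_lake = DeltaLake::try_from_local_path(path).await?;
+    // delta_lake.create_and_register_manager_metadata_delta_lake_tables().await?;
+    // let manager_key = delta_lake.manager_key().await?;
+    // delta_lake.save_node(Node::new("grpc://edge:9999".to_owned(), ServerMode::Edge)).await?;
+    // let nodes = delta_lake.nodes().await?;
+    // delta_lake.remove_node("grpc://edge:9999").await?;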
#[tokio::test] async fn test_create_manager_metadata_delta_lake_tables() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; // Verify that the tables were created, registered, and has the expected columns. assert!( metadata_manager - .session_context + .session_context() .sql("SELECT key FROM manager_metadata") .await .is_ok() @@ -212,7 +208,7 @@ mod tests { assert!( metadata_manager - .session_context + .session_context() .sql("SELECT url, mode FROM nodes") .await .is_ok() @@ -221,13 +217,13 @@ mod tests { #[tokio::test] async fn test_new_manager_key() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; // Verify that the manager key is created and saved correctly. let manager_key = metadata_manager.manager_key().await.unwrap(); let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) + let batch = sql_and_concat(metadata_manager.session_context(), sql) .await .unwrap(); @@ -239,14 +235,14 @@ mod tests { #[tokio::test] async fn test_existing_manager_key() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; // Verify that only a single key is created and saved when retrieving multiple times. let manager_key_1 = metadata_manager.manager_key().await.unwrap(); let manager_key_2 = metadata_manager.manager_key().await.unwrap(); let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) + let batch = sql_and_concat(metadata_manager.session_context(), sql) .await .unwrap(); @@ -256,7 +252,7 @@ mod tests { #[tokio::test] async fn test_save_node() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); metadata_manager.save_node(node_1.clone()).await.unwrap(); @@ -266,7 +262,7 @@ mod tests { // Verify that the nodes are saved correctly. let sql = "SELECT url, mode FROM nodes"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) + let batch = sql_and_concat(metadata_manager.session_context(), sql) .await .unwrap(); @@ -282,7 +278,7 @@ mod tests { #[tokio::test] async fn test_remove_node() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); metadata_manager.save_node(node_1.clone()).await.unwrap(); @@ -294,7 +290,7 @@ mod tests { // Verify that node_1 is removed correctly. 
let sql = "SELECT url, mode FROM nodes"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) + let batch = sql_and_concat(metadata_manager.session_context(), sql) .await .unwrap(); @@ -310,7 +306,7 @@ mod tests { #[tokio::test] async fn test_nodes() { - let (_temp_dir, metadata_manager) = create_metadata_manager().await; + let (_temp_dir, metadata_manager) = create_delta_lake().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); metadata_manager.save_node(node_1.clone()).await.unwrap(); @@ -323,23 +319,18 @@ mod tests { assert_eq!(nodes, vec![node_2, node_1]); } - async fn create_metadata_manager() -> (TempDir, MetadataManager) { + async fn create_delta_lake() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) .await .unwrap(); - let metadata_manager = MetadataManager { - delta_lake, - session_context: SessionContext::new(), - }; - - metadata_manager + delta_lake .create_and_register_manager_metadata_delta_lake_tables() .await .unwrap(); - (temp_dir, metadata_manager) + (temp_dir, delta_lake) } } diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 914f2aa67..5dfe564ee 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -44,6 +44,7 @@ use tracing::info; use crate::Context; use crate::error::{ModelarDbManagerError, Result}; +use crate::metadata::ManagerMetadata; /// Start an Apache Arrow Flight server on 0.0.0.0:`port`. pub async fn start_apache_arrow_flight_server(context: Arc, port: u16) -> Result<()> { @@ -95,7 +96,7 @@ impl FlightServiceHandler { /// Return the schema of the table with the name `table_name`. If the table does not exist or /// the schema cannot be retrieved, return [`Status`]. async fn table_schema(&self, table_name: &str) -> StdResult, Status> { - let delta_lake = &self.context.remote_data_folder.metadata_manager.delta_lake; + let delta_lake = &self.context.remote_delta_lake; if delta_lake .is_normal_table(table_name) @@ -104,9 +105,7 @@ impl FlightServiceHandler { { let delta_table = self .context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .delta_table(table_name) .await .map_err(error_to_status_internal)?; @@ -141,9 +140,7 @@ impl FlightServiceHandler { async fn check_if_table_exists(&self, table_name: &str) -> StdResult<(), Status> { let existing_tables = self .context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .table_names() .await .map_err(error_to_status_internal)?; @@ -169,18 +166,14 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Create an empty Delta Lake table. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .create_normal_table(table_name, schema) .await .map_err(error_to_status_internal)?; // Persist the new normal table to the metadata Delta Lake. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .save_normal_table_metadata(table_name) .await .map_err(error_to_status_internal)?; @@ -217,18 +210,14 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Create an empty Delta Lake table. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .create_time_series_table(&time_series_table_metadata) .await .map_err(error_to_status_internal)?; // Persist the new time series table to the metadata Delta Lake. 
self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .map_err(error_to_status_internal)?; @@ -268,18 +257,14 @@ impl FlightServiceHandler { // Drop the table from the remote data folder metadata Delta Lake. This will return an error // if the table does not exist. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .drop_table_metadata(table_name) .await .map_err(error_to_status_internal)?; // Drop the table from the remote data folder data Delta lake. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .drop_table(table_name) .await .map_err(error_to_status_internal)?; @@ -308,9 +293,7 @@ impl FlightServiceHandler { // Truncate the table in the remote data folder data Delta lake. self.context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .truncate_table(table_name) .await .map_err(error_to_status_internal)?; @@ -396,9 +379,7 @@ impl FlightService for FlightServiceHandler { // Retrieve the table names from the metadata Delta Lake. let table_names = self .context - .remote_data_folder - .metadata_manager - .delta_lake + .remote_delta_lake .table_names() .await .map_err(error_to_status_internal)?; @@ -638,8 +619,7 @@ impl FlightService for FlightServiceHandler { // this fails, the metadata Delta Lake and the cluster will be out of sync until the // manager is restarted. self.context - .remote_data_folder - .metadata_manager + .remote_delta_lake .save_node(node) .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 7d40a4385..0aa9b2d0e 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -332,6 +332,11 @@ impl DeltaLake { &self.maybe_connection_info } + /// Return the session context used to query the tables using Apache DataFusion. + pub fn session_context(&self) -> &SessionContext { + &self.session_context + } + /// Return an [`ObjectStore`] to access the root of the Delta Lake. 
pub fn object_store(&self) -> Arc { self.object_store.clone() From d30a136d8acbee70308395c00b67f31b9e3bd562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 18 Aug 2025 12:59:36 +0000 Subject: [PATCH 05/31] Fix all compile errors from merging main --- .../src/operations/data_folder.rs | 1 - crates/modelardb_manager/src/main.rs | 9 +- crates/modelardb_manager/src/metadata.rs | 29 +- crates/modelardb_manager/src/remote.rs | 3 +- crates/modelardb_server/src/data_folders.rs | 9 +- crates/modelardb_storage/src/delta_lake.rs | 48 +- crates/modelardb_storage/src/lib.rs | 1 - crates/modelardb_storage/src/parser.rs | 1 - .../src/query/time_series_table.rs | 1 - .../src/time_series_table_metadata.rs | 559 ------------------ 10 files changed, 40 insertions(+), 621 deletions(-) delete mode 100644 crates/modelardb_storage/src/time_series_table_metadata.rs diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index d6c43b45c..b30e6f8a1 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -39,7 +39,6 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; use datafusion::prelude::SessionContext; use futures::TryStreamExt; use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; -use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_types::types::{TimeSeriesTableMetadata, TimestampArray}; use crate::error::{ModelarDbEmbeddedError, Result}; diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index 9b175db62..d9ce1fefa 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -43,6 +43,9 @@ pub static PORT: LazyLock = pub struct Context { /// Delta Lake for storing metadata and data in Apache Parquet files. pub remote_delta_lake: DeltaLake, + /// Storage configuration encoded as a [`StorageConfiguration`](protocol::manager_metadata::StorageConfiguration) + /// protobuf message to make it possible to transfer the configuration using Apache Arrow Flight. + pub remote_storage_configuration: protocol::manager_metadata::StorageConfiguration, /// Cluster of nodes currently controlled by the manager. pub cluster: RwLock, /// Key used to identify requests coming from the manager. 
@@ -66,9 +69,9 @@ async fn main() -> Result<()> { _ => print_usage_and_exit_with_error("remote_data_folder"), }; - let connection_info = arguments::argument_to_connection_info(remote_delta_lake_str)?; - let remote_delta_lake = DeltaLake::try_remote_from_connection_info(connection_info).await?; - let nodes = remote_delta_lake.nodes().await?; + let connection_info = arguments::argument_to_connection_info(remote_delta_lake_str)?; + let remote_delta_lake = DeltaLake::try_remote_from_connection_info(connection_info).await?; + let nodes = remote_delta_lake.nodes().await?; let mut cluster = Cluster::new(); for node in nodes { diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 4b03dafbb..2e793aca5 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -25,7 +25,6 @@ use deltalake::DeltaTableError; use deltalake::datafusion::logical_expr::{col, lit}; use modelardb_storage::delta_lake::DeltaLake; use modelardb_storage::{register_metadata_table, sql_and_concat}; -use modelardb_types::flight::protocol; use modelardb_types::types::{Node, ServerMode}; use uuid::Uuid; @@ -41,33 +40,7 @@ pub trait ManagerMetadata { async fn nodes(&self) -> Result>; } -impl MetadataManager { - /// Create a new [`MetadataManager`] that saves the metadata to a remote object store given by - /// `storage_configuration` and initialize the metadata tables. If a connection could not be - /// made or the metadata tables could not be created, return - /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). - pub async fn try_from_storage_configuration( - storage_configuration: protocol::manager_metadata::StorageConfiguration, - ) -> Result { - let metadata_manager = Self { - delta_lake: DeltaLake::try_remote_from_storage_configuration( - storage_configuration.clone(), - )?, - table_metadata_manager: TableMetadataManager::try_from_storage_configuration( - storage_configuration, - ) - .await?, - session_context: Arc::new(SessionContext::new()), - }; - - // Create the necessary tables in the metadata Delta Lake. - metadata_manager - .create_and_register_manager_metadata_delta_lake_tables() - .await?; - - Ok(metadata_manager) - } - +impl ManagerMetadata for DeltaLake { /// If they do not already exist, create the tables that are specific to the manager metadata /// Delta Lake and register them with the Apache DataFusion session context. /// * The `manager_metadata` table contains metadata for the manager itself. It is assumed that diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 5dfe564ee..ea280c971 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -653,8 +653,7 @@ impl FlightService for FlightServiceHandler { // Remove the node with the given url from the metadata Delta Lake. 
self.context - .remote_data_folder - .metadata_manager + .remote_delta_lake .remove_node(&node_metadata.url) .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index fa1a6bd64..20dd330fb 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use modelardb_storage::delta_lake::DeltaLake; -use modelardb_storage::metadata::table_metadata_manager::TableMetadataManager; use modelardb_types::flight::protocol; use modelardb_types::types::ServerMode; use tracing::warn; @@ -59,11 +58,9 @@ impl DataFolder { pub async fn try_from_storage_configuration( storage_configuration: protocol::manager_metadata::StorageConfiguration, ) -> Result { - let remote_delta_lake = - DeltaLake::try_remote_from_storage_configuration(storage_configuration.clone())?; - - let remote_table_metadata_manager = - TableMetadataManager::try_from_storage_configuration(storage_configuration).await?; + let delta_lake = Arc::new( + DeltaLake::try_remote_from_storage_configuration(storage_configuration.clone()).await?, + ); Ok(Self { delta_lake }) } diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 0aa9b2d0e..8ccd63aa0 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -21,17 +21,19 @@ use std::path::Path as StdPath; use std::sync::Arc; use arrow::array::{ - ArrayRef, BinaryArray, BooleanArray, Float32Array, Int16Array, RecordBatch, StringArray, + ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, Float32Array, Int16Array, RecordBatch, + StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; use chrono::TimeDelta; use dashmap::DashMap; use datafusion::catalog::TableProvider; use datafusion::common::{DFSchema, ToDFSchema}; -use datafusion::logical_expr::lit; +use datafusion::logical_expr::{Expr, lit}; use datafusion::parquet::file::properties::WriterProperties; use datafusion::parquet::format::SortingColumn; use datafusion::prelude::{SessionContext, col}; +use datafusion_proto::bytes::Serializeable; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::kernel::transaction::{CommitBuilder, CommitProperties}; use deltalake::kernel::{Action, Add, StructField}; @@ -41,8 +43,9 @@ use deltalake::protocol::{DeltaOperation, SaveMode}; use deltalake::{DeltaOps, DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_types::flight::protocol; +use modelardb_types::functions::try_convert_bytes_to_schema; use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; -use modelardb_types::types::{MAX_RETENTION_PERIOD_IN_SECONDS, TimeSeriesTableMetadata}; +use modelardb_types::types::{ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, MAX_RETENTION_PERIOD_IN_SECONDS}; use object_store::ObjectStore; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; @@ -52,12 +55,14 @@ use url::Url; use uuid::Uuid; use crate::error::{ModelarDbStorageError, Result}; -use crate::time_series_table_metadata::{GeneratedColumn, TimeSeriesTableMetadata}; use crate::{ METADATA_FOLDER, TABLE_FOLDER, apache_parquet_writer_properties, register_metadata_table, - sql_and_concat, try_convert_bytes_to_schema, try_convert_schema_to_bytes, + sql_and_concat, }; +/// Named error bound with the value 0.0 to make lossless compression more clear. 
+const ERROR_BOUND_ZERO: f32 = 0.0; + /// Types of tables supported by ModelarDB. enum TableType { NormalTable, @@ -99,8 +104,8 @@ impl DeltaLake { } /// Create a new [`DeltaLake`] that manages the Delta tables in memory. - pub fn new_in_memory() -> Self { - Self { + pub async fn new_in_memory() -> Result { + let delta_lake = Self { location: "memory:///modelardb".to_owned(), storage_options: HashMap::new(), maybe_connection_info: None, @@ -149,7 +154,7 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages Delta tables in the remote object store given by /// `storage_configuration`. Returns [`ModelarDbStorageError`] if a connection to the specified /// object store could not be created. - pub fn try_remote_from_storage_configuration( + pub async fn try_remote_from_storage_configuration( storage_configuration: protocol::manager_metadata::StorageConfiguration, ) -> Result { match storage_configuration { @@ -165,14 +170,18 @@ impl DeltaLake { s3_configuration.access_key_id, s3_configuration.secret_access_key, ) + .await } protocol::manager_metadata::StorageConfiguration::AzureConfiguration( azure_configuration, - ) => Self::try_from_azure_configuration( - azure_configuration.account_name, - azure_configuration.access_key, - azure_configuration.container_name, - ), + ) => { + Self::try_from_azure_configuration( + azure_configuration.account_name, + azure_configuration.access_key, + azure_configuration.container_name, + ) + .await + } } } @@ -878,6 +887,7 @@ impl DeltaLake { error_bounds, generated_columns, ) + .map_err(|error| error.into()) } /// Return the error bounds for the columns in the time series table with `table_name`. If a @@ -937,16 +947,16 @@ impl DeltaLake { let mut generated_columns = vec![None; df_schema.fields().len()]; let column_index_array = modelardb_types::array!(batch, 0, Int16Array); - let generated_column_expr_array = modelardb_types::array!(batch, 1, StringArray); + let generated_column_expr_array = modelardb_types::array!(batch, 1, BinaryArray); for row_index in 0..batch.num_rows() { let generated_column_index = column_index_array.value(row_index); - let generated_column_expr = generated_column_expr_array.value(row_index); + let expr_bytes = generated_column_expr_array.value(row_index); - // If generated_column_expr is null, it is saved as an empty string in the column values. - if !generated_column_expr.is_empty() { - let generated_column = - GeneratedColumn::try_from_sql_expr(generated_column_expr, df_schema)?; + // If generated_column_expr is null, it is saved as empty bytes in the column values. 
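+            // The non-empty bytes are assumed to be the serialized form of the DataFusion Expr
+            // that was produced with Expr::to_bytes() when the time series table was saved, so
+            // Expr::from_bytes() below simply reverses that serialization.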
+ if !expr_bytes.is_empty() { + let expr = Expr::from_bytes(expr_bytes)?; + let generated_column = GeneratedColumn::try_from_expr(expr, df_schema)?; generated_columns[generated_column_index as usize] = Some(generated_column); } diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 5380a41d0..493a3e85b 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -21,7 +21,6 @@ pub mod error; mod optimizer; pub mod parser; mod query; -pub mod time_series_table_metadata; use std::result::Result as StdResult; use std::sync::Arc; diff --git a/crates/modelardb_storage/src/parser.rs b/crates/modelardb_storage/src/parser.rs index 1ae48d4ba..8e2644231 100644 --- a/crates/modelardb_storage/src/parser.rs +++ b/crates/modelardb_storage/src/parser.rs @@ -50,7 +50,6 @@ use sqlparser::parser::{Parser, ParserError}; use sqlparser::tokenizer::{Span, Token}; use crate::error::{ModelarDbStorageError, Result}; -use crate::time_series_table_metadata::{GeneratedColumn, TimeSeriesTableMetadata}; /// A top-level statement (CREATE, INSERT, SELECT, TRUNCATE, DROP, VACUUM etc.) that has been /// tokenized, parsed, and for which semantic checks have verified that it is compatible with diff --git a/crates/modelardb_storage/src/query/time_series_table.rs b/crates/modelardb_storage/src/query/time_series_table.rs index cb0eacf63..afa742479 100644 --- a/crates/modelardb_storage/src/query/time_series_table.rs +++ b/crates/modelardb_storage/src/query/time_series_table.rs @@ -53,7 +53,6 @@ use modelardb_types::types::{ArrowTimestamp, ArrowValue, TimeSeriesTableMetadata use crate::query::generated_as_exec::{ColumnToGenerate, GeneratedAsExec}; use crate::query::grid_exec::GridExec; use crate::query::sorted_join_exec::{SortedJoinColumnType, SortedJoinExec}; -use crate::time_series_table_metadata::TimeSeriesTableMetadata; /// A queryable representation of a time series table which stores multivariate time series as segments /// containing metadata and models. [`TimeSeriesTable`] implements [`TableProvider`] so it can be diff --git a/crates/modelardb_storage/src/time_series_table_metadata.rs b/crates/modelardb_storage/src/time_series_table_metadata.rs deleted file mode 100644 index f43fe4f73..000000000 --- a/crates/modelardb_storage/src/time_series_table_metadata.rs +++ /dev/null @@ -1,559 +0,0 @@ -/* Copyright 2022 The ModelarDB Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//! Implementation of the type containing the metadata required to read from and -//! write to a time series table. 
- -use std::result::Result as StdResult; -use std::sync::Arc; - -use arrow::array::StringArray; -use arrow::record_batch::RecordBatch; -use datafusion::arrow::datatypes::{ArrowPrimitiveType, DataType, Schema}; -use datafusion::common::DFSchema; -use datafusion::error::DataFusionError; -use datafusion::logical_expr::expr::Expr; -use modelardb_types::schemas::COMPRESSED_SCHEMA; -use modelardb_types::types::{ArrowTimestamp, ArrowValue, ErrorBound, TimestampArray, ValueArray}; - -use crate::error::{ModelarDbStorageError, Result}; -use crate::parser::tokenize_and_parse_sql_expression; - -/// Metadata required to ingest data into a time series table and query a time series table. -#[derive(Debug, Clone)] -pub struct TimeSeriesTableMetadata { - /// Name of the time series table. - pub name: String, - /// Index of the timestamp column in `schema`. - pub timestamp_column_index: usize, - /// Indices of the field columns in `schema`. - pub field_column_indices: Vec, - /// Indices of the tag columns in `schema`. - pub tag_column_indices: Vec, - /// Error bounds of the columns in `schema`. It can only be non-zero for field columns. - pub error_bounds: Vec, - /// Expressions to create generated columns in the `query_schema`. Only field columns can be - /// generated by [`Expr`], so [`None`] is stored for timestamp, tag, and stored field columns. - pub generated_columns: Vec>, - /// Schema of the data that can be written to the time series table. - pub schema: Arc, - /// Schema of the data that can be read from the time series table. - pub query_schema: Arc, - /// Projection that changes `query_schema` to `schema`. - pub query_schema_to_schema: Vec, - /// Schema of the compressed segments that are stored in the time series table. - pub compressed_schema: Arc, -} - -impl TimeSeriesTableMetadata { - /// Create a new time series table with the given metadata. If any of the following conditions - /// are true, [`ModelarDbStorageError`] is returned: - /// * The number of error bounds does not match the number of columns. - /// * The number of potentially generated columns does not match the number of columns. - /// * A generated column includes another generated column in its expression. - /// * There are more than 32767 columns. - /// * The `query_schema` does not include a single timestamp column. - /// * The `query_schema` does not include at least one stored field column. - pub fn try_new( - name: String, - query_schema: Arc, - error_bounds: Vec, - generated_columns: Vec>, - ) -> Result { - // If an error bound is not defined for each column, return an error. - if query_schema.fields().len() != error_bounds.len() { - return Err(ModelarDbStorageError::InvalidArgument( - "An error bound must be defined for each column.".to_owned(), - )); - } - - // If a generated column or None is not defined for each column, return an error. - if query_schema.fields().len() != generated_columns.len() { - return Err(ModelarDbStorageError::InvalidArgument( - "A generated column or None must be defined for each column.".to_owned(), - )); - } - - // If a generated field column depends on other generated field columns, return an error. - for generated_column in generated_columns.iter().flatten() { - for source_column in &generated_column.source_columns { - if generated_columns[*source_column].is_some() { - return Err(ModelarDbStorageError::InvalidArgument( - "A generated field column cannot depend on generated field columns." - .to_owned(), - )); - } - } - } - - // If there are more than 32767 columns, return an error. 
This limitation is necessary since - // 16 bits are used for the field column index in the compressed segments. - if query_schema.fields.len() > 32767 { - return Err(ModelarDbStorageError::InvalidArgument( - "There cannot be more than 32767 columns in the time series table.".to_owned(), - )); - } - - // Remove the generated field columns from the query schema and the error bounds as these - // columns should never be provided when inserting data points into the time series table. - let mut fields_without_generated = Vec::with_capacity(query_schema.fields().len()); - let mut field_indices_without_generated = Vec::with_capacity(query_schema.fields().len()); - let mut error_bounds_without_generated = Vec::with_capacity(error_bounds.len()); - for (index, generated_column) in generated_columns.iter().enumerate() { - if generated_column.is_none() { - fields_without_generated.push(query_schema.fields[index].clone()); - field_indices_without_generated.push(index); - error_bounds_without_generated.push(error_bounds[index]); - } - } - - let schema_without_generated = - if query_schema.fields.len() != fields_without_generated.len() { - Arc::new(Schema::new(fields_without_generated)) - } else { - query_schema.clone() - }; - - // A time series table must only contain one stored timestamp column, one or more stored - // field columns, zero or more generated field columns, and zero or more stored tag columns. - let timestamp_column_indices = compute_indices_of_columns_with_data_type( - &schema_without_generated, - ArrowTimestamp::DATA_TYPE, - ); - - if timestamp_column_indices.len() != 1 { - return Err(ModelarDbStorageError::InvalidArgument( - "There needs to be exactly one timestamp column.".to_owned(), - )); - } - - let field_column_indices = compute_indices_of_columns_with_data_type( - &schema_without_generated, - ArrowValue::DATA_TYPE, - ); - - if field_column_indices.is_empty() { - return Err(ModelarDbStorageError::InvalidArgument( - "There needs to be at least one field column.".to_owned(), - )); - } - - let tag_column_indices = - compute_indices_of_columns_with_data_type(&schema_without_generated, DataType::Utf8); - - // Add the tag columns to the base schema for compressed segments. - let mut compressed_schema_fields = - Vec::with_capacity(COMPRESSED_SCHEMA.0.fields().len() + tag_column_indices.len()); - compressed_schema_fields.extend(COMPRESSED_SCHEMA.0.fields.clone().to_vec()); - - for index in &tag_column_indices { - compressed_schema_fields.push(Arc::new(schema_without_generated.field(*index).clone())); - } - - let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); - - Ok(Self { - name, - timestamp_column_index: timestamp_column_indices[0], - field_column_indices, - tag_column_indices, - error_bounds: error_bounds_without_generated, - generated_columns, - schema: schema_without_generated, - query_schema, - query_schema_to_schema: field_indices_without_generated, - compressed_schema, - }) - } - - /// Return `true` if the column at `index` is the timestamp column. - pub fn is_timestamp(&self, index: usize) -> bool { - index == self.timestamp_column_index - } - - /// Return `true` if the column at `index` is a tag column. - pub fn is_tag(&self, index: usize) -> bool { - self.tag_column_indices.contains(&index) - } - - /// Return the column arrays for the timestamp, field, and tag columns in `record_batch`. If - /// `record_batch` does not contain the required columns, return [`ModelarDbStorageError`]. 
- pub fn column_arrays<'a>( - &self, - record_batch: &'a RecordBatch, - ) -> Result<( - &'a TimestampArray, - Vec<&'a ValueArray>, - Vec<&'a StringArray>, - )> { - if record_batch.schema() != self.schema { - return Err(ModelarDbStorageError::InvalidArgument( - "The record batch does not match the schema of the time series table.".to_owned(), - )); - } - - let timestamp_column_array = - modelardb_types::array!(record_batch, self.timestamp_column_index, TimestampArray); - - let field_column_arrays: Vec<_> = self - .field_column_indices - .iter() - .map(|index| modelardb_types::array!(record_batch, *index, ValueArray)) - .collect(); - - let tag_column_arrays: Vec<_> = self - .tag_column_indices - .iter() - .map(|index| modelardb_types::array!(record_batch, *index, StringArray)) - .collect(); - - Ok(( - timestamp_column_array, - field_column_arrays, - tag_column_arrays, - )) - } -} - -/// Compute the indices of all columns in `schema` with `data_type`. -fn compute_indices_of_columns_with_data_type(schema: &Schema, data_type: DataType) -> Vec { - let fields = schema.fields(); - (0..fields.len()) - .filter(|index| *fields[*index].data_type() == data_type) - .collect() -} - -/// Column that is generated by a [`Expr`] using zero or more stored columns as input. -#[derive(Clone, Debug, PartialEq)] -pub struct GeneratedColumn { - /// Logical expression that computes the values of the column. - pub expr: Expr, - /// Indices of the stored columns used by `expr` to compute the column's values. - pub source_columns: Vec, - /// Original representation of `expr`. It is copied from the SQL statement, so it can be stored - /// in the metadata Delta Lake as `expr` does not implement serialization and deserialization. - pub original_expr: String, -} - -impl GeneratedColumn { - /// Create a [`GeneratedColumn`] from a SQL expression and a [`DFSchema`]. If the SQL expression - /// is not valid or refers to columns that are not in the [`DFSchema`], - /// a [`ModelarDbStorageError`] is returned. - pub fn try_from_sql_expr(sql_expr: &str, df_schema: &DFSchema) -> Result { - let expr = tokenize_and_parse_sql_expression(sql_expr, df_schema)?; - - let source_columns: StdResult, DataFusionError> = expr - .column_refs() - .iter() - .map(|column| df_schema.index_of_column(column)) - .collect(); - - Ok(Self { - expr, - source_columns: source_columns?, - original_expr: sql_expr.to_owned(), - }) - } -} - -#[cfg(test)] -mod test { - use super::*; - - use datafusion::arrow::datatypes::{DataType, Field, Schema}; - use datafusion::common::ToDFSchema; - use datafusion::logical_expr::col; - use modelardb_common::test::ERROR_BOUND_ZERO; - - use crate::test; - - // Tests for TimeSeriesTableMetadata. 
- #[test] - fn test_can_create_time_series_table_metadata() { - let (query_schema, error_bounds, generated_columns) = - time_series_table_schema_error_bounds_and_generated_columns(); - let result = TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - query_schema, - error_bounds, - generated_columns, - ); - - assert!(result.is_ok()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_invalid_timestamp_type() { - let schema = Schema::new(vec![ - Field::new("tag", DataType::Utf8, false), - Field::new("timestamp", DataType::UInt8, false), - Field::new("value", ArrowValue::DATA_TYPE, false), - ]); - - let result = create_simple_time_series_table_metadata(schema); - assert!(result.is_err()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_invalid_tag_type() { - let schema = Schema::new(vec![ - Field::new("tag", DataType::UInt8, false), - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - Field::new("value", ArrowValue::DATA_TYPE, false), - ]); - - let result = create_simple_time_series_table_metadata(schema); - assert!(result.is_err()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_no_fields() { - let schema = Schema::new(vec![ - Field::new("tag", DataType::Utf8, false), - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - ]); - - let result = create_simple_time_series_table_metadata(schema); - assert!(result.is_err()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_invalid_field_type() { - let schema = Schema::new(vec![ - Field::new("tag", DataType::Utf8, false), - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - Field::new("value", DataType::UInt8, false), - ]); - - let result = create_simple_time_series_table_metadata(schema); - assert!(result.is_err()); - } - - /// Return metadata for a time series table with one tag column and the timestamp column at index 1. 
- fn create_simple_time_series_table_metadata( - query_schema: Schema, - ) -> Result { - TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - Arc::new(query_schema), - vec![ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap()], - vec![None], - ) - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_missing_or_too_many_error_bounds() { - let (query_schema, _error_bounds, generated_columns) = - time_series_table_schema_error_bounds_and_generated_columns(); - let result = TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - query_schema, - vec![], - generated_columns, - ); - - assert!(result.is_err()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_missing_or_too_many_generated_columns() { - let (query_schema, error_bounds, _generated_columns) = - time_series_table_schema_error_bounds_and_generated_columns(); - let result = TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - query_schema, - error_bounds, - vec![], - ); - - assert!(result.is_err()); - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_generated_columns_using_generated_columns() - { - let (query_schema, error_bounds, mut generated_columns) = - time_series_table_schema_error_bounds_and_generated_columns(); - - generated_columns[5] = Some(GeneratedColumn { - expr: Expr::Column("".into()), - source_columns: vec![], - original_expr: "".to_owned(), - }); - - generated_columns[6] = Some(GeneratedColumn { - expr: Expr::Column("".into()), - source_columns: vec![5], - original_expr: "".to_owned(), - }); - - let result = TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - query_schema, - error_bounds, - generated_columns, - ); - - assert!(result.is_err()); - } - - fn time_series_table_schema_error_bounds_and_generated_columns() - -> (Arc, Vec, Vec>) { - ( - Arc::new(Schema::new(vec![ - Field::new("location", DataType::Utf8, false), - Field::new("install_year", DataType::Utf8, false), - Field::new("model", DataType::Utf8, false), - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - Field::new("power_output", ArrowValue::DATA_TYPE, false), - Field::new("wind_speed", ArrowValue::DATA_TYPE, false), - Field::new("temperature", ArrowValue::DATA_TYPE, false), - ])), - vec![ - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ], - vec![None, None, None, None, None, None, None], - ) - } - - #[test] - fn test_cannot_create_time_series_table_metadata_with_too_many_fields() { - // Create 1025 fields that can be used to initialize a schema. 
- let fields = (0..1025) - .map(|i| Field::new(format!("field_{i}").as_str(), DataType::Float32, false)) - .collect::>(); - - let error_bounds = vec![ - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(), - ErrorBound::try_new_relative(ERROR_BOUND_ZERO).unwrap(), - ]; - - let generated_columns = vec![None, None, None]; - - let result = TimeSeriesTableMetadata::try_new( - test::TIME_SERIES_TABLE_NAME.to_owned(), - Arc::new(Schema::new(fields)), - error_bounds, - generated_columns, - ); - - assert!(result.is_err()); - } - - #[test] - fn test_is_timestamp() { - let time_series_table_metadata = test::time_series_table_metadata(); - - assert!(time_series_table_metadata.is_timestamp(0)); - assert!(!time_series_table_metadata.is_timestamp(1)); - assert!(!time_series_table_metadata.is_timestamp(2)); - assert!(!time_series_table_metadata.is_timestamp(3)); - } - - #[test] - fn test_is_tag() { - let time_series_table_metadata = test::time_series_table_metadata(); - - assert!(!time_series_table_metadata.is_tag(0)); - assert!(!time_series_table_metadata.is_tag(1)); - assert!(!time_series_table_metadata.is_tag(2)); - assert!(time_series_table_metadata.is_tag(3)); - } - - #[test] - fn test_column_arrays() { - let time_series_table_metadata = test::time_series_table_metadata(); - let record_batch = test::uncompressed_time_series_table_record_batch(1); - - let (timestamp_column_array, field_column_arrays, tag_column_arrays) = - time_series_table_metadata - .column_arrays(&record_batch) - .unwrap(); - - assert_eq!( - modelardb_types::array!(record_batch, 0, TimestampArray), - timestamp_column_array - ); - assert_eq!( - modelardb_types::array!(record_batch, 1, ValueArray), - field_column_arrays[0] - ); - assert_eq!( - modelardb_types::array!(record_batch, 2, ValueArray), - field_column_arrays[1] - ); - assert_eq!( - modelardb_types::array!(record_batch, 3, StringArray), - tag_column_arrays[0] - ); - } - - #[test] - fn test_column_arrays_with_invalid_schema() { - let time_series_table_metadata = test::time_series_table_metadata(); - let record_batch = test::normal_table_record_batch(); - - let result = time_series_table_metadata.column_arrays(&record_batch); - - assert_eq!( - result.unwrap_err().to_string(), - "Invalid Argument Error: The record batch does not match the schema of the time series table." - ); - } - - // Tests for GeneratedColumn. - #[test] - fn test_can_create_generated_column() { - let schema = Schema::new(vec![ - Field::new("field_1", ArrowValue::DATA_TYPE, false), - Field::new("field_2", ArrowValue::DATA_TYPE, false), - Field::new("generated_column", ArrowValue::DATA_TYPE, false), - ]); - - let sql_expr = "field_1 + field_2"; - let expected_generated_column = GeneratedColumn { - expr: col("field_1") + col("field_2"), - source_columns: vec![0, 1], - original_expr: sql_expr.to_owned(), - }; - - let df_schema = schema.to_dfschema().unwrap(); - let mut result = GeneratedColumn::try_from_sql_expr(sql_expr, &df_schema).unwrap(); - - // Sort the source columns to ensure the order is consistent. 
- result.source_columns.sort(); - assert_eq!(expected_generated_column, result); - } - - #[test] - fn test_cannot_create_generated_column_with_invalid_sql_expr() { - let schema = Schema::new(vec![ - Field::new("field_1", ArrowValue::DATA_TYPE, false), - Field::new("generated_column", ArrowValue::DATA_TYPE, false), - ]); - - let df_schema = schema.to_dfschema().unwrap(); - let result = GeneratedColumn::try_from_sql_expr("field_1 + field_2", &df_schema); - - assert!(result.is_err()); - } -} From e12ca03d4fc018ac27f37cb65089dc155ba71c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 18 Aug 2025 13:28:07 +0000 Subject: [PATCH 06/31] Fix compile errors for all tests --- crates/modelardb_server/src/context.rs | 78 +++++++------------ .../src/storage/data_transfer.rs | 2 +- crates/modelardb_storage/src/delta_lake.rs | 5 +- 3 files changed, 31 insertions(+), 54 deletions(-) diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index df4de9d79..d164b2bb0 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -455,7 +455,7 @@ mod tests { context .data_folders .local_data_folder - .table_metadata_manager + .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -605,19 +605,14 @@ mod tests { // The normal table should be deleted from the metadata Delta Lake. assert!( - !context - .data_folders - .local_data_folder - <<<<<<< HEAD - .table_metadata_manager - .is_normal_table(NORMAL_TABLE_NAME) - ======= - .delta_lake - .is_normal_table(test::NORMAL_TABLE_NAME) - >>>>>>> 7e75112 (Remove TableMetadataManager) - .await - .unwrap() - ); + !context + .data_folders + .local_data_folder + .delta_lake + .is_normal_table(NORMAL_TABLE_NAME) + .await + .unwrap() + ); // The normal table should be deleted from the Delta Lake. assert!(!temp_dir.path().join("tables").exists()); @@ -647,19 +642,14 @@ mod tests { // The time series table should be deleted from the metadata Delta Lake. assert!( - !context - .data_folders - .local_data_folder - <<<<<<< HEAD - .table_metadata_manager - .is_time_series_table(TIME_SERIES_TABLE_NAME) - ======= - .delta_lake - .is_time_series_table(test::TIME_SERIES_TABLE_NAME) - >>>>>>> 7e75112 (Remove TableMetadataManager) - .await - .unwrap() - ); + !context + .data_folders + .local_data_folder + .delta_lake + .is_time_series_table(TIME_SERIES_TABLE_NAME) + .await + .unwrap() + ); // The time series table should be deleted from the Delta Lake. assert!(!temp_dir.path().join("tables").exists()); @@ -696,17 +686,12 @@ mod tests { // The normal table should not be deleted from the metadata Delta Lake. assert!( - local_data_folder - <<<<<<< HEAD - .table_metadata_manager - .is_normal_table(NORMAL_TABLE_NAME) - ======= - .delta_lake - .is_normal_table(test::NORMAL_TABLE_NAME) - >>>>>>> 7e75112 (Remove TableMetadataManager) - .await - .unwrap() - ); + local_data_folder + .delta_lake + .is_normal_table(NORMAL_TABLE_NAME) + .await + .unwrap() + ); // The normal table data should be deleted from the Delta Lake. delta_table.load().await.unwrap(); @@ -734,17 +719,12 @@ mod tests { // The time series table should not be deleted from the metadata Delta Lake. 
assert!( - local_data_folder - <<<<<<< HEAD - .table_metadata_manager - .is_time_series_table(TIME_SERIES_TABLE_NAME) - ======= - .delta_lake - .is_time_series_table(test::TIME_SERIES_TABLE_NAME) - >>>>>>> 7e75112 (Remove TableMetadataManager) - .await - .unwrap() - ); + local_data_folder + .delta_lake + .is_time_series_table(TIME_SERIES_TABLE_NAME) + .await + .unwrap() + ); // The time series table data should be deleted from the Delta Lake. delta_table.load().await.unwrap(); diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 66241993d..6b6fa6684 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -488,7 +488,7 @@ mod tests { .unwrap(); local_data_folder - .table_metadata_manager + .delta_lake .save_normal_table_metadata(NORMAL_TABLE_NAME) .await .unwrap(); diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 8ccd63aa0..df6b1dea5 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -1130,11 +1130,10 @@ mod tests { use datafusion::arrow::datatypes::DataType; use datafusion::common::ScalarValue::Int64; use datafusion::logical_expr::Expr::Literal; + use modelardb_test::table as test; use modelardb_types::types::{ArrowTimestamp, ArrowValue}; use tempfile::TempDir; - use crate::test; - // Tests for DeltaLake. #[tokio::test] async fn test_create_metadata_delta_lake_tables() { @@ -1465,13 +1464,11 @@ mod tests { let plus_one_column = Some(GeneratedColumn { expr: col("field_1") + Literal(Int64(Some(1))), source_columns: vec![1], - original_expr: "field_1 + 1".to_owned(), }); let addition_column = Some(GeneratedColumn { expr: col("field_1") + col("field_2"), source_columns: vec![1, 2], - original_expr: "field_1 + field_2".to_owned(), }); let expected_generated_columns = From 117f3f408ccccd5d2a78fd1f99fd9e73663f3678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 18 Aug 2025 14:51:42 +0000 Subject: [PATCH 07/31] Fix failing tests after rebasing on main --- crates/modelardb_storage/src/delta_lake.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index df6b1dea5..44088f8e2 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -320,7 +320,7 @@ impl DeltaLake { Field::new("column_index", DataType::Int16, false), Field::new("error_bound_value", DataType::Float32, false), Field::new("error_bound_is_relative", DataType::Boolean, false), - Field::new("generated_column_expr", DataType::Utf8, true), + Field::new("generated_column_expr", DataType::Binary, true), ]), ) .await?; @@ -1303,7 +1303,7 @@ mod tests { assert_eq!(**batch.column(4), BooleanArray::from(vec![false, true])); assert_eq!( **batch.column(5), - StringArray::from(vec![None, None] as Vec>) + BinaryArray::from_opt_vec(vec![None, None]) ); } From 9280400fe0aaa0c4f3335d8b6edfab6b6af847c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 3 Sep 2025 09:01:34 +0000 Subject: [PATCH 08/31] Fix compile errors, warnings, tests after rebasing --- crates/modelardb_manager/src/remote.rs | 7 +- crates/modelardb_storage/src/delta_lake.rs | 91 ++++++++++++++++++++++ 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_manager/src/remote.rs 
b/crates/modelardb_manager/src/remote.rs index ea280c971..5bdb18534 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -320,8 +320,7 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Vacuum the table in the remote data folder Delta lake. self.context - .remote_data_folder - .delta_lake + .remote_delta_lake .vacuum_table(table_name, maybe_retention_period_in_seconds) .await .map_err(error_to_status_internal)?; @@ -510,9 +509,7 @@ impl FlightService for FlightServiceHandler { if table_names.is_empty() { table_names = self .context - .remote_data_folder - .metadata_manager - .table_metadata_manager + .remote_delta_lake .table_names() .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 44088f8e2..53541731b 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -694,6 +694,97 @@ impl DeltaLake { Ok(()) } + /// Save the created normal table to the metadata Delta Lake. This consists of adding a row to + /// the `normal_table_metadata` table with the `name` of the table. If the normal table metadata + /// was saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. + pub async fn save_normal_table_metadata(&self, name: &str) -> Result<()> { + self.write_columns_to_metadata_table( + "normal_table_metadata", + vec![Arc::new(StringArray::from(vec![name]))], + ) + .await?; + + Ok(()) + } + + /// Save the created time series table to the metadata Delta Lake. This includes adding a row to + /// the `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` + /// table for each field column. + pub async fn save_time_series_table_metadata( + &self, + time_series_table_metadata: &TimeSeriesTableMetadata, + ) -> Result<()> { + // Convert the query schema to bytes, so it can be saved in the metadata Delta Lake. + let query_schema_bytes = + try_convert_schema_to_bytes(&time_series_table_metadata.query_schema)?; + + // Add a new row in the time_series_table_metadata table to persist the time series table. + self.write_columns_to_metadata_table( + "time_series_table_metadata", + vec![ + Arc::new(StringArray::from(vec![ + time_series_table_metadata.name.clone(), + ])), + Arc::new(BinaryArray::from_vec(vec![&query_schema_bytes])), + ], + ) + .await?; + + // Add a row for each field column to the time_series_table_field_columns table. + for (query_schema_index, field) in time_series_table_metadata + .query_schema + .fields() + .iter() + .enumerate() + { + if field.data_type() == &ArrowValue::DATA_TYPE { + // Convert the generated column expression to bytes, if it exists. + let maybe_generated_column_expr = match time_series_table_metadata + .generated_columns + .get(query_schema_index) + { + Some(Some(generated_column)) => { + Some(generated_column.expr.to_bytes()?.to_vec()) + } + _ => None, + }; + + // error_bounds matches schema and not query_schema to simplify looking up the error + // bound during ingestion as it occurs far more often than creation of time series tables. 
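+                // An absolute error bound is stored as (value, false), a relative error bound as
+                // (value, true), and a generated column, which has no stored error bound, falls
+                // back to (0.0, false).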
+ let (error_bound_value, error_bound_is_relative) = if let Ok(schema_index) = + time_series_table_metadata.schema.index_of(field.name()) + { + match time_series_table_metadata.error_bounds[schema_index] { + ErrorBound::Absolute(value) => (value, false), + ErrorBound::Relative(value) => (value, true), + } + } else { + (0.0, false) + }; + + // query_schema_index is simply cast as a time series table contains at most 32767 columns. + self.write_columns_to_metadata_table( + "time_series_table_field_columns", + vec![ + Arc::new(StringArray::from(vec![ + time_series_table_metadata.name.clone(), + ])), + Arc::new(StringArray::from(vec![field.name().clone()])), + Arc::new(Int16Array::from(vec![query_schema_index as i16])), + Arc::new(Float32Array::from(vec![error_bound_value])), + Arc::new(BooleanArray::from(vec![error_bound_is_relative])), + Arc::new(BinaryArray::from_opt_vec(vec![ + maybe_generated_column_expr.as_deref(), + ])), + ], + ) + .await?; + } + } + + Ok(()) + } + /// Write `columns` to a metadata Delta Lake table with `table_name`. Returns an updated /// [`DeltaTable`] version if the file was written successfully, otherwise returns /// [`ModelarDbStorageError`]. From 146ae67bc3be5911dd19dfc0cc2adc13911ae560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 10 Sep 2025 11:17:41 +0000 Subject: [PATCH 09/31] Fix compile errors and missing create tables --- crates/modelardb_manager/src/main.rs | 5 +++-- crates/modelardb_server/src/manager.rs | 16 +++++++++------- crates/modelardb_storage/src/delta_lake.rs | 8 +++++--- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index d9ce1fefa..7035bf67a 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -69,8 +69,8 @@ async fn main() -> Result<()> { _ => print_usage_and_exit_with_error("remote_data_folder"), }; - let connection_info = arguments::argument_to_connection_info(remote_delta_lake_str)?; - let remote_delta_lake = DeltaLake::try_remote_from_connection_info(connection_info).await?; + let remote_storage_configuration = modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; + let remote_delta_lake = DeltaLake::try_remote_from_storage_configuration(remote_storage_configuration.clone()).await?; let nodes = remote_delta_lake.nodes().await?; let mut cluster = Cluster::new(); @@ -91,6 +91,7 @@ async fn main() -> Result<()> { // Create the Context. let context = Arc::new(Context { remote_delta_lake, + remote_storage_configuration, cluster: RwLock::new(cluster), key, }); diff --git a/crates/modelardb_server/src/manager.rs b/crates/modelardb_server/src/manager.rs index 1d1dc7980..520fc9865 100644 --- a/crates/modelardb_server/src/manager.rs +++ b/crates/modelardb_server/src/manager.rs @@ -96,7 +96,9 @@ impl Manager { /// retrieved from the remote data folder, or the tables could not be created, /// return [`ModelarDbServerError`]. 
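     ///
     /// The helpers further down in this file are assumed to do most of the work:
     /// `validate_local_tables_exist_remotely` checks that every local table also exists remotely,
     /// while `validate_normal_tables` and `validate_time_series_tables` collect the tables that
     /// only exist remotely so they can be created locally.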
pub(crate) async fn retrieve_and_create_tables(&self, context: &Arc) -> Result<()> { - let local_data_folder = &context.data_folders.local_data_folder; + let local_data_folder = &context + .data_folders + .local_data_folder; let remote_data_folder = &context .data_folders @@ -180,11 +182,11 @@ async fn validate_local_tables_exist_remotely( remote_data_folder: &DataFolder, ) -> Result<()> { let local_table_names = local_data_folder - .table_metadata_manager + .delta_lake .table_names() .await?; let remote_table_names = remote_data_folder - .table_metadata_manager + .delta_lake .table_names() .await?; @@ -215,7 +217,7 @@ async fn validate_normal_tables( let mut missing_normal_tables = vec![]; let remote_normal_tables = remote_data_folder - .table_metadata_manager + .delta_lake .normal_table_names() .await?; @@ -255,18 +257,18 @@ async fn validate_time_series_tables( let mut missing_time_series_tables = vec![]; let remote_time_series_tables = remote_data_folder - .table_metadata_manager + .delta_lake .time_series_table_names() .await?; for table_name in remote_time_series_tables { let remote_metadata = remote_data_folder - .table_metadata_manager + .delta_lake .time_series_table_metadata_for_time_series_table(&table_name) .await?; if let Ok(local_metadata) = local_data_folder - .table_metadata_manager + .delta_lake .time_series_table_metadata_for_time_series_table(&table_name) .await { diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 53541731b..06ab2bf66 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -43,9 +43,9 @@ use deltalake::protocol::{DeltaOperation, SaveMode}; use deltalake::{DeltaOps, DeltaTable, DeltaTableError}; use futures::{StreamExt, TryStreamExt}; use modelardb_types::flight::protocol; -use modelardb_types::functions::try_convert_bytes_to_schema; +use modelardb_types::functions::{try_convert_bytes_to_schema, try_convert_schema_to_bytes}; use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; -use modelardb_types::types::{ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, MAX_RETENTION_PERIOD_IN_SECONDS}; +use modelardb_types::types::{ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, MAX_RETENTION_PERIOD_IN_SECONDS}; use object_store::ObjectStore; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; @@ -757,6 +757,7 @@ impl DeltaLake { match time_series_table_metadata.error_bounds[schema_index] { ErrorBound::Absolute(value) => (value, false), ErrorBound::Relative(value) => (value, true), + ErrorBound::Lossless => (0.0, false), } } else { (0.0, false) @@ -1222,7 +1223,7 @@ mod tests { use datafusion::common::ScalarValue::Int64; use datafusion::logical_expr::Expr::Literal; use modelardb_test::table as test; - use modelardb_types::types::{ArrowTimestamp, ArrowValue}; + use modelardb_types::types::{ArrowTimestamp}; use tempfile::TempDir; // Tests for DeltaLake. 
@@ -1525,6 +1526,7 @@ mod tests { .map(|error_bound| match error_bound { ErrorBound::Absolute(value) => *value, ErrorBound::Relative(value) => *value, + ErrorBound::Lossless => 0.0, }) .collect(); From 9c477ed8927e8e8b39ab0f249aec3261549bd737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 10 Sep 2025 16:32:41 +0000 Subject: [PATCH 10/31] Normalize naming for creating IO types --- .../src/operations/data_folder.rs | 8 ++--- crates/modelardb_manager/src/main.rs | 6 ++-- crates/modelardb_manager/src/metadata.rs | 2 +- crates/modelardb_server/src/data_folders.rs | 4 +-- crates/modelardb_storage/src/delta_lake.rs | 30 +++++++++---------- .../src/optimizer/model_simple_aggregates.rs | 2 +- 6 files changed, 27 insertions(+), 25 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index b30e6f8a1..b0846133a 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -118,7 +118,7 @@ impl DataFolder { /// Creates a [`DataFolder`] that manages data in memory and returns it. If the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_memory() -> Result { - let delta_lake = DeltaLake::new_in_memory().await?; + let delta_lake = DeltaLake::open_memory().await?; Self::try_new_and_register_tables(delta_lake).await } @@ -126,7 +126,7 @@ impl DataFolder { /// returns it. If the folder does not exist and could not be created or the metadata tables /// could not be created, [`ModelarDbEmbeddedError`] is returned. pub async fn open_local(data_folder_path: &StdPath) -> Result { - let delta_lake = DeltaLake::try_from_local_path(data_folder_path).await?; + let delta_lake = DeltaLake::open_local(data_folder_path).await?; Self::try_new_and_register_tables(delta_lake).await } @@ -146,7 +146,7 @@ impl DataFolder { deltalake::aws::register_handlers(None); // Construct data folder. 
- let delta_lake = DeltaLake::try_from_s3_configuration( + let delta_lake = DeltaLake::open_s3( endpoint.clone(), bucket_name.clone(), access_key_id.clone(), @@ -165,7 +165,7 @@ impl DataFolder { access_key: String, container_name: String, ) -> Result { - let delta_lake = DeltaLake::try_from_azure_configuration( + let delta_lake = DeltaLake::open_azure( account_name.clone(), access_key.clone(), container_name.clone(), diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index 7035bf67a..cba733abe 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -70,10 +70,12 @@ async fn main() -> Result<()> { }; let remote_storage_configuration = modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; - let remote_delta_lake = DeltaLake::try_remote_from_storage_configuration(remote_storage_configuration.clone()).await?; - let nodes = remote_delta_lake.nodes().await?; + let remote_delta_lake = DeltaLake::open_object_store(remote_storage_configuration.clone()).await?; + + remote_delta_lake.create_and_register_manager_metadata_delta_lake_tables().await?; let mut cluster = Cluster::new(); + let nodes = remote_delta_lake.nodes().await?; for node in nodes { cluster.register_node(node)?; } diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 2e793aca5..a63f475f6 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -295,7 +295,7 @@ mod tests { async fn create_delta_lake() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + let delta_lake = DeltaLake::open_local(temp_dir.path()) .await .unwrap(); diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index 20dd330fb..529dd3511 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -40,7 +40,7 @@ impl DataFolder { /// folder does not exist and could not be created, or if the metadata tables could not be /// created, [`ModelarDbServerError`] is returned. pub async fn try_from_local_url(local_url: &str) -> Result { - let delta_lake = Arc::new(DeltaLake::try_from_local_url(local_url).await?); + let delta_lake = Arc::new(DeltaLake::open_local_url(local_url).await?); if local_url.starts_with("memory://") { warn!( @@ -59,7 +59,7 @@ impl DataFolder { storage_configuration: protocol::manager_metadata::StorageConfiguration, ) -> Result { let delta_lake = Arc::new( - DeltaLake::try_remote_from_storage_configuration(storage_configuration.clone()).await?, + DeltaLake::open_object_store(storage_configuration.clone()).await?, ); Ok(Self { delta_lake }) diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 06ab2bf66..83326e9f9 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -92,11 +92,11 @@ impl DeltaLake { /// `local_url` has the schema `memory`, the Delta tables are managed in memory. Return /// [`ModelarDbStorageError`] if `local_url` cannot be parsed or the metadata tables cannot be /// created. 
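     ///
     /// For example, a plain path such as `/tmp/modelardb`, a `file:///tmp/modelardb` URL, and a
     /// `memory://modelardb` URL are all assumed to be accepted, with the last one managing the
     /// Delta tables purely in memory.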
- pub async fn try_from_local_url(local_url: &str) -> Result { + pub async fn open_local_url(local_url: &str) -> Result { match local_url.split_once("://") { - None => Self::try_from_local_path(StdPath::new(local_url)).await, - Some(("file", local_path)) => Self::try_from_local_path(StdPath::new(local_path)).await, - Some(("memory", _)) => Self::new_in_memory().await, + None => Self::open_local(StdPath::new(local_url)).await, + Some(("file", local_path)) => Self::open_local(StdPath::new(local_path)).await, + Some(("memory", _)) => Self::open_memory().await, _ => Err(ModelarDbStorageError::InvalidArgument(format!( "{local_url} is not a valid local URL." ))), @@ -104,7 +104,7 @@ impl DeltaLake { } /// Create a new [`DeltaLake`] that manages the Delta tables in memory. - pub async fn new_in_memory() -> Result { + pub async fn open_memory() -> Result { let delta_lake = Self { location: "memory:///modelardb".to_owned(), storage_options: HashMap::new(), @@ -122,7 +122,7 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages the Delta tables in `data_folder_path`. Returns a /// [`ModelarDbStorageError`] if `data_folder_path` does not exist and could not be created or /// the metadata tables cannot be created. - pub async fn try_from_local_path(data_folder_path: &StdPath) -> Result { + pub async fn open_local(data_folder_path: &StdPath) -> Result { // Ensure the directories in the path exists as LocalFileSystem otherwise returns an error. fs::create_dir_all(data_folder_path) .map_err(|error| DeltaTableError::generic(error.to_string()))?; @@ -154,7 +154,7 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages Delta tables in the remote object store given by /// `storage_configuration`. Returns [`ModelarDbStorageError`] if a connection to the specified /// object store could not be created. - pub async fn try_remote_from_storage_configuration( + pub async fn open_object_store( storage_configuration: protocol::manager_metadata::StorageConfiguration, ) -> Result { match storage_configuration { @@ -164,7 +164,7 @@ impl DeltaLake { // deltalake_aws storage subcrate. deltalake::aws::register_handlers(None); - Self::try_from_s3_configuration( + Self::open_s3( s3_configuration.endpoint, s3_configuration.bucket_name, s3_configuration.access_key_id, @@ -175,7 +175,7 @@ impl DeltaLake { protocol::manager_metadata::StorageConfiguration::AzureConfiguration( azure_configuration, ) => { - Self::try_from_azure_configuration( + Self::open_azure( azure_configuration.account_name, azure_configuration.access_key, azure_configuration.container_name, @@ -188,7 +188,7 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an /// S3-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object store /// could not be made or the metadata tables cannot be created. - pub async fn try_from_s3_configuration( + pub async fn open_s3( endpoint: String, bucket_name: String, access_key_id: String, @@ -239,7 +239,7 @@ impl DeltaLake { /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an /// Azure-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object /// store could not be made or the metadata tables cannot be created. 
- pub async fn try_from_azure_configuration( + pub async fn open_azure( account_name: String, access_key: String, container_name: String, @@ -1230,7 +1230,7 @@ mod tests { #[tokio::test] async fn test_create_metadata_delta_lake_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + let delta_lake = DeltaLake::open_local(temp_dir.path()) .await .unwrap(); @@ -1457,7 +1457,7 @@ mod tests { async fn create_delta_lake_and_save_normal_tables() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + let delta_lake = DeltaLake::open_local(temp_dir.path()) .await .unwrap(); @@ -1536,7 +1536,7 @@ mod tests { #[tokio::test] async fn test_generated_columns() { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + let delta_lake = DeltaLake::open_local(temp_dir.path()) .await .unwrap(); @@ -1606,7 +1606,7 @@ mod tests { async fn create_delta_lake_and_save_time_series_table() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::try_from_local_path(temp_dir.path()) + let delta_lake = DeltaLake::open_local(temp_dir.path()) .await .unwrap(); diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 3f386c211..11bb2f08c 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -777,7 +777,7 @@ mod tests { ) -> Arc { // Setup access to data and metadata in data folder. let data_folder_path = temp_dir.path(); - let delta_lake = DeltaLake::try_from_local_path(data_folder_path) + let delta_lake = DeltaLake::open_local(data_folder_path) .await .unwrap(); From eac89fb905130acd4aa478c497adb5c5271c8e47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 15 Oct 2025 07:54:51 +0000 Subject: [PATCH 11/31] Move compression of multivariate to compression --- Cargo.lock | 1 + crates/modelardb_bulkloader/Cargo.toml | 1 + crates/modelardb_bulkloader/src/main.rs | 16 +- .../modelardb_compression/src/compression.rs | 172 +++++++++++++++++- crates/modelardb_compression/src/error.rs | 12 ++ crates/modelardb_compression/src/lib.rs | 4 +- .../modelardb_compression/src/models/swing.rs | 2 +- crates/modelardb_embedded/src/error.rs | 11 ++ .../src/operations/data_folder.rs | 171 +---------------- .../src/storage/uncompressed_data_manager.rs | 2 +- 10 files changed, 211 insertions(+), 181 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da6719404..0ce6f8163 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3044,6 +3044,7 @@ dependencies = [ "datafusion", "deltalake", "futures", + "modelardb_compression", "modelardb_embedded", "modelardb_storage", "modelardb_types", diff --git a/crates/modelardb_bulkloader/Cargo.toml b/crates/modelardb_bulkloader/Cargo.toml index 456e32316..08d13ecc0 100644 --- a/crates/modelardb_bulkloader/Cargo.toml +++ b/crates/modelardb_bulkloader/Cargo.toml @@ -31,6 +31,7 @@ arrow = { workspace = true, features = ["ffi"] } datafusion.workspace = true deltalake.workspace = true futures.workspace = true +modelardb_compression = { path = "../modelardb_compression" } modelardb_embedded = { path = "../modelardb_embedded" } modelardb_storage = { path = "../modelardb_storage" } modelardb_types = { path = "../modelardb_types" } diff --git 
a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index 63b1481e9..50367bafa 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -220,7 +220,6 @@ async fn import_time_series_table( system.refresh_memory(); if current_batch_size > (system.available_memory() as usize / 10 * 8) && let Err(write_error) = import_and_clear_time_series_table_batch( - data_folder, &mut delta_table_writer, time_series_table_metadata, &mut current_batch, @@ -234,7 +233,6 @@ async fn import_time_series_table( } if let Err(write_error) = import_and_clear_time_series_table_batch( - data_folder, &mut delta_table_writer, time_series_table_metadata, &mut current_batch, @@ -386,12 +384,11 @@ fn cast_record_batch(record_batch: RecordBatch, cast_double_to_float: bool) -> R RecordBatch::try_new(cast_schema, cast_columns).map_err(|error| error.into()) } -/// Import the `current_batch` into the time series table with `time_series_table_metadata` in -/// `data_folder` using `delta_table_writer`. Then clear `current_batch` and zero -/// `current_batch_size`. If a [`RecordBatch`] in `current_batch` has a different schema, the -/// compression fails, or the write fails, a [`ModelarDbEmbeddedError`] is returned. +/// Import the `current_batch` into the time series table with `time_series_table_metadata` using +/// `delta_table_writer`. Then clear `current_batch` and zero `current_batch_size`. If a +/// [`RecordBatch`] in `current_batch` has a different schema, the compression fails, or the write +/// fails, a [`ModelarDbEmbeddedError`] is returned. async fn import_and_clear_time_series_table_batch( - data_folder: &DataFolder, delta_table_writer: &mut DeltaTableWriter, time_series_table_metadata: &TimeSeriesTableMetadata, current_batch: &mut Vec, @@ -400,9 +397,8 @@ async fn import_and_clear_time_series_table_batch( if *current_batch_size != 0 { let schema = current_batch[0].schema(); let uncompressed_data = compute::concat_batches(&schema, &*current_batch)?; - let compressed_data = data_folder - .compress_all(time_series_table_metadata, &uncompressed_data) - .await?; + let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( + time_series_table_metadata, &uncompressed_data)?; delta_table_writer.write_all(&compressed_data).await?; current_batch.clear(); *current_batch_size = 0; diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index f2bcd83da..4abb9984c 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -19,9 +19,11 @@ use std::sync::Arc; +use arrow::array::StringArray; +use arrow::compute::{self, SortColumn, SortOptions}; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; -use modelardb_types::types::{ErrorBound, TimestampArray, ValueArray}; +use modelardb_types::types::{ErrorBound, TimeSeriesTableMetadata, TimestampArray, ValueArray}; use crate::error::{ModelarDbCompressionError, Result}; use crate::models::macaque_v::MacaqueV; @@ -35,6 +37,150 @@ use crate::types::{CompressedSegmentBatchBuilder, CompressedSegmentBuilder, Mode /// that are marked as residuals are stored as separate segments to allow for efficient pruning. const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; +/// Compress the `uncompressed_data` from the table with `time_series_table_metadata` and return the +/// resulting segments. 
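+///
+/// A minimal usage sketch, assuming `metadata` is the [`TimeSeriesTableMetadata`] of the target
+/// time series table and `uncompressed_data` is a [`RecordBatch`] that matches its schema:
+///
+/// ```ignore
+/// let compressed_segments: Vec<RecordBatch> =
+///     try_compress_multivariate_record_batch(&metadata, &uncompressed_data)?;
+/// ```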
+pub fn try_compress_multivariate_record_batch(
+    time_series_table_metadata: &TimeSeriesTableMetadata,
+    uncompressed_data: &RecordBatch,
+) -> Result<Vec<RecordBatch>> {
+    // Sort by all tags and then time to simplify splitting the data into time series.
+    let sorted_uncompressed_data =
+        sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_data)?;
+
+    // Split the sorted uncompressed data into time series and compress them separately.
+    let mut compressed_data = vec![];
+
+    let tag_column_arrays: Vec<&StringArray> = time_series_table_metadata
+        .tag_column_indices
+        .iter()
+        .map(|index| modelardb_types::array!(sorted_uncompressed_data, *index, StringArray))
+        .collect();
+
+    let mut tag_values = Vec::with_capacity(tag_column_arrays.len());
+    for tag_column_array in &tag_column_arrays {
+        tag_values.push(tag_column_array.value(0).to_owned());
+    }
+
+    // The index of the first data point of each time series must be stored so slices
+    // containing only data points for each time series can be extracted and compressed.
+    let mut row_index_start = 0;
+    for row_index in 0..sorted_uncompressed_data.num_rows() {
+        // If any of the tags differ, the data point is from a new time series.
+        let mut is_new_time_series = false;
+        for tag_column_index in 0..tag_column_arrays.len() {
+            is_new_time_series |= tag_values[tag_column_index]
+                != tag_column_arrays[tag_column_index].value(row_index);
+        }
+
+        if is_new_time_series {
+            let time_series_length = row_index - row_index_start;
+            let uncompressed_time_series =
+                sorted_uncompressed_data.slice(row_index_start, time_series_length);
+
+            try_compress_univariate_record_batch(
+                time_series_table_metadata,
+                &uncompressed_time_series,
+                &tag_values,
+                &mut compressed_data,
+            )?;
+
+            for (tag_column_index, tag_column_array) in tag_column_arrays.iter().enumerate() {
+                tag_values[tag_column_index] = tag_column_array.value(row_index).to_owned();
+            }
+
+            row_index_start = row_index;
+        }
+    }
+
+    let time_series_length = sorted_uncompressed_data.num_rows() - row_index_start;
+    let uncompressed_time_series =
+        sorted_uncompressed_data.slice(row_index_start, time_series_length);
+
+    try_compress_univariate_record_batch(
+        time_series_table_metadata,
+        &uncompressed_time_series,
+        &tag_values,
+        &mut compressed_data,
+    )?;
+
+    Ok(compressed_data)
+}
+
+/// Sort the `uncompressed_data` from the time series table with `time_series_table_metadata`
+/// according to its tags and then timestamps.
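+///
+/// Sorting by the tag columns first makes each time series one contiguous range of rows, so the
+/// caller can split the sorted batch with `slice`, and sorting by time within each range means the
+/// timestamps that are passed on to compression are already in increasing order.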
+fn sort_record_batch_by_tags_and_time( + time_series_table_metadata: &TimeSeriesTableMetadata, + uncompressed_data: &RecordBatch, +) -> Result { + let mut sort_columns = vec![]; + + let sort_options = Some(SortOptions { + descending: false, + nulls_first: false, + }); + + for tag_column_index in &time_series_table_metadata.tag_column_indices { + let tag_column = uncompressed_data.column(*tag_column_index); + sort_columns.push(SortColumn { + values: (*tag_column).clone(), + options: sort_options, + }); + } + + let timestamp_column_index = time_series_table_metadata.timestamp_column_index; + let timestamp_column = uncompressed_data.column(timestamp_column_index); + sort_columns.push(SortColumn { + values: (*timestamp_column).clone(), + options: sort_options, + }); + + + let indices = compute::lexsort_to_indices(&sort_columns, None)?; + let sorted_columns = compute::take_arrays(uncompressed_data.columns(), &indices, None)?; + RecordBatch::try_new(uncompressed_data.schema(), sorted_columns).map_err(|error| error.into()) +} + +/// Compress the field columns in `uncompressed_time_series` from the table with +/// `time_series_table_metadata` using [`try_compress_univariate_arrays`] and append the result to +/// `compressed_data`. It is assumed that all data points in `uncompressed_time_series` have the +/// same tags as in `tag_values`. +pub fn try_compress_univariate_record_batch( + time_series_table_metadata: &TimeSeriesTableMetadata, + uncompressed_time_series: &RecordBatch, + tag_values: &[String], + compressed_data: &mut Vec, +) -> Result<()> { + let uncompressed_timestamps = modelardb_types::array!( + uncompressed_time_series, + time_series_table_metadata.timestamp_column_index, + TimestampArray + ); + + for field_column_index in &time_series_table_metadata.field_column_indices { + let uncompressed_values = modelardb_types::array!( + uncompressed_time_series, + *field_column_index, + ValueArray + ); + + let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; + + let compressed_time_series = try_compress_univariate_arrays( + uncompressed_timestamps, + uncompressed_values, + error_bound, + time_series_table_metadata.compressed_schema.clone(), + tag_values.to_vec(), + *field_column_index as i16, + ) + .expect("uncompressed_timestamps and uncompressed_values should have the same length."); + + compressed_data.push(compressed_time_series); + } + + Ok(()) +} + /// Compress `uncompressed_timestamps` using a start time, end time, and a sampling interval if /// regular and delta-of-deltas followed by a variable length binary encoding if irregular. /// `uncompressed_values` is compressed within `error_bound` using the model types in `models`. @@ -45,7 +191,7 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// `uncompressed_values` have different lengths or if `compressed_schema` is not a valid schema for /// compressed segments, otherwise the resulting compressed segments are returned as a /// [`RecordBatch`] with the `compressed_schema` schema. -pub fn try_compress( +pub fn try_compress_univariate_arrays( uncompressed_timestamps: &TimestampArray, uncompressed_values: &ValueArray, error_bound: ErrorBound, @@ -278,7 +424,7 @@ mod tests { // Tests for try_compress(). 
#[test] fn test_try_compress_empty_time_series_within_lossless_error_bound() { - let compressed_record_batch = try_compress( + let compressed_record_batch = try_compress_univariate_arrays( &TimestampBuilder::new().finish(), &ValueBuilder::new().finish(), ErrorBound::Lossless, @@ -292,6 +438,20 @@ mod tests { #[test] fn test_try_compress_regular_constant_time_series_within_lossless_error_bound() { + let compressed_record_batch = try_compress_univariate_arrays( + &TimestampBuilder::new().finish(), + &ValueBuilder::new().finish(), + ErrorBound::Lossless, + compressed_schema(), + vec![TAG_VALUE.to_owned()], + 0, + ) + .unwrap(); + assert_eq!(0, compressed_record_batch.num_rows()); + } + + #[test] + fn test_try_compress_regular_constant_time_series_within_losless_error_bound() { generate_compress_and_assert_known_segment( false, ValuesStructure::Constant(None), @@ -440,7 +600,7 @@ mod tests { let uncompressed_values = data_generation::generate_values(uncompressed_timestamps.values(), values_structure); - let compressed_record_batch = try_compress( + let compressed_record_batch = try_compress_univariate_arrays( &uncompressed_timestamps, &uncompressed_values, error_bound, @@ -544,7 +704,7 @@ mod tests { let uncompressed_values = uncompressed_values.finish(); assert_eq!(uncompressed_timestamps.len(), uncompressed_values.len()); - let compressed_record_batch = try_compress( + let compressed_record_batch = try_compress_univariate_arrays( &uncompressed_timestamps, &uncompressed_values, error_bound, @@ -701,7 +861,7 @@ mod tests { 100.0..200.0, ); - let compressed_record_batch = try_compress( + let compressed_record_batch = try_compress_univariate_arrays( &uncompressed_timestamps, &uncompressed_values, error_bound, diff --git a/crates/modelardb_compression/src/error.rs b/crates/modelardb_compression/src/error.rs index 5553a6cd2..35b4ac1ed 100644 --- a/crates/modelardb_compression/src/error.rs +++ b/crates/modelardb_compression/src/error.rs @@ -19,12 +19,16 @@ use std::error::Error; use std::fmt::{Display, Formatter}; use std::result::Result as StdResult; +use arrow::error::ArrowError; + /// Result type used throughout `modelardb_compression`. pub type Result = StdResult; /// Error type used throughout `modelardb_compression`. #[derive(Debug)] pub enum ModelarDbCompressionError { + /// Error returned by Apache Arrow. + Arrow(ArrowError), /// Error returned when an invalid argument was passed. InvalidArgument(String), } @@ -32,6 +36,7 @@ pub enum ModelarDbCompressionError { impl Display for ModelarDbCompressionError { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match self { + Self::Arrow(reason) => write!(f, "Arrow Error: {reason}"), Self::InvalidArgument(reason) => write!(f, "Invalid Argument Error: {reason}"), } } @@ -41,7 +46,14 @@ impl Error for ModelarDbCompressionError { fn source(&self) -> Option<&(dyn Error + 'static)> { // Return the error that caused self to occur if one exists. match self { + Self::Arrow(reason) => Some(reason), Self::InvalidArgument(_reason) => None, } } } + +impl From for ModelarDbCompressionError { + fn from(error: ArrowError) -> Self { + Self::Arrow(error) + } +} diff --git a/crates/modelardb_compression/src/lib.rs b/crates/modelardb_compression/src/lib.rs index 888adfc18..8f179b04e 100644 --- a/crates/modelardb_compression/src/lib.rs +++ b/crates/modelardb_compression/src/lib.rs @@ -25,7 +25,9 @@ mod models; mod types; // Re-export the few functions and types users are meant to use. 
-pub use compression::try_compress; +pub use compression::try_compress_multivariate_record_batch; +pub use compression::try_compress_univariate_record_batch; +pub use compression::try_compress_univariate_arrays; pub use models::grid; pub use models::is_value_within_error_bound; pub use models::len; diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 7a136c96f..4627ad2b4 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -749,7 +749,7 @@ mod tests { compressed_schema_fields.push(Arc::new(Field::new("tag", DataType::Utf8, false))); let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); - let segments = crate::try_compress( + let segments = crate::try_compress_univariate_arrays( &timestamps, &values, error_bound, diff --git a/crates/modelardb_embedded/src/error.rs b/crates/modelardb_embedded/src/error.rs index bb4bb9395..8e84780d5 100644 --- a/crates/modelardb_embedded/src/error.rs +++ b/crates/modelardb_embedded/src/error.rs @@ -26,6 +26,7 @@ use arrow::error::ArrowError; use datafusion::error::DataFusionError; use datafusion::parquet::errors::ParquetError; use deltalake::{DeltaTableError, ObjectStoreError}; +use modelardb_compression::error::ModelarDbCompressionError; use modelardb_storage::error::ModelarDbStorageError; use modelardb_types::error::ModelarDbTypesError; use tonic::Status as TonicStatusError; @@ -47,6 +48,8 @@ pub enum ModelarDbEmbeddedError { EnvironmentVar(VarError), /// Error returned when an invalid argument was passed. InvalidArgument(String), + /// Error returned by modelardb_compression. + ModelarDbCompression(ModelarDbCompressionError), /// Error returned by modelardb_storage. ModelarDbStorage(ModelarDbStorageError), /// Error returned by modelardb_types.
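Editorial aside, not part of the patch: the new ModelarDbCompression variant exists so compression failures can propagate through `?` in modelardb_embedded once the From impl added in the next hunk is in place. A minimal sketch of that conversion, assuming the public module paths used elsewhere in this series:

    use modelardb_compression::error::ModelarDbCompressionError;
    use modelardb_embedded::error::ModelarDbEmbeddedError;

    // Convert a compression error into the embedded crate's error type; `?` performs the
    // same conversion implicitly via the From impl this patch adds.
    fn to_embedded_error(error: ModelarDbCompressionError) -> ModelarDbEmbeddedError {
        error.into()
    }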
@@ -73,6 +76,7 @@ impl Display for ModelarDbEmbeddedError { Self::DeltaLake(reason) => write!(f, "Delta Lake Error: {reason}"), Self::EnvironmentVar(reason) => write!(f, "Environment Variable Error: {reason}"), Self::InvalidArgument(reason) => write!(f, "Invalid Argument Error: {reason}"), + Self::ModelarDbCompression(reason) => write!(f, "ModelarDB Compression Error: {reason}"), Self::ModelarDbStorage(reason) => write!(f, "ModelarDB Storage Error: {reason}"), Self::ModelarDbTypes(reason) => write!(f, "ModelarDB Types Error: {reason}"), Self::ObjectStore(reason) => write!(f, "Object Store Error: {reason}"), @@ -93,6 +97,7 @@ impl Error for ModelarDbEmbeddedError { Self::DeltaLake(reason) => Some(reason), Self::EnvironmentVar(reason) => Some(reason), Self::InvalidArgument(_reason) => None, + Self::ModelarDbCompression(reason) => Some(reason), Self::ModelarDbStorage(reason) => Some(reason), Self::ModelarDbTypes(reason) => Some(reason), Self::ObjectStore(reason) => Some(reason), @@ -129,6 +134,12 @@ impl From for ModelarDbEmbeddedError { } } +impl From for ModelarDbEmbeddedError { + fn from(error: ModelarDbCompressionError) -> Self { + Self::ModelarDbCompression(error) + } +} + impl From for ModelarDbEmbeddedError { fn from(error: ModelarDbStorageError) -> Self { Self::ModelarDbStorage(error) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index b0846133a..e379c198f 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -24,22 +24,17 @@ use std::result::Result as StdResult; use std::sync::Arc; use arrow::array::RecordBatch; -use arrow::array::{Float32Array, StringArray}; -use arrow::compute::SortOptions; use arrow::datatypes::Schema; use async_trait::async_trait; use datafusion::datasource::sink::DataSink; use datafusion::error::DataFusionError; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; -use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; -use datafusion::physical_plan::expressions::Column; use datafusion::physical_plan::metrics::MetricsSet; -use datafusion::physical_plan::sorts::sort; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; use datafusion::prelude::SessionContext; use futures::TryStreamExt; use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; -use modelardb_types::types::{TimeSeriesTableMetadata, TimestampArray}; +use modelardb_types::types::TimeSeriesTableMetadata; use crate::error::{ModelarDbEmbeddedError, Result}; use crate::operations::{ @@ -219,119 +214,6 @@ impl DataFolder { Ok(data_folder) } - /// Compress the `uncompressed_data` from the table with `time_series_table_metadata` and return the - /// resulting segments. - pub async fn compress_all( - &self, - time_series_table_metadata: &TimeSeriesTableMetadata, - uncompressed_data: &RecordBatch, - ) -> Result> { - // Sort by all tags and then time to simplify splitting the data into time series. - let sorted_uncompressed_data = - sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_data)?; - - // Split the sorted uncompressed data into time series and compress them separately. 
- let mut compressed_data = vec![]; - - let tag_column_arrays: Vec<&StringArray> = time_series_table_metadata - .tag_column_indices - .iter() - .map(|index| modelardb_types::array!(sorted_uncompressed_data, *index, StringArray)) - .collect(); - - let mut tag_values = Vec::with_capacity(tag_column_arrays.len()); - for tag_column_array in &tag_column_arrays { - tag_values.push(tag_column_array.value(0).to_owned()); - } - - // The index of the first data point of each time series must be stored so slices - // containing only data points for each time series can be extracted and compressed. - let mut row_index_start = 0; - for row_index in 0..sorted_uncompressed_data.num_rows() { - // If any of the tags differ, the data point is from a new time series. - let mut is_new_time_series = false; - for tag_column_index in 0..tag_column_arrays.len() { - is_new_time_series |= tag_values[tag_column_index] - != tag_column_arrays[tag_column_index].value(row_index); - } - - if is_new_time_series { - let time_series_length = row_index - row_index_start; - let uncompressed_time_series = - sorted_uncompressed_data.slice(row_index_start, time_series_length); - - self.compress( - time_series_table_metadata, - &uncompressed_time_series, - &tag_values, - &mut compressed_data, - ) - .await?; - - for (tag_column_index, tag_column_array) in tag_column_arrays.iter().enumerate() { - tag_values[tag_column_index] = tag_column_array.value(row_index).to_owned(); - } - - row_index_start = row_index; - } - } - - let time_series_length = sorted_uncompressed_data.num_rows() - row_index_start; - let uncompressed_time_series = - sorted_uncompressed_data.slice(row_index_start, time_series_length); - - self.compress( - time_series_table_metadata, - &uncompressed_time_series, - &tag_values, - &mut compressed_data, - ) - .await?; - - Ok(compressed_data) - } - - /// Compress the field columns in `uncompressed_time_series` from the table with - /// `time_series_table_metadata` and append the result to `compressed_data`. It is assumed that - /// all data points in `uncompressed_time_series` have the same tags as in `tag_values`. - async fn compress( - &self, - time_series_table_metadata: &TimeSeriesTableMetadata, - uncompressed_time_series: &RecordBatch, - tag_values: &[String], - compressed_data: &mut Vec, - ) -> Result<()> { - let uncompressed_timestamps = modelardb_types::array!( - uncompressed_time_series, - time_series_table_metadata.timestamp_column_index, - TimestampArray - ); - - for field_column_index in &time_series_table_metadata.field_column_indices { - let uncompressed_values = modelardb_types::array!( - uncompressed_time_series, - *field_column_index, - Float32Array - ); - - let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; - - let compressed_time_series = modelardb_compression::try_compress( - uncompressed_timestamps, - uncompressed_values, - error_bound, - time_series_table_metadata.compressed_schema.clone(), - tag_values.to_vec(), - *field_column_index as i16, - ) - .expect("uncompressed_timestamps and uncompressed_values should have the same length."); - - compressed_data.push(compressed_time_series); - } - - Ok(()) - } - /// Create a writer for writing multiple batches of data to the table with the table name in /// `table_name`. If the table does not exist or a writer for it could not be created, a /// [`ModelarDbEmbeddedError`] is returned. 
@@ -494,9 +376,8 @@ impl Operations for DataFolder { return Err(schema_mismatch_error); } - let compressed_data = self - .compress_all(&time_series_table_metadata, &uncompressed_data) - .await?; + let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( + &time_series_table_metadata, &uncompressed_data)?; self.delta_lake .write_compressed_segments_to_time_series_table(table_name, compressed_data) @@ -834,44 +715,6 @@ impl Operations for DataFolder { } } -/// Sort the `uncompressed_data` from the time series table with `time_series_table_metadata` -/// according to its tags and then timestamps. -fn sort_record_batch_by_tags_and_time( - time_series_table_metadata: &TimeSeriesTableMetadata, - uncompressed_data: &RecordBatch, -) -> Result { - let mut physical_sort_exprs = vec![]; - - let sort_options = SortOptions { - descending: false, - nulls_first: false, - }; - - for tag_column_index in &time_series_table_metadata.tag_column_indices { - let field = time_series_table_metadata.schema.field(*tag_column_index); - physical_sort_exprs.push(PhysicalSortExpr { - expr: Arc::new(Column::new(field.name(), *tag_column_index)), - options: sort_options, - }); - } - - let timestamp_column_index = time_series_table_metadata.timestamp_column_index; - let field = time_series_table_metadata - .schema - .field(timestamp_column_index); - physical_sort_exprs.push(PhysicalSortExpr { - expr: Arc::new(Column::new(field.name(), timestamp_column_index)), - options: sort_options, - }); - - sort::sort_batch( - uncompressed_data, - &LexOrdering::new(physical_sort_exprs), - None, - ) - .map_err(|error| error.into()) -} - /// Compare `source_schema` and `target_schema` and return [`true`] if they have the same number of /// columns, their columns have the same types, and their columns nullability is less or equally /// restrictive in `source_schema`. Otherwise [`False`] is returned. 
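Editorial aside, not part of the patch: with compress_all and its helpers removed from DataFolder, both modelardb_embedded and the bulkloader call the compression crate directly. A minimal sketch of that call path, assuming the caller already holds the table's TimeSeriesTableMetadata and a multivariate uncompressed RecordBatch; the wrapper name compress_batch is illustrative:

    use arrow::record_batch::RecordBatch;
    use modelardb_compression::error::Result;
    use modelardb_types::types::TimeSeriesTableMetadata;

    // Sorting by tags and time, splitting into individual time series, and per-field
    // compression all happen inside try_compress_multivariate_record_batch.
    fn compress_batch(
        metadata: &TimeSeriesTableMetadata,
        uncompressed: &RecordBatch,
    ) -> Result<Vec<RecordBatch>> {
        modelardb_compression::try_compress_multivariate_record_batch(metadata, uncompressed)
    }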
@@ -903,13 +746,17 @@ fn schemas_are_compatible(source_schema: &Schema, target_schema: &Schema) -> boo mod tests { use super::*; - use arrow::array::{Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array}; + use arrow::array::{Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray}; + use arrow::compute::SortOptions; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field}; use arrow_flight::flight_service_client::FlightServiceClient; use datafusion::datasource::TableProvider; use datafusion::logical_expr::col; + use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; + use datafusion::physical_plan::expressions::Column; + use datafusion::physical_plan::sorts::sort; use modelardb_types::types::{ - ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, ValueArray, + ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, TimestampArray, ValueArray }; use tempfile::TempDir; use tonic::transport::Channel; diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 8491a7c72..077686be0 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -591,7 +591,7 @@ impl UncompressedDataManager { .map(|(uncompressed_values, field_column_index)| { let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; - modelardb_compression::try_compress( + modelardb_compression::try_compress_univariate_arrays( uncompressed_timestamps, uncompressed_values, error_bound, From 2f96f853b8f81d17e1ba4b117fe5daf7e55892a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Wed, 15 Oct 2025 12:43:51 +0000 Subject: [PATCH 12/31] Fix unit tests after rebasing branch --- crates/modelardb_storage/src/delta_lake.rs | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 83326e9f9..e7d2f07db 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -60,9 +60,6 @@ use crate::{ sql_and_concat, }; -/// Named error bound with the value 0.0 to make lossless compression more clear. -const ERROR_BOUND_ZERO: f32 = 0.0; - /// Types of tables supported by ModelarDB. 
enum TableType { NormalTable, @@ -997,8 +994,7 @@ impl DeltaLake { ); let batch = sql_and_concat(&self.session_context, &sql).await?; - let mut column_to_error_bound = - vec![ErrorBound::try_new_absolute(ERROR_BOUND_ZERO)?; query_schema_columns]; + let mut column_to_error_bound = vec![ErrorBound::Lossless; query_schema_columns]; let column_index_array = modelardb_types::array!(batch, 0, Int16Array); let error_bound_value_array = modelardb_types::array!(batch, 1, Float32Array); @@ -1009,13 +1005,15 @@ impl DeltaLake { let error_bound_value = error_bound_value_array.value(row_index); let error_bound_is_relative = error_bound_is_relative_array.value(row_index); - let error_bound = if error_bound_is_relative { - ErrorBound::try_new_relative(error_bound_value) - } else { - ErrorBound::try_new_absolute(error_bound_value) - }?; + if error_bound_value != 0.0 { + let error_bound = if error_bound_is_relative { + ErrorBound::try_new_relative(error_bound_value) + } else { + ErrorBound::try_new_absolute(error_bound_value) + }?; - column_to_error_bound[error_bound_index as usize] = error_bound; + column_to_error_bound[error_bound_index as usize] = error_bound; + } } Ok(column_to_error_bound) @@ -1550,7 +1548,7 @@ mod tests { ])); let error_bounds = vec![ - ErrorBound::try_new_absolute(ERROR_BOUND_ZERO).unwrap(); + ErrorBound::Lossless; query_schema.fields.len() ]; From 10cc29b9348b898c2aa75c7a9e2a60afb4fc5c74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Sun, 19 Oct 2025 18:56:12 +0000 Subject: [PATCH 13/31] Remove SessionContext from DataFolder --- .../src/operations/data_folder.rs | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index e379c198f..e9cd56d64 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -105,8 +105,6 @@ impl DisplayAs for DataFolderDataSink { pub struct DataFolder { /// Delta Lake for storing metadata and data in Apache Parquet files. delta_lake: DeltaLake, - /// Context providing access to a specific session of Apache DataFusion. - session_context: SessionContext, } impl DataFolder { @@ -175,11 +173,8 @@ impl DataFolder { /// [`ModelarDbEmbeddedError`] is returned. async fn try_new_and_register_tables(delta_lake: DeltaLake) -> Result { // Construct data folder. - let session_context = modelardb_storage::create_session_context(); - let data_folder = DataFolder { delta_lake, - session_context, }; // Register normal tables. 
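Editorial aside, not part of the patch: the delta_lake.rs hunks above (patch 12) drop the ERROR_BOUND_ZERO constant and instead map a stored error bound value of 0.0 directly to ErrorBound::Lossless. A minimal sketch of the per-column mapping this implies; the Result alias from modelardb_types is an assumption:

    use modelardb_types::error::Result;
    use modelardb_types::types::ErrorBound;

    // A stored error bound of 0.0 now means lossless compression regardless of whether the
    // bound was saved as relative or absolute.
    fn stored_value_to_error_bound(value: f32, is_relative: bool) -> Result<ErrorBound> {
        if value == 0.0 {
            Ok(ErrorBound::Lossless)
        } else if is_relative {
            ErrorBound::try_new_relative(value)
        } else {
            ErrorBound::try_new_absolute(value)
        }
    }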
@@ -192,7 +187,7 @@ impl DataFolder { .await?; modelardb_storage::register_normal_table( - &data_folder.session_context, + data_folder.delta_lake.session_context(), &normal_table_name, delta_table, data_sink.clone(), @@ -204,7 +199,7 @@ impl DataFolder { let delta_table = data_folder.delta_lake.delta_table(&metadata.name).await?; modelardb_storage::register_time_series_table( - &data_folder.session_context, + data_folder.delta_lake.session_context(), delta_table, metadata, data_sink.clone(), @@ -260,7 +255,7 @@ impl DataFolder { &self, table_name: &str, ) -> Option> { - let table_provider = self.session_context.table_provider(table_name).await.ok()?; + let table_provider = self.delta_lake.session_context().table_provider(table_name).await.ok()?; modelardb_storage::maybe_table_provider_to_time_series_table_metadata(table_provider) } } @@ -290,7 +285,7 @@ impl Operations for DataFolder { let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_normal_table( - &self.session_context, + self.delta_lake.session_context(), table_name, delta_table, data_sink.clone(), @@ -316,7 +311,7 @@ impl Operations for DataFolder { let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_time_series_table( - &self.session_context, + self.delta_lake.session_context(), delta_table, time_series_table_metadata, data_sink.clone(), @@ -403,7 +398,7 @@ impl Operations for DataFolder { /// Executes the SQL in `sql` and returns the result as a [`RecordBatchStream`]. If the SQL /// could not be executed, [`ModelarDbEmbeddedError`] is returned. async fn read(&mut self, sql: &str) -> Result>> { - let data_frame = self.session_context.sql(sql).await?; + let data_frame = self.delta_lake.session_context().sql(sql).await?; data_frame .execute_stream() @@ -680,7 +675,7 @@ impl Operations for DataFolder { /// returned. async fn drop(&mut self, table_name: &str) -> Result<()> { // Drop the table from the Apache Arrow DataFusion session. - self.session_context.deregister_table(table_name)?; + self.delta_lake.session_context().deregister_table(table_name)?; // Delete the table metadata from the metadata Delta Lake. self.delta_lake.drop_table_metadata(table_name).await?; @@ -801,13 +796,15 @@ mod tests { let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); assert!( new_data_folder - .session_context + .delta_lake + .session_context() .table_exist("normal_table_1") .unwrap() ); assert!( new_data_folder - .session_context + .delta_lake + .session_context() .table_exist("normal_table_2") .unwrap() ); @@ -971,13 +968,15 @@ mod tests { let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); assert!( new_data_folder - .session_context + .delta_lake + .session_context() .table_exist("time_series_table_1") .unwrap() ); assert!( new_data_folder - .session_context + .delta_lake + .session_context() .table_exist("time_series_table_2") .unwrap() ); @@ -2207,7 +2206,8 @@ mod tests { assert!( data_folder - .session_context + .delta_lake + .session_context() .table_exist(NORMAL_TABLE_NAME) .unwrap() ); @@ -2217,7 +2217,8 @@ mod tests { // Verify that the normal table was deregistered from Apache DataFusion. 
assert!( !data_folder - .session_context + .delta_lake + .session_context() .table_exist(NORMAL_TABLE_NAME) .unwrap() ); @@ -2247,7 +2248,8 @@ mod tests { assert!( data_folder - .session_context + .delta_lake + .session_context() .table_exist(TIME_SERIES_TABLE_NAME) .unwrap() ); @@ -2257,7 +2259,8 @@ mod tests { // Verify that the time series table was deregistered from Apache DataFusion. assert!( !data_folder - .session_context + .delta_lake + .session_context() .table_exist(TIME_SERIES_TABLE_NAME) .unwrap() ); @@ -2507,7 +2510,7 @@ mod tests { ); // Verify that the normal table is registered with Apache DataFusion. - assert!(data_folder.session_context.table_exist(table_name).unwrap()) + assert!(data_folder.delta_lake.session_context().table_exist(table_name).unwrap()) } #[tokio::test] @@ -2723,7 +2726,7 @@ mod tests { assert_eq!(*time_series_table_metadata.query_schema, expected_schema); // Verify that the time series table is registered with Apache DataFusion. - assert!(data_folder.session_context.table_exist(table_name).unwrap()); + assert!(data_folder.delta_lake.session_context().table_exist(table_name).unwrap()); time_series_table_metadata } From de0e838b23ae1f98604eba9d10da1add8b32bbee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 20 Oct 2025 06:17:39 +0000 Subject: [PATCH 14/31] Remove metadata and schema methods from DataFolder --- crates/modelardb_bulkloader/src/main.rs | 2 +- .../src/operations/data_folder.rs | 67 ++++++------------- crates/modelardb_storage/src/delta_lake.rs | 29 ++++++++ 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index 50367bafa..cf3cadc1d 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -169,7 +169,7 @@ async fn import( } if let Some(time_series_table_metadata) = - data_folder.time_series_table_metadata(table_name).await + data_folder.delta_lake().time_series_table_metadata_for_registered_time_series_table(table_name).await { import_time_series_table( input_stream, diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index e9cd56d64..7b6595616 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -34,7 +34,6 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; use datafusion::prelude::SessionContext; use futures::TryStreamExt; use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; -use modelardb_types::types::TimeSeriesTableMetadata; use crate::error::{ModelarDbEmbeddedError, Result}; use crate::operations::{ @@ -209,12 +208,17 @@ impl DataFolder { Ok(data_folder) } + /// Return the [`DeltaLake`] for the [`DataFolder`]. + pub fn delta_lake(&self) -> &DeltaLake { + &self.delta_lake + } + /// Create a writer for writing multiple batches of data to the table with the table name in /// `table_name`. If the table does not exist or a writer for it could not be created, a /// [`ModelarDbEmbeddedError`] is returned. 
pub async fn writer(&self, table_name: &str) -> Result { let delta_table = self.delta_lake.delta_table(table_name).await?; - if self.time_series_table_metadata(table_name).await.is_some() { + if self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await.is_some() { self.delta_lake .time_series_table_writer(delta_table) .await @@ -226,38 +230,6 @@ impl DataFolder { .map_err(|error| error.into()) } } - - /// Return the schema of the table with the name in `table_name` if it is a normal table. If the - /// table does not exist or the table is not a normal table, return [`None`]. - async fn normal_table_schema(&self, table_name: &str) -> Option { - if self - .delta_lake - .is_normal_table(table_name) - .await - .is_ok_and(|is_normal_table| is_normal_table) - { - self.delta_lake - .delta_table(table_name) - .await - .expect("Delta Lake table should exist if the table is in the metadata Delta Lake.") - .get_schema() - .expect("Delta Lake table should be loaded and metadata should be in the log.") - .try_into() - .ok() - } else { - None - } - } - - /// Return [`TimeSeriesTableMetadata`] for the table with `table_name` if it exists, is registered - /// with Apache DataFusion, and is a time series table. - pub async fn time_series_table_metadata( - &self, - table_name: &str, - ) -> Option> { - let table_provider = self.delta_lake.session_context().table_provider(table_name).await.ok()?; - modelardb_storage::maybe_table_provider_to_time_series_table_metadata(table_provider) - } } #[async_trait] @@ -334,10 +306,10 @@ impl Operations for DataFolder { /// Returns the schema of the table with the name in `table_name`. If the table does not exist, /// [`ModelarDbEmbeddedError`] is returned. async fn schema(&mut self, table_name: &str) -> Result { - if let Some(time_series_table_metadata) = self.time_series_table_metadata(table_name).await + if let Some(time_series_table_metadata) = self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await { Ok((*time_series_table_metadata.query_schema).to_owned()) - } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { + } else if let Some(normal_table_schema) = self.delta_lake.normal_table_schema(table_name).await { Ok(normal_table_schema) } else { Err(ModelarDbEmbeddedError::InvalidArgument(format!( @@ -361,7 +333,7 @@ impl Operations for DataFolder { "The uncompressed data does not match the schema for the table: {table_name}." )); - if let Some(time_series_table_metadata) = self.time_series_table_metadata(table_name).await + if let Some(time_series_table_metadata) = self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await { // Time series table. if !schemas_are_compatible( @@ -377,7 +349,7 @@ impl Operations for DataFolder { self.delta_lake .write_compressed_segments_to_time_series_table(table_name, compressed_data) .await?; - } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { + } else if let Some(normal_table_schema) = self.delta_lake.normal_table_schema(table_name).await { // Normal table. 
if !schemas_are_compatible(&uncompressed_data.schema(), &normal_table_schema) { return Err(schema_mismatch_error); @@ -425,6 +397,7 @@ impl Operations for DataFolder { })?; let target_normal_table_schema = target_data_folder + .delta_lake .normal_table_schema(target_table_name) .await .ok_or_else(|| { @@ -466,7 +439,7 @@ impl Operations for DataFolder { ) -> Result>> { // DataFolder.read() interface is designed for time series tables. let time_series_table_medata = if let Some(time_series_table_metadata) = - self.time_series_table_metadata(table_name).await + self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await { time_series_table_metadata } else { @@ -511,7 +484,8 @@ impl Operations for DataFolder { // DataFolder.copy_time_series_table() interface is designed for time series tables. let source_time_series_table_metadata = self - .time_series_table_metadata(source_table_name) + .delta_lake + .time_series_table_metadata_for_registered_time_series_table(source_table_name) .await .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument(format!( @@ -520,7 +494,8 @@ impl Operations for DataFolder { })?; let target_time_series_table_metadata = target_data_folder - .time_series_table_metadata(target_table_name) + .delta_lake + .time_series_table_metadata_for_registered_time_series_table(target_table_name) .await .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument(format!( @@ -598,9 +573,10 @@ impl Operations for DataFolder { )); if let (Some(source_time_series_table_metadata), Some(target_time_series_table_metadata)) = ( - self.time_series_table_metadata(source_table_name).await, + self.delta_lake.time_series_table_metadata_for_registered_time_series_table(source_table_name).await, target_data_folder - .time_series_table_metadata(target_table_name) + .delta_lake + .time_series_table_metadata_for_registered_time_series_table(target_table_name) .await, ) { // If both tables are time series tables, check if their schemas match and write the @@ -621,8 +597,9 @@ impl Operations for DataFolder { .write_compressed_segments_to_time_series_table(target_table_name, record_batches) .await?; } else if let (Some(source_normal_table_schema), Some(target_normal_table_schema)) = ( - self.normal_table_schema(source_table_name).await, + self.delta_lake.normal_table_schema(source_table_name).await, target_data_folder + .delta_lake .normal_table_schema(target_table_name) .await, ) { @@ -751,7 +728,7 @@ mod tests { use datafusion::physical_plan::expressions::Column; use datafusion::physical_plan::sorts::sort; use modelardb_types::types::{ - ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, TimestampArray, ValueArray + ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, TimestampArray, ValueArray }; use tempfile::TempDir; use tonic::transport::Channel; diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index e7d2f07db..e9d7dc165 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -442,6 +442,25 @@ impl DeltaLake { self.table_names_of_type(TableType::NormalTable).await } + /// Return the schema of the table with the name in `table_name` if it is a normal table. If the + /// table does not exist or the table is not a normal table, return [`None`]. 
+ pub async fn normal_table_schema(&self, table_name: &str) -> Option { + if self.is_normal_table(table_name) + .await + .is_ok_and(|is_normal_table| is_normal_table) + { + self.delta_table(table_name) + .await + .expect("Delta Lake table should exist if the table is in the metadata Delta Lake.") + .get_schema() + .expect("Delta Lake table should be loaded and metadata should be in the log.") + .try_into() + .ok() + } else { + None + } + } + /// Return the name of each time series table currently in the metadata Delta Lake. Note that /// this does not include normal tables. If the time series table names cannot be retrieved, /// [`ModelarDbStorageError`] is returned. @@ -953,6 +972,16 @@ impl DeltaLake { .await } + /// Return [`TimeSeriesTableMetadata`] for the time series table with `table_name` if it exists, + /// is registered with Apache DataFusion, and is a time series table. + pub async fn time_series_table_metadata_for_registered_time_series_table( + &self, + table_name: &str, + ) -> Option> { + let table_provider = self.session_context.table_provider(table_name).await.ok()?; + crate::maybe_table_provider_to_time_series_table_metadata(table_provider) + } + /// Convert a row from the table "time_series_table_metadata" to an instance of /// [`TimeSeriesTableMetadata`]. Returns [`ModelarDbStorageError`] if a time_series table with /// `table_name` does not exist or the bytes in `query_schema_bytes` are not a valid schema. From 866a0ef7d976e2f52320b2ac19e65b56819c9957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 20 Oct 2025 07:08:07 +0000 Subject: [PATCH 15/31] Move writer to DeltaLake table_writer --- crates/modelardb_bulkloader/src/main.rs | 4 ++-- .../src/operations/data_folder.rs | 20 +------------------ crates/modelardb_storage/src/delta_lake.rs | 16 +++++++++++++++ 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index cf3cadc1d..ca21caa41 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -205,7 +205,7 @@ async fn import_time_series_table( cast_double_to_float: bool, ) -> Result<()> { let table_name = &time_series_table_metadata.name; - let mut delta_table_writer = data_folder.writer(table_name).await?; + let mut delta_table_writer = data_folder.delta_lake().table_writer(table_name).await?; let mut system = System::new(); let mut current_batch = vec![]; @@ -256,7 +256,7 @@ async fn import_normal_table( table_name: &str, data_folder: &mut DataFolder, ) -> Result<()> { - let mut delta_table_writer = data_folder.writer(table_name).await?; + let mut delta_table_writer = data_folder.delta_lake().table_writer(table_name).await?; while let Some(record_batch) = input_stream.next().await { let record_batch = record_batch?; diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 7b6595616..8cac7bdb6 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -33,7 +33,7 @@ use datafusion::physical_plan::metrics::MetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; use datafusion::prelude::SessionContext; use futures::TryStreamExt; -use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; +use modelardb_storage::delta_lake::DeltaLake; use crate::error::{ModelarDbEmbeddedError, Result}; use crate::operations::{ @@ -212,24 
+212,6 @@ impl DataFolder { pub fn delta_lake(&self) -> &DeltaLake { &self.delta_lake } - - /// Create a writer for writing multiple batches of data to the table with the table name in - /// `table_name`. If the table does not exist or a writer for it could not be created, a - /// [`ModelarDbEmbeddedError`] is returned. - pub async fn writer(&self, table_name: &str) -> Result<DeltaTableWriter> { - let delta_table = self.delta_lake.delta_table(table_name).await?; - if self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await.is_some() { - self.delta_lake - .time_series_table_writer(delta_table) - .await - .map_err(|error| error.into()) - } else { - self.delta_lake - .normal_or_metadata_table_writer(delta_table) - .await - .map_err(|error| error.into()) - } - } } #[async_trait] diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index e9d7dc165..0abfbb3fe 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/delta_lake.rs @@ -483,6 +483,22 @@ impl DeltaLake { Ok(table_names.iter().flatten().map(str::to_owned).collect()) } + /// Return a [`DeltaTableWriter`] for writing to the table with `table_name` in the Delta Lake, + /// or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or + /// the table does not exist. + pub async fn table_writer(&self, table_name: &str) -> Result<DeltaTableWriter> { + let delta_table = self.delta_table(table_name).await?; + if self.time_series_table_metadata_for_registered_time_series_table(table_name).await.is_some() { + self.time_series_table_writer(delta_table) + .await + .map_err(|error| error.into()) + } else { + self.normal_or_metadata_table_writer(delta_table) + .await + .map_err(|error| error.into()) + } + } + /// Return a [`DeltaTableWriter`] for writing to the time series table with `delta_table` in the /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be /// established or the table does not exist.
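Editorial aside, not part of the patch: with writer moved to DeltaLake::table_writer, callers such as the bulkloader no longer need to know whether a table is a normal table or a time series table before writing. A minimal sketch of the resulting write path, assuming the table already exists, that the Result alias from modelardb_storage is the intended return type, and leaving out any final commit or close of the writer:

    use arrow::record_batch::RecordBatch;
    use modelardb_storage::delta_lake::DeltaLake;
    use modelardb_storage::error::Result;

    // table_writer picks a time series table writer or a normal table writer internally.
    async fn write_batches(
        delta_lake: &DeltaLake,
        table_name: &str,
        batches: Vec<RecordBatch>,
    ) -> Result<()> {
        let mut writer = delta_lake.table_writer(table_name).await?;
        writer.write_all(&batches).await?;
        Ok(())
    }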
From a09f1f1aa6a00427248a995cfd91f229eb2fd1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 20 Oct 2025 09:21:59 +0000 Subject: [PATCH 16/31] Move register to DeltaLake and remove DataFolder --- crates/modelardb_bulkloader/src/main.rs | 23 +- crates/modelardb_embedded/src/capi.rs | 39 ++- .../src/operations/data_folder.rs | 308 +++++------------- crates/modelardb_storage/src/delta_lake.rs | 41 ++- 4 files changed, 153 insertions(+), 258 deletions(-) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index ca21caa41..6a241fe26 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -39,8 +39,7 @@ use deltalake::{ObjectStore, Path}; use futures::stream::StreamExt; use modelardb_embedded::error::{ModelarDbEmbeddedError, Result}; use modelardb_embedded::operations::Operations; -use modelardb_embedded::operations::data_folder::DataFolder; -use modelardb_storage::delta_lake::DeltaTableWriter; +use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; use modelardb_types::types::TimeSeriesTableMetadata; use sysinfo::System; @@ -169,7 +168,7 @@ async fn import( } if let Some(time_series_table_metadata) = - data_folder.delta_lake().time_series_table_metadata_for_registered_time_series_table(table_name).await + data_folder.time_series_table_metadata_for_registered_time_series_table(table_name).await { import_time_series_table( input_stream, @@ -201,11 +200,11 @@ async fn import( async fn import_time_series_table( mut input_stream: Pin>, time_series_table_metadata: &TimeSeriesTableMetadata, - data_folder: &mut DataFolder, + data_folder: &mut DeltaLake, cast_double_to_float: bool, ) -> Result<()> { let table_name = &time_series_table_metadata.name; - let mut delta_table_writer = data_folder.delta_lake().table_writer(table_name).await?; + let mut delta_table_writer = data_folder.table_writer(table_name).await?; let mut system = System::new(); let mut current_batch = vec![]; @@ -254,9 +253,9 @@ async fn import_time_series_table( async fn import_normal_table( mut input_stream: Pin>, table_name: &str, - data_folder: &mut DataFolder, + data_folder: &mut DeltaLake, ) -> Result<()> { - let mut delta_table_writer = data_folder.delta_lake().table_writer(table_name).await?; + let mut delta_table_writer = data_folder.table_writer(table_name).await?; while let Some(record_batch) = input_stream.next().await { let record_batch = record_batch?; @@ -484,30 +483,30 @@ async fn export( /// Returns a [`DataFolder`] for `data_folder_path`. If the necessary environment variables are not /// set for S3 and Azure or the [`DataFolder`] cannot access `data_folder_path`, a /// [`ModelarDbEmbeddedError`] is returned. 
-async fn create_data_folder(data_folder_path: &str) -> Result { +async fn create_data_folder(data_folder_path: &str) -> Result { match data_folder_path.split_once("://") { Some(("s3", bucket_name)) => { let endpoint = env::var("AWS_ENDPOINT")?; let access_key_id = env::var("AWS_ACCESS_KEY_ID")?; let secret_access_key = env::var("AWS_SECRET_ACCESS_KEY")?; - DataFolder::open_s3( + DeltaLake::open_s3( endpoint, bucket_name.to_owned(), access_key_id, secret_access_key, ) - .await + .await.map_err(|error| error.into()) } Some(("az", container_name)) => { let account_name = env::var("AZURE_STORAGE_ACCOUNT_NAME")?; let access_key = env::var("AZURE_STORAGE_ACCESS_KEY")?; - DataFolder::open_azure(account_name, access_key, container_name.to_owned()).await + DeltaLake::open_azure(account_name, access_key, container_name.to_owned()).await.map_err(|error| error.into()) } _ => { let data_folder_path = StdPath::new(data_folder_path); - DataFolder::open_local(data_folder_path).await + DeltaLake::open_local(data_folder_path).await.map_err(|error| error.into()) } } } diff --git a/crates/modelardb_embedded/src/capi.rs b/crates/modelardb_embedded/src/capi.rs index ff2a8a44c..44abd232a 100644 --- a/crates/modelardb_embedded/src/capi.rs +++ b/crates/modelardb_embedded/src/capi.rs @@ -39,13 +39,14 @@ use std::sync::{Arc, LazyLock}; use arrow::array::{self, Array, Float32Array, Int8Array, MapArray, StringArray, StructArray}; use arrow::ffi::{self, FFI_ArrowArray, FFI_ArrowSchema}; use arrow::record_batch::RecordBatch; +use modelardb_storage::delta_lake::DeltaLake; use modelardb_types::types::ErrorBound; use tokio::runtime::Runtime; use crate::error::{ModelarDbEmbeddedError, Result}; +use crate::operations::data_folder::DataFolderDataSink; use crate::operations::Operations; use crate::operations::client::{Client, Node}; -use crate::operations::data_folder::DataFolder; use crate::record_batch_stream_to_record_batch; use crate::{Aggregate, TableType}; @@ -87,8 +88,11 @@ pub unsafe extern "C" fn modelardb_embedded_open_memory() -> *const c_void { } /// See documentation for [`modelardb_embedded_open_memory`]. -fn open_memory() -> Result { - TOKIO_RUNTIME.block_on(DataFolder::open_memory()) +fn open_memory() -> Result { + let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_memory())?; + let data_sink = Arc::new(DataFolderDataSink::new()); + TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; + Ok(delta_lake) } /// Creates a [`DataFolder`] that manages data in the local folder at `data_folder_path_path` and @@ -103,11 +107,14 @@ pub unsafe extern "C" fn modelardb_embedded_open_local( } /// See documentation for [`modelardb_embedded_open_local`]. -unsafe fn open_local(data_folder_path_ptr: *const c_char) -> Result { +unsafe fn open_local(data_folder_path_ptr: *const c_char) -> Result { let data_folder_str = unsafe { c_char_ptr_to_str(data_folder_path_ptr)? 
}; let data_folder_path = StdPath::new(data_folder_str); - TOKIO_RUNTIME.block_on(DataFolder::open_local(data_folder_path)) + let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_local(data_folder_path))?; + let data_sink = Arc::new(DataFolderDataSink::new()); + TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; + Ok(delta_lake) } /// Creates a [`DataFolder`] that manages data in an object store with a S3-compatible API and @@ -139,18 +146,21 @@ unsafe fn open_s3( bucket_name_ptr: *const c_char, access_key_id_ptr: *const c_char, secret_access_key_ptr: *const c_char, -) -> Result { +) -> Result { let endpoint = unsafe { c_char_ptr_to_str(endpoint_ptr)? }; let bucket_name = unsafe { c_char_ptr_to_str(bucket_name_ptr)? }; let access_key_id = unsafe { c_char_ptr_to_str(access_key_id_ptr)? }; let secret_access_key = unsafe { c_char_ptr_to_str(secret_access_key_ptr)? }; - TOKIO_RUNTIME.block_on(DataFolder::open_s3( + let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_s3( endpoint.to_owned(), bucket_name.to_owned(), access_key_id.to_owned(), secret_access_key.to_owned(), - )) + ))?; + let data_sink = Arc::new(DataFolderDataSink::new()); + TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; + Ok(delta_lake) } /// Creates a [`DataFolder`] that manages data in an object store with an Azure-compatible API and @@ -172,16 +182,19 @@ unsafe fn open_azure( account_name_ptr: *const c_char, access_key_ptr: *const c_char, container_name_ptr: *const c_char, -) -> Result { +) -> Result { let account_name = unsafe { c_char_ptr_to_str(account_name_ptr)? }; let access_key = unsafe { c_char_ptr_to_str(access_key_ptr)? }; let container_name = unsafe { c_char_ptr_to_str(container_name_ptr)? }; - TOKIO_RUNTIME.block_on(DataFolder::open_azure( + let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_azure( account_name.to_owned(), access_key.to_owned(), container_name.to_owned(), - )) + ))?; + let data_sink = Arc::new(DataFolderDataSink::new()); + TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; + Ok(delta_lake) } /// Creates a [`Client`] that is connected to the Apache Arrow Flight server URL in `node_url_ptr` @@ -233,7 +246,7 @@ pub unsafe extern "C" fn modelardb_embedded_close( is_data_folder: bool, ) -> c_int { if is_data_folder { - let maybe_data_folder_ptr: *mut DataFolder = maybe_operations_ptr.cast(); + let maybe_data_folder_ptr: *mut DeltaLake = maybe_operations_ptr.cast(); if !maybe_data_folder_ptr.is_null() && maybe_data_folder_ptr.is_aligned() { // The box is assigned to _data_folder as Box::from_raw() is #[must_use]. 
let _data_folder = unsafe { Box::from_raw(maybe_data_folder_ptr) }; @@ -1022,7 +1035,7 @@ unsafe fn c_void_to_operations<'a>( is_data_folder: bool, ) -> Result<&'a mut dyn Operations> { if is_data_folder { - let maybe_data_folder_ptr: *mut DataFolder = maybe_operations_ptr.cast(); + let maybe_data_folder_ptr: *mut DeltaLake = maybe_operations_ptr.cast(); if !maybe_data_folder_ptr.is_null() && maybe_data_folder_ptr.is_aligned() { unsafe { Ok(&mut *maybe_data_folder_ptr) } } else { diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 8cac7bdb6..a73b5a2e5 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -18,7 +18,6 @@ use std::any::Any; use std::collections::HashMap; use std::fmt::{Debug, Formatter, Result as FmtResult}; -use std::path::Path as StdPath; use std::pin::Pin; use std::result::Result as StdResult; use std::sync::Arc; @@ -42,13 +41,13 @@ use crate::operations::{ use crate::{Aggregate, TableType}; /// [`DataSink`] that rejects INSERT statements passed to [`DataFolder.read()`]. -struct DataFolderDataSink { +pub struct DataFolderDataSink { /// The schema of the data sink is empty since it rejects everything. schema: Arc, } impl DataFolderDataSink { - fn new() -> Self { + pub fn new() -> Self { Self { schema: Arc::new(Schema::empty()), } @@ -100,122 +99,8 @@ impl DisplayAs for DataFolderDataSink { } } -/// Provides access to modelardb_embedded's components. -pub struct DataFolder { - /// Delta Lake for storing metadata and data in Apache Parquet files. - delta_lake: DeltaLake, -} - -impl DataFolder { - /// Creates a [`DataFolder`] that manages data in memory and returns it. If the metadata tables - /// could not be created, [`ModelarDbEmbeddedError`] is returned. - pub async fn open_memory() -> Result { - let delta_lake = DeltaLake::open_memory().await?; - Self::try_new_and_register_tables(delta_lake).await - } - - /// Creates a [`DataFolder`] that manages data in the local folder at `data_folder_path` and - /// returns it. If the folder does not exist and could not be created or the metadata tables - /// could not be created, [`ModelarDbEmbeddedError`] is returned. - pub async fn open_local(data_folder_path: &StdPath) -> Result { - let delta_lake = DeltaLake::open_local(data_folder_path).await?; - Self::try_new_and_register_tables(delta_lake).await - } - - /// Creates a [`DataFolder`] that manages data in an object store with an S3-compatible API and - /// returns it. If a connection to the object store could not be established or the metadata - /// tables could not be created, [`ModelarDbEmbeddedError`] is returned. - pub async fn open_s3( - endpoint: String, - bucket_name: String, - access_key_id: String, - secret_access_key: String, - ) -> Result { - // Register the S3 storage handlers to allow the use of Amazon S3 object stores. This is - // required at runtime to initialize the S3 storage implementation in the deltalake_aws - // storage subcrate. It is safe to call this function multiple times as the handlers are - // stored in a DashMap, thus, the handlers are simply overwritten with the same each time. - deltalake::aws::register_handlers(None); - - // Construct data folder. 
- let delta_lake = DeltaLake::open_s3( - endpoint.clone(), - bucket_name.clone(), - access_key_id.clone(), - secret_access_key.clone(), - ) - .await?; - - Self::try_new_and_register_tables(delta_lake).await - } - - /// Creates a [`DataFolder`] that manages data in an object store with an Azure-compatible API - /// and returns it. If a connection to the object store could not be established or the metadata - /// tables could not be created, [`ModelarDbEmbeddedError`] is returned. - pub async fn open_azure( - account_name: String, - access_key: String, - container_name: String, - ) -> Result { - let delta_lake = DeltaLake::open_azure( - account_name.clone(), - access_key.clone(), - container_name.clone(), - ) - .await?; - - Self::try_new_and_register_tables(delta_lake).await - } - - /// Create a [`DataFolder`], register all normal tables and time series tables in it with its - /// [`SessionContext`], and return it. If the tables could not be registered, - /// [`ModelarDbEmbeddedError`] is returned. - async fn try_new_and_register_tables(delta_lake: DeltaLake) -> Result { - // Construct data folder. - let data_folder = DataFolder { - delta_lake, - }; - - // Register normal tables. - let data_sink = Arc::new(DataFolderDataSink::new()); - - for normal_table_name in data_folder.delta_lake.normal_table_names().await? { - let delta_table = data_folder - .delta_lake - .delta_table(&normal_table_name) - .await?; - - modelardb_storage::register_normal_table( - data_folder.delta_lake.session_context(), - &normal_table_name, - delta_table, - data_sink.clone(), - )?; - } - - // Register time series tables. - for metadata in data_folder.delta_lake.time_series_table_metadata().await? { - let delta_table = data_folder.delta_lake.delta_table(&metadata.name).await?; - - modelardb_storage::register_time_series_table( - data_folder.delta_lake.session_context(), - delta_table, - metadata, - data_sink.clone(), - )?; - } - - Ok(data_folder) - } - - /// Return the [`DeltaLake`] for the [`DataFolder`]. - pub fn delta_lake(&self) -> &DeltaLake { - &self.delta_lake - } -} - #[async_trait] -impl Operations for DataFolder { +impl Operations for DeltaLake { /// Return `self` as [`Any`] so it can be downcast. 
fn as_any(&self) -> &dyn Any { self @@ -227,19 +112,16 @@ impl Operations for DataFolder { async fn create(&mut self, table_name: &str, table_type: TableType) -> Result<()> { match table_type { TableType::NormalTable(schema) => { - let delta_table = self - .delta_lake - .create_normal_table(table_name, &schema) + let delta_table = self.create_normal_table(table_name, &schema) .await?; - self.delta_lake - .save_normal_table_metadata(table_name) + self.save_normal_table_metadata(table_name) .await?; let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_normal_table( - self.delta_lake.session_context(), + self.session_context(), table_name, delta_table, data_sink.clone(), @@ -254,18 +136,16 @@ impl Operations for DataFolder { )?); let delta_table = self - .delta_lake .create_time_series_table(&time_series_table_metadata) .await?; - self.delta_lake - .save_time_series_table_metadata(&time_series_table_metadata) + self.save_time_series_table_metadata(&time_series_table_metadata) .await?; let data_sink = Arc::new(DataFolderDataSink::new()); modelardb_storage::register_time_series_table( - self.delta_lake.session_context(), + self.session_context(), delta_table, time_series_table_metadata, data_sink.clone(), @@ -279,8 +159,7 @@ impl Operations for DataFolder { /// Returns the name of all the tables. If the table names could not be retrieved from the /// metadata Delta Lake, [`ModelarDbEmbeddedError`] is returned. async fn tables(&mut self) -> Result> { - self.delta_lake - .table_names() + self.table_names() .await .map_err(|error| error.into()) } @@ -288,10 +167,10 @@ impl Operations for DataFolder { /// Returns the schema of the table with the name in `table_name`. If the table does not exist, /// [`ModelarDbEmbeddedError`] is returned. async fn schema(&mut self, table_name: &str) -> Result { - if let Some(time_series_table_metadata) = self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await + if let Some(time_series_table_metadata) = self.time_series_table_metadata_for_registered_time_series_table(table_name).await { Ok((*time_series_table_metadata.query_schema).to_owned()) - } else if let Some(normal_table_schema) = self.delta_lake.normal_table_schema(table_name).await { + } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { Ok(normal_table_schema) } else { Err(ModelarDbEmbeddedError::InvalidArgument(format!( @@ -315,7 +194,7 @@ impl Operations for DataFolder { "The uncompressed data does not match the schema for the table: {table_name}." )); - if let Some(time_series_table_metadata) = self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await + if let Some(time_series_table_metadata) = self.time_series_table_metadata_for_registered_time_series_table(table_name).await { // Time series table. if !schemas_are_compatible( @@ -328,17 +207,15 @@ impl Operations for DataFolder { let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( &time_series_table_metadata, &uncompressed_data)?; - self.delta_lake - .write_compressed_segments_to_time_series_table(table_name, compressed_data) + self.write_compressed_segments_to_time_series_table(table_name, compressed_data) .await?; - } else if let Some(normal_table_schema) = self.delta_lake.normal_table_schema(table_name).await { + } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { // Normal table. 
if !schemas_are_compatible(&uncompressed_data.schema(), &normal_table_schema) { return Err(schema_mismatch_error); } - self.delta_lake - .write_record_batches_to_normal_table(table_name, vec![uncompressed_data]) + self.write_record_batches_to_normal_table(table_name, vec![uncompressed_data]) .await?; } else { return Err(ModelarDbEmbeddedError::InvalidArgument(format!( @@ -352,7 +229,7 @@ impl Operations for DataFolder { /// Executes the SQL in `sql` and returns the result as a [`RecordBatchStream`]. If the SQL /// could not be executed, [`ModelarDbEmbeddedError`] is returned. async fn read(&mut self, sql: &str) -> Result>> { - let data_frame = self.delta_lake.session_context().sql(sql).await?; + let data_frame = self.session_context().sql(sql).await?; data_frame .execute_stream() @@ -373,13 +250,12 @@ impl Operations for DataFolder { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; let target_normal_table_schema = target_data_folder - .delta_lake .normal_table_schema(target_table_name) .await .ok_or_else(|| { @@ -398,7 +274,6 @@ impl Operations for DataFolder { let record_batches = common::collect(record_batch_stream).await?; target_data_folder - .delta_lake .write_record_batches_to_normal_table(target_table_name, record_batches) .await?; @@ -421,7 +296,7 @@ impl Operations for DataFolder { ) -> Result>> { // DataFolder.read() interface is designed for time series tables. let time_series_table_medata = if let Some(time_series_table_metadata) = - self.delta_lake.time_series_table_metadata_for_registered_time_series_table(table_name).await + self.time_series_table_metadata_for_registered_time_series_table(table_name).await { time_series_table_metadata } else { @@ -459,14 +334,13 @@ impl Operations for DataFolder { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; // DataFolder.copy_time_series_table() interface is designed for time series tables. let source_time_series_table_metadata = self - .delta_lake .time_series_table_metadata_for_registered_time_series_table(source_table_name) .await .ok_or_else(|| { @@ -476,7 +350,6 @@ impl Operations for DataFolder { })?; let target_time_series_table_metadata = target_data_folder - .delta_lake .time_series_table_metadata_for_registered_time_series_table(target_table_name) .await .ok_or_else(|| { @@ -516,7 +389,7 @@ impl Operations for DataFolder { let sql = format!("SELECT * FROM {source_table_name} {where_clause}"); // Read data to copy from source_table_name in source. - let source_table = Arc::new(self.delta_lake.delta_table(source_table_name).await?); + let source_table = Arc::new(self.delta_table(source_table_name).await?); let session_context = SessionContext::new(); session_context.register_table(source_table_name, source_table)?; @@ -526,7 +399,6 @@ impl Operations for DataFolder { // Write read data to target_table_name in target. 
target_data_folder - .delta_lake .write_compressed_segments_to_time_series_table(target_table_name, record_batches) .await?; @@ -545,7 +417,7 @@ impl Operations for DataFolder { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; @@ -555,9 +427,8 @@ impl Operations for DataFolder { )); if let (Some(source_time_series_table_metadata), Some(target_time_series_table_metadata)) = ( - self.delta_lake.time_series_table_metadata_for_registered_time_series_table(source_table_name).await, + self.time_series_table_metadata_for_registered_time_series_table(source_table_name).await, target_data_folder - .delta_lake .time_series_table_metadata_for_registered_time_series_table(target_table_name) .await, ) { @@ -570,18 +441,16 @@ impl Operations for DataFolder { return Err(schema_mismatch_error); } - let delta_ops = self.delta_lake.delta_ops(source_table_name).await?; + let delta_ops = self.delta_ops(source_table_name).await?; let (_table, stream) = delta_ops.load().await?; let record_batches: Vec = stream.try_collect().await?; target_data_folder - .delta_lake .write_compressed_segments_to_time_series_table(target_table_name, record_batches) .await?; } else if let (Some(source_normal_table_schema), Some(target_normal_table_schema)) = ( - self.delta_lake.normal_table_schema(source_table_name).await, + self.normal_table_schema(source_table_name).await, target_data_folder - .delta_lake .normal_table_schema(target_table_name) .await, ) { @@ -591,12 +460,11 @@ impl Operations for DataFolder { return Err(schema_mismatch_error); } - let delta_ops = self.delta_lake.delta_ops(source_table_name).await?; + let delta_ops = self.delta_ops(source_table_name).await?; let (_table, stream) = delta_ops.load().await?; let record_batches: Vec = stream.try_collect().await?; target_data_folder - .delta_lake .write_record_batches_to_normal_table(target_table_name, record_batches) .await?; } else { @@ -616,8 +484,7 @@ impl Operations for DataFolder { /// Delta Lake. If the data could not be deleted, [`ModelarDbEmbeddedError`] is returned. async fn truncate(&mut self, table_name: &str) -> Result<()> { if self.tables().await?.contains(&table_name.to_owned()) { - self.delta_lake - .truncate_table(table_name) + self.truncate_table(table_name) .await .map_err(|error| error.into()) } else { @@ -634,13 +501,13 @@ impl Operations for DataFolder { /// returned. async fn drop(&mut self, table_name: &str) -> Result<()> { // Drop the table from the Apache Arrow DataFusion session. - self.delta_lake.session_context().deregister_table(table_name)?; + self.session_context().deregister_table(table_name)?; // Delete the table metadata from the metadata Delta Lake. - self.delta_lake.drop_table_metadata(table_name).await?; + self.drop_table_metadata(table_name).await?; // Drop the table from the Delta Lake. 
- self.delta_lake.drop_table(table_name).await?; + self.drop_table(table_name).await?; Ok(()) } @@ -657,8 +524,7 @@ impl Operations for DataFolder { maybe_retention_period_in_seconds: Option, ) -> Result<()> { if self.tables().await?.contains(&table_name.to_owned()) { - self.delta_lake - .vacuum_table(table_name, maybe_retention_period_in_seconds) + self.vacuum_table(table_name, maybe_retention_period_in_seconds) .await .map_err(|error| error.into()) } else { @@ -733,7 +599,7 @@ mod tests { #[tokio::test] async fn test_register_existing_normal_tables_on_open() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -752,17 +618,17 @@ mod tests { .unwrap(); // Create a new data folder and verify that the existing normal tables are registered. - let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let new_data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let data_sink = Arc::new(DataFolderDataSink::new()); + new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); assert!( new_data_folder - .delta_lake .session_context() .table_exist("normal_table_1") .unwrap() ); assert!( new_data_folder - .delta_lake .session_context() .table_exist("normal_table_2") .unwrap() @@ -772,7 +638,7 @@ mod tests { #[tokio::test] async fn test_create_normal_table_with_empty_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create(NORMAL_TABLE_NAME, TableType::NormalTable(Schema::empty())) @@ -788,7 +654,7 @@ mod tests { #[tokio::test] async fn test_create_existing_normal_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -826,7 +692,7 @@ mod tests { #[tokio::test] async fn test_create_time_series_table_with_error_bounds() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let error_bounds = HashMap::from([ ( @@ -897,7 +763,7 @@ mod tests { #[tokio::test] async fn test_register_existing_time_series_tables_on_open() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -924,17 +790,17 @@ mod tests { .unwrap(); // Create a new data folder and verify that the existing time series tables are registered. 
- let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let new_data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let data_sink = Arc::new(DataFolderDataSink::new()); + new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); assert!( new_data_folder - .delta_lake .session_context() .table_exist("time_series_table_1") .unwrap() ); assert!( new_data_folder - .delta_lake .session_context() .table_exist("time_series_table_2") .unwrap() @@ -944,7 +810,7 @@ mod tests { #[tokio::test] async fn test_create_time_series_table_with_empty_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -963,7 +829,7 @@ mod tests { #[tokio::test] async fn test_create_existing_time_series_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -998,7 +864,7 @@ mod tests { #[tokio::test] async fn test_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let table_names = data_folder.tables().await.unwrap(); assert!(table_names.is_empty()); @@ -1046,7 +912,7 @@ mod tests { #[tokio::test] async fn test_missing_table_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.schema(MISSING_TABLE_NAME).await; @@ -1060,7 +926,6 @@ mod tests { async fn test_write_to_normal_table() { let (_temp_dir, mut data_folder) = create_data_folder_with_normal_table().await; let mut delta_table = data_folder - .delta_lake .delta_table(NORMAL_TABLE_NAME) .await .unwrap(); @@ -1110,7 +975,6 @@ mod tests { async fn test_write_to_time_series_table() { let (_temp_dir, mut data_folder) = create_data_folder_with_time_series_table().await; let mut delta_table = data_folder - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -1159,7 +1023,7 @@ mod tests { #[tokio::test] async fn test_write_to_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .write(MISSING_TABLE_NAME, time_series_table_data()) @@ -1195,7 +1059,7 @@ mod tests { #[tokio::test] async fn test_read_time_series_table_from_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder_read_time_series_table( &mut data_folder, @@ -1549,7 +1413,7 @@ mod tests { } async fn data_folder_read_time_series_table( - data_folder: &mut DataFolder, + data_folder: &mut DeltaLake, table_name: &str, columns: &[(String, Aggregate)], group_by: &[String], @@ -1628,7 +1492,7 @@ mod tests { #[tokio::test] async fn test_copy_time_series_table_from_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let 
source = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, target) = create_data_folder_with_time_series_table().await; @@ -1654,7 +1518,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); source .write(TIME_SERIES_TABLE_NAME, time_series_table_data()) @@ -1689,7 +1553,7 @@ mod tests { .unwrap(); let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let table_type = TableType::TimeSeriesTable(invalid_table_schema(), HashMap::new(), HashMap::new()); @@ -1943,7 +1807,7 @@ mod tests { #[tokio::test] async fn test_read_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let sql = format!("SELECT * FROM {MISSING_TABLE_NAME}"); let result = data_folder_read(&mut data_folder, &sql).await; @@ -1989,7 +1853,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let schema = normal_table_schema().project(&[0, 1]).unwrap(); target @@ -2050,7 +1914,7 @@ mod tests { // Create a normal table that has the same schema as the time series table in source. let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let schema = time_series_table_schema(); target @@ -2106,7 +1970,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); source .write(NORMAL_TABLE_NAME, normal_table_data()) @@ -2127,7 +1991,7 @@ mod tests { #[tokio::test] async fn test_copy_normal_table_from_missing_table_to_normal_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut source = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, mut target) = create_data_folder_with_normal_table().await; @@ -2165,7 +2029,6 @@ mod tests { assert!( data_folder - .delta_lake .session_context() .table_exist(NORMAL_TABLE_NAME) .unwrap() @@ -2176,7 +2039,6 @@ mod tests { // Verify that the normal table was deregistered from Apache DataFusion. assert!( !data_folder - .delta_lake .session_context() .table_exist(NORMAL_TABLE_NAME) .unwrap() @@ -2185,7 +2047,6 @@ mod tests { // Verify that the normal table was dropped from the metadata Delta Lake. assert!( !data_folder - .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -2194,7 +2055,6 @@ mod tests { // Verify that the normal table was dropped from the Delta Lake. 
assert!( data_folder - .delta_lake .delta_table(NORMAL_TABLE_NAME) .await .is_err() @@ -2207,7 +2067,6 @@ mod tests { assert!( data_folder - .delta_lake .session_context() .table_exist(TIME_SERIES_TABLE_NAME) .unwrap() @@ -2218,7 +2077,6 @@ mod tests { // Verify that the time series table was deregistered from Apache DataFusion. assert!( !data_folder - .delta_lake .session_context() .table_exist(TIME_SERIES_TABLE_NAME) .unwrap() @@ -2227,7 +2085,6 @@ mod tests { // Verify that the time series table was dropped from the metadata Delta Lake. assert!( !data_folder - .delta_lake .is_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -2236,7 +2093,6 @@ mod tests { // Verify that the time series table was dropped from the Delta Lake. assert!( data_folder - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .is_err() @@ -2246,7 +2102,7 @@ mod tests { #[tokio::test] async fn test_drop_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.drop(MISSING_TABLE_NAME).await; @@ -2269,7 +2125,6 @@ mod tests { .unwrap(); let mut delta_table = data_folder - .delta_lake .delta_table(NORMAL_TABLE_NAME) .await .unwrap(); @@ -2295,7 +2150,6 @@ mod tests { .unwrap(); let mut delta_table = data_folder - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -2319,7 +2173,7 @@ mod tests { #[tokio::test] async fn test_truncate_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.truncate(MISSING_TABLE_NAME).await; @@ -2394,7 +2248,7 @@ mod tests { #[tokio::test] async fn test_vacuum_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.vacuum(MISSING_TABLE_NAME, None).await; @@ -2422,7 +2276,6 @@ mod tests { .unwrap(); let mut delta_table = source - .delta_lake .delta_table(NORMAL_TABLE_NAME) .await .unwrap(); @@ -2445,13 +2298,12 @@ mod tests { } async fn assert_normal_table_exists( - data_folder: &DataFolder, + data_folder: &DeltaLake, table_name: &str, expected_schema: Schema, ) { // Verify that the normal table exists in the Delta Lake. let delta_table = data_folder - .delta_lake .delta_table(table_name) .await .unwrap(); @@ -2462,14 +2314,13 @@ mod tests { // Verify that the normal table exists in the metadata Delta Lake. assert!( data_folder - .delta_lake .is_normal_table(table_name) .await .unwrap() ); // Verify that the normal table is registered with Apache DataFusion. 
- assert!(data_folder.delta_lake.session_context().table_exist(table_name).unwrap()) + assert!(data_folder.session_context().table_exist(table_name).unwrap()) } #[tokio::test] @@ -2477,7 +2328,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); target .create( @@ -2553,7 +2404,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let expected_result = normal_table_data(); source @@ -2591,7 +2442,6 @@ mod tests { .unwrap(); let mut delta_table = source - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -2634,7 +2484,6 @@ mod tests { .unwrap(); let mut delta_table = source - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -2667,16 +2516,15 @@ mod tests { } async fn assert_time_series_table_exists( - data_folder: &DataFolder, + data_folder: &DeltaLake, table_name: &str, expected_schema: Schema, ) -> TimeSeriesTableMetadata { // Verify that the time series table exists in the Delta Lake. - assert!(data_folder.delta_lake.delta_table(table_name).await.is_ok()); + assert!(data_folder.delta_table(table_name).await.is_ok()); // Verify that the time series table exists in the metadata Delta Lake with the correct schema. let time_series_table_metadata = data_folder - .delta_lake .time_series_table_metadata_for_time_series_table(table_name) .await .unwrap(); @@ -2685,7 +2533,7 @@ mod tests { assert_eq!(*time_series_table_metadata.query_schema, expected_schema); // Verify that the time series table is registered with Apache DataFusion. 
- assert!(data_folder.delta_lake.session_context().table_exist(table_name).unwrap()); + assert!(data_folder.session_context().table_exist(table_name).unwrap()); time_series_table_metadata } @@ -2710,7 +2558,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); target .create( @@ -2784,7 +2632,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); source .write(TIME_SERIES_TABLE_NAME, time_series_table_data()) @@ -2814,10 +2662,10 @@ mod tests { } async fn assert_table_not_moved( - source: &mut DataFolder, + source: &mut DeltaLake, source_table_name: &str, expected_result: RecordBatch, - maybe_target: Option<&mut DataFolder>, + maybe_target: Option<&mut DeltaLake>, maybe_target_table_name: Option<&str>, ) { let source_sql = format!("SELECT * FROM {source_table_name}"); @@ -2831,7 +2679,7 @@ mod tests { } } - async fn data_folder_read(data_folder: &mut DataFolder, sql: &str) -> Result { + async fn data_folder_read(data_folder: &mut DeltaLake, sql: &str) -> Result { let record_batch_stream = data_folder.read(sql).await?; record_batch_stream_to_record_batch(record_batch_stream).await } @@ -2839,7 +2687,7 @@ mod tests { #[tokio::test] async fn test_move_missing_table_to_time_series_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut source = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, target) = create_data_folder_with_time_series_table().await; @@ -2871,9 +2719,9 @@ mod tests { ); } - async fn create_data_folder_with_normal_table() -> (TempDir, DataFolder) { + async fn create_data_folder_with_normal_table() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -2925,9 +2773,9 @@ mod tests { ]) } - async fn create_data_folder_with_time_series_table() -> (TempDir, DataFolder) { + async fn create_data_folder_with_time_series_table() -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let table_type = TableType::TimeSeriesTable(time_series_table_schema(), HashMap::new(), HashMap::new()); @@ -3000,9 +2848,9 @@ mod tests { } async fn create_data_folder_with_time_series_table_with_generated_column() - -> (TempDir, DataFolder) { + -> (TempDir, DeltaLake) { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); let generated_columns = vec![("generated".to_owned(), "field_1 + field_2".to_owned())]; let table_type = TableType::TimeSeriesTable( diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/delta_lake.rs index 0abfbb3fe..cb24bec23 100644 --- 
a/crates/modelardb_storage/src/delta_lake.rs
+++ b/crates/modelardb_storage/src/delta_lake.rs
@@ -29,6 +29,7 @@ use chrono::TimeDelta;
 use dashmap::DashMap;
 use datafusion::catalog::TableProvider;
 use datafusion::common::{DFSchema, ToDFSchema};
+use datafusion::datasource::sink::DataSink;
 use datafusion::logical_expr::{Expr, lit};
 use datafusion::parquet::file::properties::WriterProperties;
 use datafusion::parquet::format::SortingColumn;
@@ -156,9 +157,10 @@ impl DeltaLake {
     ) -> Result<Self> {
         match storage_configuration {
             protocol::manager_metadata::StorageConfiguration::S3Configuration(s3_configuration) => {
-                // Register the S3 storage handlers to allow the use of Amazon S3 object stores.
-                // This is required at runtime to initialize the S3 storage implementation in the
-                // deltalake_aws storage subcrate.
+                // Register the S3 storage handlers to allow the use of Amazon S3 object stores. This is
+                // required at runtime to initialize the S3 storage implementation in the deltalake_aws
+                // storage subcrate. It is safe to call this function multiple times as the handlers are
+                // stored in a DashMap, thus, the handlers are simply overwritten with the same handlers each time.
                 deltalake::aws::register_handlers(None);

                 Self::open_s3(
@@ -331,6 +333,39 @@ impl DeltaLake {
         Ok(())
     }

+    /// Register all normal tables and time series tables in `self` with its [`SessionContext`].
+    /// `data_sink` is set as the [`DataSink`] for all of the tables. If the tables could not be
+    /// registered, [`ModelarDbStorageError`] is returned.
+    pub async fn register_normal_and_time_series_tables(&self, data_sink: Arc<dyn DataSink>) -> Result<()> {
+        // Register normal tables.
+        for normal_table_name in self.normal_table_names().await? {
+            let delta_table = self
+                .delta_table(&normal_table_name)
+                .await?;
+
+            crate::register_normal_table(
+                &self.session_context,
+                &normal_table_name,
+                delta_table,
+                data_sink.clone(),
+            )?;
+        }
+
+        // Register time series tables.
+        for metadata in self.time_series_table_metadata().await? {
+            let delta_table = self.delta_table(&metadata.name).await?;
+
+            crate::register_time_series_table(
+                &self.session_context,
+                delta_table,
+                metadata,
+                data_sink.clone(),
+            )?;
+        }
+
+        Ok(())
+    }
+
     /// Return connection information saved as bytes to make it possible to transfer the information
     /// using Apache Arrow Flight. Only returns [`Some`] if [`DeltaLake`] was created by
     /// [`try_remote_from_connection_info()`].
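
The new helper above is what callers invoke right after opening a Delta Lake so that
existing tables become queryable, as the updated C API and tests in the following patch
do. A minimal sketch of that call sequence, assuming the caller is inside an async
function returning a Result, that DataFolderDataSink is publicly reachable from
modelardb_embedded, and that the local path and the table name are placeholders:

use std::path::Path;
use std::sync::Arc;

use modelardb_embedded::operations::data_folder::DataFolderDataSink;
use modelardb_storage::delta_lake::DeltaLake;

// Open a local Delta Lake and register its existing normal tables and time
// series tables with its Apache DataFusion session context.
let delta_lake = DeltaLake::open_local(Path::new("/tmp/modelardb")).await?;
let data_sink = Arc::new(DataFolderDataSink::new());
delta_lake
    .register_normal_and_time_series_tables(data_sink)
    .await?;

// The registered tables can now be queried through the session context.
let data_frame = delta_lake
    .session_context()
    .sql("SELECT * FROM normal_table_1")
    .await?;
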
From 3f3c8b049b10061cfef603acf3a46d1e7ef1e047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Mon, 20 Oct 2025 14:23:26 +0000 Subject: [PATCH 17/31] Rename DeltaLake to DataFolder --- crates/modelardb_bulkloader/src/main.rs | 14 +- crates/modelardb_embedded/src/capi.rs | 38 ++-- .../src/operations/data_folder.rs | 92 +++++----- crates/modelardb_manager/src/main.rs | 16 +- crates/modelardb_manager/src/metadata.rs | 66 +++---- crates/modelardb_manager/src/remote.rs | 36 ++-- crates/modelardb_server/src/configuration.rs | 7 +- crates/modelardb_server/src/context.rs | 27 +-- crates/modelardb_server/src/data_folders.rs | 55 +----- crates/modelardb_server/src/manager.rs | 10 +- .../src/storage/compressed_data_manager.rs | 11 +- .../src/storage/data_transfer.rs | 21 +-- crates/modelardb_server/src/storage/mod.rs | 2 +- .../src/storage/uncompressed_data_manager.rs | 13 +- .../src/{delta_lake.rs => data_folder.rs} | 165 ++++++++---------- crates/modelardb_storage/src/lib.rs | 2 +- .../src/optimizer/model_simple_aggregates.rs | 6 +- .../src/query/metadata_table.rs | 2 +- .../src/query/normal_table.rs | 2 +- .../src/query/time_series_table.rs | 2 +- 20 files changed, 240 insertions(+), 347 deletions(-) rename crates/modelardb_storage/src/{delta_lake.rs => data_folder.rs} (92%) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index 6a241fe26..c4da345b1 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -39,7 +39,7 @@ use deltalake::{ObjectStore, Path}; use futures::stream::StreamExt; use modelardb_embedded::error::{ModelarDbEmbeddedError, Result}; use modelardb_embedded::operations::Operations; -use modelardb_storage::delta_lake::{DeltaLake, DeltaTableWriter}; +use modelardb_storage::data_folder::{DataFolder, DeltaTableWriter}; use modelardb_types::types::TimeSeriesTableMetadata; use sysinfo::System; @@ -200,7 +200,7 @@ async fn import( async fn import_time_series_table( mut input_stream: Pin>, time_series_table_metadata: &TimeSeriesTableMetadata, - data_folder: &mut DeltaLake, + data_folder: &mut DataFolder, cast_double_to_float: bool, ) -> Result<()> { let table_name = &time_series_table_metadata.name; @@ -253,7 +253,7 @@ async fn import_time_series_table( async fn import_normal_table( mut input_stream: Pin>, table_name: &str, - data_folder: &mut DeltaLake, + data_folder: &mut DataFolder, ) -> Result<()> { let mut delta_table_writer = data_folder.table_writer(table_name).await?; @@ -483,14 +483,14 @@ async fn export( /// Returns a [`DataFolder`] for `data_folder_path`. If the necessary environment variables are not /// set for S3 and Azure or the [`DataFolder`] cannot access `data_folder_path`, a /// [`ModelarDbEmbeddedError`] is returned. 
-async fn create_data_folder(data_folder_path: &str) -> Result { +async fn create_data_folder(data_folder_path: &str) -> Result { match data_folder_path.split_once("://") { Some(("s3", bucket_name)) => { let endpoint = env::var("AWS_ENDPOINT")?; let access_key_id = env::var("AWS_ACCESS_KEY_ID")?; let secret_access_key = env::var("AWS_SECRET_ACCESS_KEY")?; - DeltaLake::open_s3( + DataFolder::open_s3( endpoint, bucket_name.to_owned(), access_key_id, @@ -502,11 +502,11 @@ async fn create_data_folder(data_folder_path: &str) -> Result { let account_name = env::var("AZURE_STORAGE_ACCOUNT_NAME")?; let access_key = env::var("AZURE_STORAGE_ACCESS_KEY")?; - DeltaLake::open_azure(account_name, access_key, container_name.to_owned()).await.map_err(|error| error.into()) + DataFolder::open_azure(account_name, access_key, container_name.to_owned()).await.map_err(|error| error.into()) } _ => { let data_folder_path = StdPath::new(data_folder_path); - DeltaLake::open_local(data_folder_path).await.map_err(|error| error.into()) + DataFolder::open_local(data_folder_path).await.map_err(|error| error.into()) } } } diff --git a/crates/modelardb_embedded/src/capi.rs b/crates/modelardb_embedded/src/capi.rs index 44abd232a..26ea64271 100644 --- a/crates/modelardb_embedded/src/capi.rs +++ b/crates/modelardb_embedded/src/capi.rs @@ -39,7 +39,7 @@ use std::sync::{Arc, LazyLock}; use arrow::array::{self, Array, Float32Array, Int8Array, MapArray, StringArray, StructArray}; use arrow::ffi::{self, FFI_ArrowArray, FFI_ArrowSchema}; use arrow::record_batch::RecordBatch; -use modelardb_storage::delta_lake::DeltaLake; +use modelardb_storage::data_folder::DataFolder; use modelardb_types::types::ErrorBound; use tokio::runtime::Runtime; @@ -88,11 +88,11 @@ pub unsafe extern "C" fn modelardb_embedded_open_memory() -> *const c_void { } /// See documentation for [`modelardb_embedded_open_memory`]. -fn open_memory() -> Result { - let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_memory())?; +fn open_memory() -> Result { + let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_memory())?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; - Ok(delta_lake) + TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + Ok(data_folder) } /// Creates a [`DataFolder`] that manages data in the local folder at `data_folder_path_path` and @@ -107,14 +107,14 @@ pub unsafe extern "C" fn modelardb_embedded_open_local( } /// See documentation for [`modelardb_embedded_open_local`]. -unsafe fn open_local(data_folder_path_ptr: *const c_char) -> Result { +unsafe fn open_local(data_folder_path_ptr: *const c_char) -> Result { let data_folder_str = unsafe { c_char_ptr_to_str(data_folder_path_ptr)? 
}; let data_folder_path = StdPath::new(data_folder_str); - let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_local(data_folder_path))?; + let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_local(data_folder_path))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; - Ok(delta_lake) + TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + Ok(data_folder) } /// Creates a [`DataFolder`] that manages data in an object store with a S3-compatible API and @@ -146,21 +146,21 @@ unsafe fn open_s3( bucket_name_ptr: *const c_char, access_key_id_ptr: *const c_char, secret_access_key_ptr: *const c_char, -) -> Result { +) -> Result { let endpoint = unsafe { c_char_ptr_to_str(endpoint_ptr)? }; let bucket_name = unsafe { c_char_ptr_to_str(bucket_name_ptr)? }; let access_key_id = unsafe { c_char_ptr_to_str(access_key_id_ptr)? }; let secret_access_key = unsafe { c_char_ptr_to_str(secret_access_key_ptr)? }; - let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_s3( + let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_s3( endpoint.to_owned(), bucket_name.to_owned(), access_key_id.to_owned(), secret_access_key.to_owned(), ))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; - Ok(delta_lake) + TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + Ok(data_folder) } /// Creates a [`DataFolder`] that manages data in an object store with an Azure-compatible API and @@ -182,19 +182,19 @@ unsafe fn open_azure( account_name_ptr: *const c_char, access_key_ptr: *const c_char, container_name_ptr: *const c_char, -) -> Result { +) -> Result { let account_name = unsafe { c_char_ptr_to_str(account_name_ptr)? }; let access_key = unsafe { c_char_ptr_to_str(access_key_ptr)? }; let container_name = unsafe { c_char_ptr_to_str(container_name_ptr)? }; - let delta_lake = TOKIO_RUNTIME.block_on(DeltaLake::open_azure( + let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_azure( account_name.to_owned(), access_key.to_owned(), container_name.to_owned(), ))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(delta_lake.register_normal_and_time_series_tables(data_sink))?; - Ok(delta_lake) + TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + Ok(data_folder) } /// Creates a [`Client`] that is connected to the Apache Arrow Flight server URL in `node_url_ptr` @@ -246,7 +246,7 @@ pub unsafe extern "C" fn modelardb_embedded_close( is_data_folder: bool, ) -> c_int { if is_data_folder { - let maybe_data_folder_ptr: *mut DeltaLake = maybe_operations_ptr.cast(); + let maybe_data_folder_ptr: *mut DataFolder = maybe_operations_ptr.cast(); if !maybe_data_folder_ptr.is_null() && maybe_data_folder_ptr.is_aligned() { // The box is assigned to _data_folder as Box::from_raw() is #[must_use]. 
let _data_folder = unsafe { Box::from_raw(maybe_data_folder_ptr) }; @@ -1035,7 +1035,7 @@ unsafe fn c_void_to_operations<'a>( is_data_folder: bool, ) -> Result<&'a mut dyn Operations> { if is_data_folder { - let maybe_data_folder_ptr: *mut DeltaLake = maybe_operations_ptr.cast(); + let maybe_data_folder_ptr: *mut DataFolder = maybe_operations_ptr.cast(); if !maybe_data_folder_ptr.is_null() && maybe_data_folder_ptr.is_aligned() { unsafe { Ok(&mut *maybe_data_folder_ptr) } } else { diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index a73b5a2e5..5d216f229 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -32,7 +32,7 @@ use datafusion::physical_plan::metrics::MetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; use datafusion::prelude::SessionContext; use futures::TryStreamExt; -use modelardb_storage::delta_lake::DeltaLake; +use modelardb_storage::data_folder::DataFolder; use crate::error::{ModelarDbEmbeddedError, Result}; use crate::operations::{ @@ -100,7 +100,7 @@ impl DisplayAs for DataFolderDataSink { } #[async_trait] -impl Operations for DeltaLake { +impl Operations for DataFolder { /// Return `self` as [`Any`] so it can be downcast. fn as_any(&self) -> &dyn Any { self @@ -250,7 +250,7 @@ impl Operations for DeltaLake { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; @@ -334,7 +334,7 @@ impl Operations for DeltaLake { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; @@ -417,7 +417,7 @@ impl Operations for DeltaLake { ) -> Result<()> { let target_data_folder = target .as_any() - .downcast_ref::() + .downcast_ref::() .ok_or_else(|| { ModelarDbEmbeddedError::InvalidArgument("target is not a data folder.".to_owned()) })?; @@ -599,7 +599,7 @@ mod tests { #[tokio::test] async fn test_register_existing_normal_tables_on_open() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -618,7 +618,7 @@ mod tests { .unwrap(); // Create a new data folder and verify that the existing normal tables are registered. 
- let new_data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); assert!( @@ -638,7 +638,7 @@ mod tests { #[tokio::test] async fn test_create_normal_table_with_empty_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create(NORMAL_TABLE_NAME, TableType::NormalTable(Schema::empty())) @@ -654,7 +654,7 @@ mod tests { #[tokio::test] async fn test_create_existing_normal_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -692,7 +692,7 @@ mod tests { #[tokio::test] async fn test_create_time_series_table_with_error_bounds() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let error_bounds = HashMap::from([ ( @@ -763,7 +763,7 @@ mod tests { #[tokio::test] async fn test_register_existing_time_series_tables_on_open() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -790,7 +790,7 @@ mod tests { .unwrap(); // Create a new data folder and verify that the existing time series tables are registered. 
- let new_data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); assert!( @@ -810,7 +810,7 @@ mod tests { #[tokio::test] async fn test_create_time_series_table_with_empty_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -829,7 +829,7 @@ mod tests { #[tokio::test] async fn test_create_existing_time_series_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .create( @@ -864,7 +864,7 @@ mod tests { #[tokio::test] async fn test_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let table_names = data_folder.tables().await.unwrap(); assert!(table_names.is_empty()); @@ -912,7 +912,7 @@ mod tests { #[tokio::test] async fn test_missing_table_schema() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.schema(MISSING_TABLE_NAME).await; @@ -1023,7 +1023,7 @@ mod tests { #[tokio::test] async fn test_write_to_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder .write(MISSING_TABLE_NAME, time_series_table_data()) @@ -1059,7 +1059,7 @@ mod tests { #[tokio::test] async fn test_read_time_series_table_from_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder_read_time_series_table( &mut data_folder, @@ -1413,7 +1413,7 @@ mod tests { } async fn data_folder_read_time_series_table( - data_folder: &mut DeltaLake, + data_folder: &mut DataFolder, table_name: &str, columns: &[(String, Aggregate)], group_by: &[String], @@ -1492,7 +1492,7 @@ mod tests { #[tokio::test] async fn test_copy_time_series_table_from_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let source = DataFolder::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, target) = create_data_folder_with_time_series_table().await; @@ -1518,7 +1518,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); source .write(TIME_SERIES_TABLE_NAME, time_series_table_data()) @@ -1553,7 +1553,7 @@ mod tests { .unwrap(); let temp_dir = tempfile::tempdir().unwrap(); - let mut target = 
DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); let table_type = TableType::TimeSeriesTable(invalid_table_schema(), HashMap::new(), HashMap::new()); @@ -1807,7 +1807,7 @@ mod tests { #[tokio::test] async fn test_read_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let sql = format!("SELECT * FROM {MISSING_TABLE_NAME}"); let result = data_folder_read(&mut data_folder, &sql).await; @@ -1853,7 +1853,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); let schema = normal_table_schema().project(&[0, 1]).unwrap(); target @@ -1914,7 +1914,7 @@ mod tests { // Create a normal table that has the same schema as the time series table in source. let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); let schema = time_series_table_schema(); target @@ -1970,7 +1970,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); source .write(NORMAL_TABLE_NAME, normal_table_data()) @@ -1991,7 +1991,7 @@ mod tests { #[tokio::test] async fn test_copy_normal_table_from_missing_table_to_normal_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut source = DataFolder::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, mut target) = create_data_folder_with_normal_table().await; @@ -2102,7 +2102,7 @@ mod tests { #[tokio::test] async fn test_drop_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.drop(MISSING_TABLE_NAME).await; @@ -2173,7 +2173,7 @@ mod tests { #[tokio::test] async fn test_truncate_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.truncate(MISSING_TABLE_NAME).await; @@ -2248,7 +2248,7 @@ mod tests { #[tokio::test] async fn test_vacuum_missing_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let result = data_folder.vacuum(MISSING_TABLE_NAME, None).await; @@ -2298,7 +2298,7 @@ mod tests { } async fn assert_normal_table_exists( - data_folder: &DeltaLake, + data_folder: &DataFolder, table_name: &str, expected_schema: Schema, ) { @@ -2328,7 +2328,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = 
DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); target .create( @@ -2404,7 +2404,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_normal_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); let expected_result = normal_table_data(); source @@ -2516,7 +2516,7 @@ mod tests { } async fn assert_time_series_table_exists( - data_folder: &DeltaLake, + data_folder: &DataFolder, table_name: &str, expected_schema: Schema, ) -> TimeSeriesTableMetadata { @@ -2558,7 +2558,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let mut target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut target = DataFolder::open_local(temp_dir.path()).await.unwrap(); target .create( @@ -2632,7 +2632,7 @@ mod tests { let (_temp_dir, mut source) = create_data_folder_with_time_series_table().await; let temp_dir = tempfile::tempdir().unwrap(); - let target = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let target = DataFolder::open_local(temp_dir.path()).await.unwrap(); source .write(TIME_SERIES_TABLE_NAME, time_series_table_data()) @@ -2662,10 +2662,10 @@ mod tests { } async fn assert_table_not_moved( - source: &mut DeltaLake, + source: &mut DataFolder, source_table_name: &str, expected_result: RecordBatch, - maybe_target: Option<&mut DeltaLake>, + maybe_target: Option<&mut DataFolder>, maybe_target_table_name: Option<&str>, ) { let source_sql = format!("SELECT * FROM {source_table_name}"); @@ -2679,7 +2679,7 @@ mod tests { } } - async fn data_folder_read(data_folder: &mut DeltaLake, sql: &str) -> Result { + async fn data_folder_read(data_folder: &mut DataFolder, sql: &str) -> Result { let record_batch_stream = data_folder.read(sql).await?; record_batch_stream_to_record_batch(record_batch_stream).await } @@ -2687,7 +2687,7 @@ mod tests { #[tokio::test] async fn test_move_missing_table_to_time_series_table() { let temp_dir = tempfile::tempdir().unwrap(); - let mut source = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut source = DataFolder::open_local(temp_dir.path()).await.unwrap(); let (_temp_dir, target) = create_data_folder_with_time_series_table().await; @@ -2719,9 +2719,9 @@ mod tests { ); } - async fn create_data_folder_with_normal_table() -> (TempDir, DeltaLake) { + async fn create_data_folder_with_normal_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); data_folder .create( @@ -2773,9 +2773,9 @@ mod tests { ]) } - async fn create_data_folder_with_time_series_table() -> (TempDir, DeltaLake) { + async fn create_data_folder_with_time_series_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let table_type = TableType::TimeSeriesTable(time_series_table_schema(), HashMap::new(), HashMap::new()); @@ -2848,9 +2848,9 @@ mod tests { } async fn create_data_folder_with_time_series_table_with_generated_column() - -> (TempDir, DeltaLake) { + -> (TempDir, DataFolder) { let 
temp_dir = tempfile::tempdir().unwrap(); - let mut data_folder = DeltaLake::open_local(temp_dir.path()).await.unwrap(); + let mut data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let generated_columns = vec![("generated".to_owned(), "field_1 + field_2".to_owned())]; let table_type = TableType::TimeSeriesTable( diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index cba733abe..1c4d1113b 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -23,7 +23,7 @@ mod remote; use std::sync::{Arc, LazyLock}; use std::{env, process}; -use modelardb_storage::delta_lake::DeltaLake; +use modelardb_storage::data_folder::DataFolder; use modelardb_types::flight::protocol; use tokio::sync::RwLock; use tonic::metadata::errors::InvalidMetadataValue; @@ -41,8 +41,8 @@ pub static PORT: LazyLock = /// Provides access to the managers components. pub struct Context { - /// Delta Lake for storing metadata and data in Apache Parquet files. - pub remote_delta_lake: DeltaLake, + /// [`DataFolder`] for storing metadata and data in Apache Parquet files. + pub remote_data_folder: DataFolder, /// Storage configuration encoded as a [`StorageConfiguration`](protocol::manager_metadata::StorageConfiguration) /// protobuf message to make it possible to transfer the configuration using Apache Arrow Flight. pub remote_storage_configuration: protocol::manager_metadata::StorageConfiguration, @@ -70,18 +70,18 @@ async fn main() -> Result<()> { }; let remote_storage_configuration = modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; - let remote_delta_lake = DeltaLake::open_object_store(remote_storage_configuration.clone()).await?; + let remote_data_folder = DataFolder::open_object_store(remote_storage_configuration.clone()).await?; - remote_delta_lake.create_and_register_manager_metadata_delta_lake_tables().await?; + remote_data_folder.create_and_register_manager_metadata_data_folder_tables().await?; let mut cluster = Cluster::new(); - let nodes = remote_delta_lake.nodes().await?; + let nodes = remote_data_folder.nodes().await?; for node in nodes { cluster.register_node(node)?; } // Retrieve and parse the key to a tonic metadata value since it is used in tonic requests. - let key = remote_delta_lake + let key = remote_data_folder .manager_key() .await? .to_string() @@ -92,7 +92,7 @@ async fn main() -> Result<()> { // Create the Context. let context = Arc::new(Context { - remote_delta_lake, + remote_data_folder, remote_storage_configuration, cluster: RwLock::new(cluster), key, diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index a63f475f6..31b5041f1 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -23,7 +23,7 @@ use arrow::array::{Array, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use deltalake::DeltaTableError; use deltalake::datafusion::logical_expr::{col, lit}; -use modelardb_storage::delta_lake::DeltaLake; +use modelardb_storage::data_folder::DataFolder; use modelardb_storage::{register_metadata_table, sql_and_concat}; use modelardb_types::types::{Node, ServerMode}; use uuid::Uuid; @@ -33,14 +33,14 @@ use crate::error::Result; /// Stores the metadata required for reading from and writing to the normal tables and time series tables /// and persisting edges. The data that needs to be persisted is stored in the metadata Delta Lake. 
pub trait ManagerMetadata { - async fn create_and_register_manager_metadata_delta_lake_tables(&self) -> Result<()>; + async fn create_and_register_manager_metadata_data_folder_tables(&self) -> Result<()>; async fn manager_key(&self) -> Result; async fn save_node(&self, node: Node) -> Result<()>; async fn remove_node(&self, url: &str) -> Result<()>; async fn nodes(&self) -> Result>; } -impl ManagerMetadata for DeltaLake { +impl ManagerMetadata for DataFolder { /// If they do not already exist, create the tables that are specific to the manager metadata /// Delta Lake and register them with the Apache DataFusion session context. /// * The `manager_metadata` table contains metadata for the manager itself. It is assumed that @@ -49,7 +49,7 @@ impl ManagerMetadata for DeltaLake { /// /// If the tables exist or were created, return [`Ok`], otherwise return /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). - async fn create_and_register_manager_metadata_delta_lake_tables(&self) -> Result<()> { + async fn create_and_register_manager_metadata_data_folder_tables(&self) -> Result<()> { // Create and register the manager_metadata table if it does not exist. let delta_table = self .create_metadata_table( @@ -167,12 +167,12 @@ mod tests { // Tests for MetadataManager. #[tokio::test] - async fn test_create_manager_metadata_delta_lake_tables() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + async fn test_create_manager_metadata_data_folder_tables() { + let (_temp_dir, data_folder) = create_data_folder().await; // Verify that the tables were created, registered, and has the expected columns. assert!( - metadata_manager + data_folder .session_context() .sql("SELECT key FROM manager_metadata") .await @@ -180,7 +180,7 @@ mod tests { ); assert!( - metadata_manager + data_folder .session_context() .sql("SELECT url, mode FROM nodes") .await @@ -190,13 +190,13 @@ mod tests { #[tokio::test] async fn test_new_manager_key() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + let (_temp_dir, data_folder) = create_data_folder().await; // Verify that the manager key is created and saved correctly. - let manager_key = metadata_manager.manager_key().await.unwrap(); + let manager_key = data_folder.manager_key().await.unwrap(); let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(metadata_manager.session_context(), sql) + let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -208,14 +208,14 @@ mod tests { #[tokio::test] async fn test_existing_manager_key() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + let (_temp_dir, data_folder) = create_data_folder().await; // Verify that only a single key is created and saved when retrieving multiple times. 
- let manager_key_1 = metadata_manager.manager_key().await.unwrap(); - let manager_key_2 = metadata_manager.manager_key().await.unwrap(); + let manager_key_1 = data_folder.manager_key().await.unwrap(); + let manager_key_2 = data_folder.manager_key().await.unwrap(); let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(metadata_manager.session_context(), sql) + let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -225,17 +225,17 @@ mod tests { #[tokio::test] async fn test_save_node() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + let (_temp_dir, data_folder) = create_data_folder().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_1.clone()).await.unwrap(); + data_folder.save_node(node_1.clone()).await.unwrap(); let node_2 = Node::new("url_2".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_2.clone()).await.unwrap(); + data_folder.save_node(node_2.clone()).await.unwrap(); // Verify that the nodes are saved correctly. let sql = "SELECT url, mode FROM nodes"; - let batch = sql_and_concat(metadata_manager.session_context(), sql) + let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -251,19 +251,19 @@ mod tests { #[tokio::test] async fn test_remove_node() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + let (_temp_dir, data_folder) = create_data_folder().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_1.clone()).await.unwrap(); + data_folder.save_node(node_1.clone()).await.unwrap(); let node_2 = Node::new("url_2".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_2.clone()).await.unwrap(); + data_folder.save_node(node_2.clone()).await.unwrap(); - metadata_manager.remove_node(&node_1.url).await.unwrap(); + data_folder.remove_node(&node_1.url).await.unwrap(); // Verify that node_1 is removed correctly. 
let sql = "SELECT url, mode FROM nodes"; - let batch = sql_and_concat(metadata_manager.session_context(), sql) + let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -279,31 +279,31 @@ mod tests { #[tokio::test] async fn test_nodes() { - let (_temp_dir, metadata_manager) = create_delta_lake().await; + let (_temp_dir, data_folder) = create_data_folder().await; let node_1 = Node::new("url_1".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_1.clone()).await.unwrap(); + data_folder.save_node(node_1.clone()).await.unwrap(); let node_2 = Node::new("url_2".to_string(), ServerMode::Edge); - metadata_manager.save_node(node_2.clone()).await.unwrap(); + data_folder.save_node(node_2.clone()).await.unwrap(); - let nodes = metadata_manager.nodes().await.unwrap(); + let nodes = data_folder.nodes().await.unwrap(); assert_eq!(nodes, vec![node_2, node_1]); } - async fn create_delta_lake() -> (TempDir, DeltaLake) { + async fn create_data_folder() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::open_local(temp_dir.path()) + let data_folder = DataFolder::open_local(temp_dir.path()) .await .unwrap(); - delta_lake - .create_and_register_manager_metadata_delta_lake_tables() + data_folder + .create_and_register_manager_metadata_data_folder_tables() .await .unwrap(); - (temp_dir, delta_lake) + (temp_dir, data_folder) } } diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 5bdb18534..02cc037f8 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -96,16 +96,16 @@ impl FlightServiceHandler { /// Return the schema of the table with the name `table_name`. If the table does not exist or /// the schema cannot be retrieved, return [`Status`]. async fn table_schema(&self, table_name: &str) -> StdResult, Status> { - let delta_lake = &self.context.remote_delta_lake; + let data_folder = &self.context.remote_data_folder; - if delta_lake + if data_folder .is_normal_table(table_name) .await .map_err(error_to_status_internal)? { let delta_table = self .context - .remote_delta_lake + .remote_data_folder .delta_table(table_name) .await .map_err(error_to_status_internal)?; @@ -117,12 +117,12 @@ impl FlightServiceHandler { .map_err(error_to_status_internal)?; Ok(Arc::new(schema)) - } else if delta_lake + } else if data_folder .is_time_series_table(table_name) .await .map_err(error_to_status_internal)? { - let time_series_table_metadata = delta_lake + let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table(table_name) .await .map_err(error_to_status_internal)?; @@ -140,7 +140,7 @@ impl FlightServiceHandler { async fn check_if_table_exists(&self, table_name: &str) -> StdResult<(), Status> { let existing_tables = self .context - .remote_delta_lake + .remote_data_folder .table_names() .await .map_err(error_to_status_internal)?; @@ -166,14 +166,14 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Create an empty Delta Lake table. self.context - .remote_delta_lake + .remote_data_folder .create_normal_table(table_name, schema) .await .map_err(error_to_status_internal)?; // Persist the new normal table to the metadata Delta Lake. self.context - .remote_delta_lake + .remote_data_folder .save_normal_table_metadata(table_name) .await .map_err(error_to_status_internal)?; @@ -210,14 +210,14 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Create an empty Delta Lake table. 
self.context - .remote_delta_lake + .remote_data_folder .create_time_series_table(&time_series_table_metadata) .await .map_err(error_to_status_internal)?; // Persist the new time series table to the metadata Delta Lake. self.context - .remote_delta_lake + .remote_data_folder .save_time_series_table_metadata(&time_series_table_metadata) .await .map_err(error_to_status_internal)?; @@ -257,14 +257,14 @@ impl FlightServiceHandler { // Drop the table from the remote data folder metadata Delta Lake. This will return an error // if the table does not exist. self.context - .remote_delta_lake + .remote_data_folder .drop_table_metadata(table_name) .await .map_err(error_to_status_internal)?; // Drop the table from the remote data folder data Delta lake. self.context - .remote_delta_lake + .remote_data_folder .drop_table(table_name) .await .map_err(error_to_status_internal)?; @@ -293,7 +293,7 @@ impl FlightServiceHandler { // Truncate the table in the remote data folder data Delta lake. self.context - .remote_delta_lake + .remote_data_folder .truncate_table(table_name) .await .map_err(error_to_status_internal)?; @@ -320,7 +320,7 @@ impl FlightServiceHandler { ) -> StdResult<(), Status> { // Vacuum the table in the remote data folder Delta lake. self.context - .remote_delta_lake + .remote_data_folder .vacuum_table(table_name, maybe_retention_period_in_seconds) .await .map_err(error_to_status_internal)?; @@ -378,7 +378,7 @@ impl FlightService for FlightServiceHandler { // Retrieve the table names from the metadata Delta Lake. let table_names = self .context - .remote_delta_lake + .remote_data_folder .table_names() .await .map_err(error_to_status_internal)?; @@ -509,7 +509,7 @@ impl FlightService for FlightServiceHandler { if table_names.is_empty() { table_names = self .context - .remote_delta_lake + .remote_data_folder .table_names() .await .map_err(error_to_status_internal)?; @@ -616,7 +616,7 @@ impl FlightService for FlightServiceHandler { // this fails, the metadata Delta Lake and the cluster will be out of sync until the // manager is restarted. self.context - .remote_delta_lake + .remote_data_folder .save_node(node) .await .map_err(error_to_status_internal)?; @@ -650,7 +650,7 @@ impl FlightService for FlightServiceHandler { // Remove the node with the given url from the metadata Delta Lake. 
self.context - .remote_delta_lake + .remote_data_folder .remove_node(&node_metadata.url) .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_server/src/configuration.rs b/crates/modelardb_server/src/configuration.rs index 9984d25c5..90ff04a92 100644 --- a/crates/modelardb_server/src/configuration.rs +++ b/crates/modelardb_server/src/configuration.rs @@ -241,11 +241,12 @@ mod tests { use std::sync::Arc; + use modelardb_storage::data_folder::DataFolder; use tempfile::TempDir; use tokio::sync::RwLock; use uuid::Uuid; - use crate::data_folders::{DataFolder, DataFolders}; + use crate::data_folders::DataFolders; use crate::manager::Manager; use crate::storage::StorageEngine; @@ -407,11 +408,11 @@ mod tests { Arc>, ) { let local_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(local_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(local_url).await.unwrap(); let target_dir = tempfile::tempdir().unwrap(); let target_url = target_dir.path().to_str().unwrap(); - let remote_data_folder = DataFolder::try_from_local_url(target_url).await.unwrap(); + let remote_data_folder = DataFolder::open_local_url(target_url).await.unwrap(); let data_folders = DataFolders::new( local_data_folder.clone(), diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index d164b2bb0..62a1389ad 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -84,7 +84,6 @@ impl Context { // Create an empty Delta Lake table. self.data_folders .local_data_folder - .delta_lake .create_normal_table(table_name, schema) .await?; @@ -94,7 +93,6 @@ impl Context { // Persist the new normal table to the Delta Lake. self.data_folders .local_data_folder - .delta_lake .save_normal_table_metadata(table_name) .await?; @@ -127,7 +125,6 @@ impl Context { // Create an empty Delta Lake table. self.data_folders .local_data_folder - .delta_lake .create_time_series_table(time_series_table_metadata) .await?; @@ -138,7 +135,6 @@ impl Context { // Persist the new time series table to the metadata Delta Lake. self.data_folders .local_data_folder - .delta_lake .save_time_series_table_metadata(time_series_table_metadata) .await?; @@ -159,7 +155,6 @@ impl Context { let table_names = self .data_folders .local_data_folder - .delta_lake .normal_table_names() .await?; @@ -177,7 +172,6 @@ impl Context { let delta_table = self .data_folders .query_data_folder - .delta_lake .delta_table(table_name) .await?; @@ -208,7 +202,6 @@ impl Context { let time_series_table_metadata = self .data_folders .local_data_folder - .delta_lake .time_series_table_metadata() .await?; @@ -229,7 +222,6 @@ impl Context { let delta_table = self .data_folders .query_data_folder - .delta_lake .delta_table(&time_series_table_metadata.name) .await?; @@ -273,14 +265,12 @@ impl Context { // Drop the table metadata from the metadata Delta Lake. self.data_folders .local_data_folder - .delta_lake .drop_table_metadata(table_name) .await?; // Drop the table from the Delta Lake. self.data_folders .local_data_folder - .delta_lake .drop_table(table_name) .await?; @@ -302,7 +292,6 @@ impl Context { // Delete the table data from the data Delta Lake. 
self.data_folders .local_data_folder - .delta_lake .truncate_table(table_name) .await?; @@ -339,7 +328,6 @@ impl Context { self.data_folders .local_data_folder - .delta_lake .vacuum_table(table_name, maybe_retention_period_in_seconds) .await?; @@ -424,12 +412,11 @@ fn table_does_not_exist_error(table_name: &str) -> ModelarDbServerError { mod tests { use super::*; + use modelardb_storage::data_folder::DataFolder; use modelardb_test::table::{self, NORMAL_TABLE_NAME, TIME_SERIES_TABLE_NAME}; use modelardb_types::types::MAX_RETENTION_PERIOD_IN_SECONDS; use tempfile::TempDir; - use crate::data_folders::DataFolder; - // Tests for Context. #[tokio::test] async fn test_create_normal_table() { @@ -455,7 +442,6 @@ mod tests { context .data_folders .local_data_folder - .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -503,7 +489,6 @@ mod tests { let time_series_table_metadata = context .data_folders .local_data_folder - .delta_lake .time_series_table_metadata() .await .unwrap(); @@ -608,7 +593,6 @@ mod tests { !context .data_folders .local_data_folder - .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -645,7 +629,6 @@ mod tests { !context .data_folders .local_data_folder - .delta_lake .is_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -675,7 +658,6 @@ mod tests { let local_data_folder = &context.data_folders.local_data_folder; let mut delta_table = local_data_folder - .delta_lake .delta_table(NORMAL_TABLE_NAME) .await .unwrap(); @@ -687,7 +669,6 @@ mod tests { // The normal table should not be deleted from the metadata Delta Lake. assert!( local_data_folder - .delta_lake .is_normal_table(NORMAL_TABLE_NAME) .await .unwrap() @@ -705,7 +686,6 @@ mod tests { let local_data_folder = &context.data_folders.local_data_folder; let mut delta_table = local_data_folder - .delta_lake .delta_table(TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -720,7 +700,6 @@ mod tests { // The time series table should not be deleted from the metadata Delta Lake. assert!( local_data_folder - .delta_lake .is_time_series_table(TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -782,7 +761,6 @@ mod tests { // Write data to the normal table. let local_data_folder = &context.data_folders.local_data_folder; local_data_folder - .delta_lake .write_record_batches_to_normal_table( NORMAL_TABLE_NAME, vec![table::normal_table_record_batch()], @@ -854,7 +832,6 @@ mod tests { // Write data to the time series table. let local_data_folder = &context.data_folders.local_data_folder; local_data_folder - .delta_lake .write_compressed_segments_to_time_series_table( TIME_SERIES_TABLE_NAME, vec![table::compressed_segments_record_batch()], @@ -1002,7 +979,7 @@ mod tests { /// Create a simple [`Context`] that uses `temp_dir` as the local data folder and query data folder. async fn create_context(temp_dir: &TempDir) -> Arc { let temp_dir_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(temp_dir_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); Arc::new( Context::try_new( diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index 529dd3511..453a45396 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -15,57 +15,14 @@ //! Implementation of a struct that provides access to the local and remote data storage components. 
-use std::sync::Arc; - -use modelardb_storage::delta_lake::DeltaLake; -use modelardb_types::flight::protocol; +use modelardb_storage::data_folder::DataFolder; use modelardb_types::types::ServerMode; -use tracing::warn; use crate::ClusterMode; use crate::Result; use crate::error::ModelarDbServerError; use crate::manager::Manager; -/// Folder for storing metadata and data in Apache Parquet files. -#[derive(Clone)] -pub struct DataFolder { - /// Delta Lake for storing metadata and data in Apache Parquet files. - pub delta_lake: Arc, -} - -impl DataFolder { - /// Return a [`DataFolder`] with a local [`DeltaLake`] created from `local_url`. If `local_url` - /// is a folder that does not exist, it is created. If `local_url` could not be parsed, if the - /// folder does not exist and could not be created, or if the metadata tables could not be - /// created, [`ModelarDbServerError`] is returned. - pub async fn try_from_local_url(local_url: &str) -> Result { - let delta_lake = Arc::new(DeltaLake::open_local_url(local_url).await?); - - if local_url.starts_with("memory://") { - warn!( - "The local data folder is in memory. Data will not be persisted. Spilling data will \ - not decrease memory usage. Configured memory limitations may be exceeded." - ); - }; - - Ok(Self { delta_lake }) - } - - /// Return a [`DataFolder`] created from `storage_configuration`. If a connection could - /// not be made or if the metadata tables could not be created, [`ModelarDbServerError`] is - /// returned. - pub async fn try_from_storage_configuration( - storage_configuration: protocol::manager_metadata::StorageConfiguration, - ) -> Result { - let delta_lake = Arc::new( - DeltaLake::open_object_store(storage_configuration.clone()).await?, - ); - - Ok(Self { delta_lake }) - } -} - /// Folders for storing metadata and data in Apache Parquet files locally and remotely. 
#[derive(Clone)] pub struct DataFolders { @@ -103,7 +60,7 @@ impl DataFolders { match arguments { &["edge", local_data_folder_url] | &[local_data_folder_url] => { let local_data_folder = - DataFolder::try_from_local_url(local_data_folder_url).await?; + DataFolder::open_local_url(local_data_folder_url).await?; Ok(( ClusterMode::SingleNode, @@ -115,10 +72,10 @@ impl DataFolders { Manager::register_node(manager_url, ServerMode::Cloud).await?; let local_data_folder = - DataFolder::try_from_local_url(local_data_folder_url).await?; + DataFolder::open_local_url(local_data_folder_url).await?; let remote_data_folder = - DataFolder::try_from_storage_configuration(storage_configuration).await?; + DataFolder::open_object_store(storage_configuration).await?; Ok(( ClusterMode::MultiNode(manager), @@ -135,10 +92,10 @@ impl DataFolders { Manager::register_node(manager_url, ServerMode::Edge).await?; let local_data_folder = - DataFolder::try_from_local_url(local_data_folder_url).await?; + DataFolder::open_local_url(local_data_folder_url).await?; let remote_data_folder = - DataFolder::try_from_storage_configuration(storage_configuration).await?; + DataFolder::open_object_store(storage_configuration).await?; Ok(( ClusterMode::MultiNode(manager), diff --git a/crates/modelardb_server/src/manager.rs b/crates/modelardb_server/src/manager.rs index 520fc9865..6a3b9c8e0 100644 --- a/crates/modelardb_server/src/manager.rs +++ b/crates/modelardb_server/src/manager.rs @@ -23,6 +23,7 @@ use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::{Action, Result as FlightResult}; use datafusion::arrow::datatypes::Schema; use datafusion::catalog::TableProvider; +use modelardb_storage::data_folder::DataFolder; use modelardb_types::flight::protocol; use modelardb_types::types::{Node, ServerMode, TimeSeriesTableMetadata}; use prost::Message; @@ -33,7 +34,6 @@ use tonic::transport::Channel; use crate::PORT; use crate::context::Context; -use crate::data_folders::DataFolder; use crate::error::{ModelarDbServerError, Result}; /// Manages metadata related to the manager and provides functionality for interacting with the manager. @@ -182,11 +182,9 @@ async fn validate_local_tables_exist_remotely( remote_data_folder: &DataFolder, ) -> Result<()> { let local_table_names = local_data_folder - .delta_lake .table_names() .await?; let remote_table_names = remote_data_folder - .delta_lake .table_names() .await?; @@ -217,7 +215,6 @@ async fn validate_normal_tables( let mut missing_normal_tables = vec![]; let remote_normal_tables = remote_data_folder - .delta_lake .normal_table_names() .await?; @@ -242,7 +239,7 @@ async fn validate_normal_tables( /// Retrieve the schema of a normal table from the Delta Lake in the data folder. If the table does /// not exist, or the schema could not be retrieved, return [`ModelarDbServerError`]. 
async fn normal_table_schema(data_folder: &DataFolder, table_name: &str) -> Result> { - let delta_table = data_folder.delta_lake.delta_table(table_name).await?; + let delta_table = data_folder.delta_table(table_name).await?; Ok(TableProvider::schema(&delta_table)) } @@ -257,18 +254,15 @@ async fn validate_time_series_tables( let mut missing_time_series_tables = vec![]; let remote_time_series_tables = remote_data_folder - .delta_lake .time_series_table_names() .await?; for table_name in remote_time_series_tables { let remote_metadata = remote_data_folder - .delta_lake .time_series_table_metadata_for_time_series_table(&table_name) .await?; if let Ok(local_metadata) = local_data_folder - .delta_lake .time_series_table_metadata_for_time_series_table(&table_name) .await { diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index f2c86dd9f..c272aea74 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -21,11 +21,11 @@ use std::sync::Arc; use crossbeam_queue::SegQueue; use dashmap::DashMap; use datafusion::arrow::record_batch::RecordBatch; +use modelardb_storage::data_folder::DataFolder; use tokio::runtime::Handle; use tokio::sync::RwLock; use tracing::{debug, error, info}; -use crate::data_folders::DataFolder; use crate::error::Result; use crate::storage::compressed_data_buffer::{CompressedDataBuffer, CompressedSegmentBatch}; use crate::storage::data_transfer::DataTransfer; @@ -87,7 +87,6 @@ impl CompressedDataManager { let record_batch_size_in_bytes = record_batch.get_array_memory_size(); self.local_data_folder - .delta_lake .write_record_batches_to_normal_table(table_name, vec![record_batch]) .await?; @@ -247,7 +246,6 @@ impl CompressedDataManager { let compressed_data_buffer_size_in_bytes = compressed_data_buffer.size_in_bytes; let compressed_segments = compressed_data_buffer.record_batches(); self.local_data_folder - .delta_lake .write_compressed_segments_to_time_series_table(table_name, compressed_segments) .await?; @@ -317,7 +315,6 @@ mod tests { let local_data_folder = data_manager.local_data_folder.clone(); let mut delta_table = local_data_folder - .delta_lake .create_normal_table(NORMAL_TABLE_NAME, &record_batch.schema()) .await .unwrap(); @@ -391,7 +388,6 @@ mod tests { let local_data_folder = data_manager.local_data_folder.clone(); let mut delta_table = local_data_folder - .delta_lake .create_time_series_table(&table::time_series_table_metadata()) .await .unwrap(); @@ -451,7 +447,6 @@ mod tests { let segments = compressed_segments_record_batch(); local_data_folder - .delta_lake .create_time_series_table(&segments.time_series_table_metadata) .await .unwrap(); @@ -507,7 +502,6 @@ mod tests { // Insert data that should be saved when the remaining memory is decreased. let segments = compressed_segments_record_batch(); local_data_folder - .delta_lake .create_time_series_table(&segments.time_series_table_metadata) .await .unwrap(); @@ -561,11 +555,10 @@ mod tests { // Create a local data folder and save a single time series table to the metadata Delta Lake. 
let temp_dir_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(temp_dir_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 6b6fa6684..4e21e96d1 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -23,11 +23,11 @@ use std::time::Duration; use dashmap::DashMap; use deltalake::arrow::array::RecordBatch; use futures::TryStreamExt; +use modelardb_storage::data_folder::DataFolder; use tokio::sync::RwLock; use tokio::task::JoinHandle as TaskJoinHandle; use tracing::debug; -use crate::data_folders::DataFolder; use crate::error::Result; // TODO: Handle the case where a connection can not be established when transferring data. @@ -62,13 +62,12 @@ impl DataTransfer { remote_data_folder: DataFolder, transfer_batch_size_in_bytes: Option, ) -> Result { - let table_names = local_data_folder.delta_lake.table_names().await?; + let table_names = local_data_folder.table_names().await?; // The size of tables is computed manually as datafusion_table_statistics() is not exact. let table_size_in_bytes = DashMap::with_capacity(table_names.len()); for table_name in table_names { let delta_table = local_data_folder - .delta_lake .delta_table(&table_name) .await?; @@ -240,7 +239,6 @@ impl DataTransfer { let local_delta_ops = self .local_data_folder - .delta_lake .delta_ops(table_name) .await?; @@ -253,24 +251,20 @@ impl DataTransfer { // Write the data to the remote Delta Lake. if self .local_data_folder - .delta_lake .is_time_series_table(table_name) .await? { self.remote_data_folder - .delta_lake .write_compressed_segments_to_time_series_table(table_name, record_batches) .await?; } else { self.remote_data_folder - .delta_lake .write_record_batches_to_normal_table(table_name, record_batches) .await?; } // Delete the data that has been transferred to the remote Delta Lake. self.local_data_folder - .delta_lake .truncate_table(table_name) .await?; @@ -478,17 +472,15 @@ mod tests { async fn create_local_data_folder_with_tables() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); let temp_dir_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(temp_dir_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); // Create a normal table. local_data_folder - .delta_lake .create_normal_table(NORMAL_TABLE_NAME, &table::normal_table_schema()) .await .unwrap(); local_data_folder - .delta_lake .save_normal_table_metadata(NORMAL_TABLE_NAME) .await .unwrap(); @@ -496,13 +488,11 @@ mod tests { // Create a time series table. let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .delta_lake .create_time_series_table(&time_series_table_metadata) .await .unwrap(); local_data_folder - .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); @@ -519,7 +509,6 @@ mod tests { for _ in 0..batch_write_count { // Write to the normal table. 
local_data_folder - .delta_lake .write_record_batches_to_normal_table( NORMAL_TABLE_NAME, vec![table::normal_table_record_batch()], @@ -529,7 +518,6 @@ mod tests { // Write to the time series table. local_data_folder - .delta_lake .write_compressed_segments_to_time_series_table( TIME_SERIES_TABLE_NAME, vec![table::compressed_segments_record_batch()], @@ -547,7 +535,6 @@ mod tests { /// Return the total size of the files in the table with `table_name` in `local_data_folder`. async fn table_files_size(local_data_folder: &DataFolder, table_name: &str) -> u64 { let delta_table = local_data_folder - .delta_lake .delta_table(table_name) .await .unwrap(); @@ -567,7 +554,7 @@ mod tests { ) -> (TempDir, DataTransfer) { let target_dir = tempfile::tempdir().unwrap(); let target_dir_url = target_dir.path().to_str().unwrap(); - let remote_data_folder = DataFolder::try_from_local_url(target_dir_url) + let remote_data_folder = DataFolder::open_local_url(target_dir_url) .await .unwrap(); diff --git a/crates/modelardb_server/src/storage/mod.rs b/crates/modelardb_server/src/storage/mod.rs index 5d574cade..65c8e6f0c 100644 --- a/crates/modelardb_server/src/storage/mod.rs +++ b/crates/modelardb_server/src/storage/mod.rs @@ -19,7 +19,7 @@ //! metadata and models in in-memory buffers to batch them before saving them to immutable Apache //! Parquet files. The path to the Apache Parquet files containing relevant compressed data points //! for a query can be retrieved by the query engine using -//! [`DeltaLake`](modelardb_storage::delta_lake::DeltaLake). +//! [`DataFolder`](modelardb_storage::data_folder::DataFolder). mod compressed_data_buffer; mod compressed_data_manager; diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index 077686be0..a352159d1 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -24,13 +24,13 @@ use std::sync::atomic::{AtomicU64, Ordering}; use dashmap::DashMap; use futures::StreamExt; +use modelardb_storage::data_folder::DataFolder; use modelardb_types::types::{TimeSeriesTableMetadata, Timestamp, Value}; use object_store::path::{Path, PathPart}; use tokio::runtime::Handle; use tracing::{debug, error, warn}; use crate::context::Context; -use crate::data_folders::DataFolder; use crate::error::Result; use crate::storage::UNCOMPRESSED_DATA_FOLDER; use crate::storage::compressed_data_buffer::CompressedSegmentBatch; @@ -86,7 +86,7 @@ impl UncompressedDataManager { /// Add references to the [`UncompressedDataBuffers`](UncompressedDataBuffer) currently on disk /// to [`UncompressedDataManager`] which immediately will start compressing them. pub(super) async fn initialize(&self, context: &Context) -> Result<()> { - let local_data_folder = self.local_data_folder.delta_lake.object_store(); + let local_data_folder = self.local_data_folder.object_store(); let mut spilled_buffers = local_data_folder.list(Some(&Path::from(UNCOMPRESSED_DATA_FOLDER))); @@ -398,7 +398,7 @@ impl UncompressedDataManager { .1; let maybe_uncompressed_on_disk_data_buffer = uncompressed_in_memory_data_buffer - .spill_to_apache_parquet(self.local_data_folder.delta_lake.object_store()) + .spill_to_apache_parquet(self.local_data_folder.object_store()) .await; // If an error occurs the in-memory buffer must be re-added to the map before returning. 
@@ -683,7 +683,7 @@ mod tests { async fn test_can_compress_existing_on_disk_data_buffers_when_initializing() { let temp_dir = tempfile::tempdir().unwrap(); let temp_dir_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(temp_dir_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); // Create a context with a storage engine. let context = Arc::new( @@ -727,7 +727,6 @@ mod tests { let spilled_buffers = storage_engine .uncompressed_data_manager .local_data_folder - .delta_lake .object_store() .list(Some(&Path::from(UNCOMPRESSED_DATA_FOLDER))) .collect::>() @@ -1069,7 +1068,6 @@ mod tests { // The UncompressedDataBuffer should be spilled to tag hash in the uncompressed folder. let spilled_buffers = data_manager .local_data_folder - .delta_lake .object_store() .list(Some(&Path::from(UNCOMPRESSED_DATA_FOLDER))) .collect::>() @@ -1281,13 +1279,12 @@ mod tests { temp_dir: &TempDir, ) -> (UncompressedDataManager, Arc) { let temp_dir_url = temp_dir.path().to_str().unwrap(); - let local_data_folder = DataFolder::try_from_local_url(temp_dir_url).await.unwrap(); + let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); // Ensure the expected metadata is available through the metadata manager. let time_series_table_metadata = table::time_series_table_metadata(); local_data_folder - .delta_lake .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_storage/src/delta_lake.rs b/crates/modelardb_storage/src/data_folder.rs similarity index 92% rename from crates/modelardb_storage/src/delta_lake.rs rename to crates/modelardb_storage/src/data_folder.rs index cb24bec23..0295fec82 100644 --- a/crates/modelardb_storage/src/delta_lake.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -68,14 +68,12 @@ enum TableType { } /// Functionality for managing Delta Lake tables in a local folder or an object store. -pub struct DeltaLake { +#[derive(Clone)] +pub struct DataFolder { /// URL to access the root of the Delta Lake. location: String, /// Storage options required to access Delta Lake. storage_options: HashMap, - /// Connection information saved as bytes to make it possible to transfer the information using - /// Apache Arrow Flight. Only set to [`Some`] by [`try_remote_from_connection_info()`]. - maybe_connection_info: Option>, /// [`ObjectStore`] to access the root of the Delta Lake. object_store: Arc, /// Cache of Delta tables to avoid opening the same table multiple times. @@ -84,8 +82,8 @@ pub struct DeltaLake { session_context: Arc, } -impl DeltaLake { - /// Create a new [`DeltaLake`] that manages the Delta tables at `local_url`. If `local_url` has +impl DataFolder { + /// Create a new [`DataFolder`] that manages the Delta tables at `local_url`. If `local_url` has /// the schema `file` or no schema, the Delta tables are managed in a local data folder. If /// `local_url` has the schema `memory`, the Delta tables are managed in memory. Return /// [`ModelarDbStorageError`] if `local_url` cannot be parsed or the metadata tables cannot be @@ -101,23 +99,22 @@ impl DeltaLake { } } - /// Create a new [`DeltaLake`] that manages the Delta tables in memory. + /// Create a new [`DataFolder`] that manages the Delta tables in memory. 
pub async fn open_memory() -> Result { - let delta_lake = Self { + let data_folder = Self { location: "memory:///modelardb".to_owned(), storage_options: HashMap::new(), - maybe_connection_info: None, object_store: Arc::new(InMemory::new()), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; - delta_lake.create_and_register_metadata_tables().await?; + data_folder.create_and_register_metadata_tables().await?; - Ok(delta_lake) + Ok(data_folder) } - /// Create a new [`DeltaLake`] that manages the Delta tables in `data_folder_path`. Returns a + /// Create a new [`DataFolder`] that manages the Delta tables in `data_folder_path`. Returns a /// [`ModelarDbStorageError`] if `data_folder_path` does not exist and could not be created or /// the metadata tables cannot be created. pub async fn open_local(data_folder_path: &StdPath) -> Result { @@ -135,21 +132,20 @@ impl DeltaLake { .ok_or_else(|| DeltaTableError::generic("Local data folder path is not UTF-8."))? .to_owned(); - let delta_lake = Self { + let data_folder = Self { location, storage_options: HashMap::new(), - maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; - delta_lake.create_and_register_metadata_tables().await?; + data_folder.create_and_register_metadata_tables().await?; - Ok(delta_lake) + Ok(data_folder) } - /// Create a new [`DeltaLake`] that manages Delta tables in the remote object store given by + /// Create a new [`DataFolder`] that manages Delta tables in the remote object store given by /// `storage_configuration`. Returns [`ModelarDbStorageError`] if a connection to the specified /// object store could not be created. pub async fn open_object_store( @@ -184,7 +180,7 @@ impl DeltaLake { } } - /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an + /// Create a new [`DataFolder`] that manages the Delta tables in an object store with an /// S3-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object store /// could not be made or the metadata tables cannot be created. pub async fn open_s3( @@ -221,21 +217,20 @@ impl DeltaLake { ) .build()?; - let delta_lake = DeltaLake { + let data_folder = DataFolder { location, storage_options, - maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; - delta_lake.create_and_register_metadata_tables().await?; + data_folder.create_and_register_metadata_tables().await?; - Ok(delta_lake) + Ok(data_folder) } - /// Create a new [`DeltaLake`] that manages the Delta tables in an object store with an + /// Create a new [`DataFolder`] that manages the Delta tables in an object store with an /// Azure-compatible API. Returns a [`ModelarDbStorageError`] if a connection to the object /// store could not be made or the metadata tables cannot be created. 
pub async fn open_azure( @@ -255,18 +250,17 @@ impl DeltaLake { .map_err(|error| ModelarDbStorageError::InvalidArgument(error.to_string()))?; let (object_store, _path) = object_store::parse_url_opts(&url, &storage_options)?; - let delta_lake = DeltaLake { + let data_folder = DataFolder { location, storage_options, - maybe_connection_info: None, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), session_context: Arc::new(SessionContext::new()), }; - delta_lake.create_and_register_metadata_tables().await?; + data_folder.create_and_register_metadata_tables().await?; - Ok(delta_lake) + Ok(data_folder) } /// If they do not already exist, create the tables in the metadata Delta Lake for normal table @@ -366,13 +360,6 @@ impl DeltaLake { Ok(()) } - /// Return connection information saved as bytes to make it possible to transfer the information - /// using Apache Arrow Flight. Only returns [`Some`] if [`DeltaLake] was created by - /// [`try_remote_from_connection_info()`]. - pub fn connection_info(&self) -> &Option> { - &self.maybe_connection_info - } - /// Return the session context used to query the tables using Apache DataFusion. pub fn session_context(&self) -> &SessionContext { &self.session_context @@ -1304,16 +1291,16 @@ mod tests { use modelardb_types::types::{ArrowTimestamp}; use tempfile::TempDir; - // Tests for DeltaLake. + // Tests for DataFolder. #[tokio::test] - async fn test_create_metadata_delta_lake_tables() { + async fn test_create_metadata_data_folder_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::open_local(temp_dir.path()) + let data_folder = DataFolder::open_local(temp_dir.path()) .await .unwrap(); assert!( - delta_lake + data_folder .session_context .sql("SELECT table_name FROM normal_table_metadata") .await @@ -1321,14 +1308,14 @@ mod tests { ); assert!( - delta_lake + data_folder .session_context .sql("SELECT table_name, query_schema FROM time_series_table_metadata") .await .is_ok() ); - assert!(delta_lake + assert!(data_folder .session_context .sql("SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ generated_column_expr FROM time_series_table_field_columns") @@ -1338,15 +1325,15 @@ mod tests { #[tokio::test] async fn test_normal_table_is_normal_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; - assert!(delta_lake.is_normal_table("normal_table_1").await.unwrap()); + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; + assert!(data_folder.is_normal_table("normal_table_1").await.unwrap()); } #[tokio::test] async fn test_time_series_table_is_not_normal_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; assert!( - !delta_lake + !data_folder .is_normal_table(test::TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -1355,9 +1342,9 @@ mod tests { #[tokio::test] async fn test_time_series_table_is_time_series_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; assert!( - delta_lake + data_folder .is_time_series_table(test::TIME_SERIES_TABLE_NAME) .await .unwrap() @@ -1366,9 +1353,9 @@ mod tests { #[tokio::test] async fn test_normal_table_is_not_time_series_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let 
(_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; assert!( - !delta_lake + !data_folder .is_time_series_table("normal_table_1") .await .unwrap() @@ -1377,15 +1364,15 @@ mod tests { #[tokio::test] async fn test_table_names() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; let time_series_table_metadata = test::time_series_table_metadata(); - delta_lake + data_folder .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); - let table_names = delta_lake.table_names().await.unwrap(); + let table_names = data_folder.table_names().await.unwrap(); assert_eq!( table_names, vec![ @@ -1398,27 +1385,27 @@ mod tests { #[tokio::test] async fn test_normal_table_names() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; - let normal_table_names = delta_lake.normal_table_names().await.unwrap(); + let normal_table_names = data_folder.normal_table_names().await.unwrap(); assert_eq!(normal_table_names, vec!["normal_table_2", "normal_table_1"]); } #[tokio::test] async fn test_time_series_table_names() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - let time_series_table_names = delta_lake.time_series_table_names().await.unwrap(); + let time_series_table_names = data_folder.time_series_table_names().await.unwrap(); assert_eq!(time_series_table_names, vec![test::TIME_SERIES_TABLE_NAME]); } #[tokio::test] async fn test_save_normal_table_metadata() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; // Retrieve the normal table from the metadata Delta Lake. let sql = "SELECT table_name FROM normal_table_metadata ORDER BY table_name"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1430,11 +1417,11 @@ mod tests { #[tokio::test] async fn test_save_time_series_table_metadata() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; // Check that a row has been added to the time_series_table_metadata table. let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1453,7 +1440,7 @@ mod tests { // Check that a row has been added to the time_series_table_field_columns table for each field column. 
let sql = "SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ generated_column_expr FROM time_series_table_field_columns ORDER BY column_name"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1479,16 +1466,16 @@ mod tests { #[tokio::test] async fn test_drop_normal_table_metadata() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; - delta_lake + data_folder .drop_table_metadata("normal_table_2") .await .unwrap(); // Verify that normal_table_2 was deleted from the normal_table_metadata table. let sql = "SELECT table_name FROM normal_table_metadata"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1497,16 +1484,16 @@ mod tests { #[tokio::test] async fn test_drop_time_series_table_metadata() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - delta_lake + data_folder .drop_table_metadata(test::TIME_SERIES_TABLE_NAME) .await .unwrap(); // Verify that the time series table was deleted from the time_series_table_metadata table. let sql = "SELECT table_name FROM time_series_table_metadata"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1514,7 +1501,7 @@ mod tests { // Verify that the field columns were deleted from the time_series_table_field_columns table. let sql = "SELECT table_name FROM time_series_table_field_columns"; - let batch = sql_and_concat(&delta_lake.session_context, sql) + let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1523,40 +1510,40 @@ mod tests { #[tokio::test] async fn test_drop_table_metadata_for_missing_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_normal_tables().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; assert!( - delta_lake + data_folder .drop_table_metadata("missing_table") .await .is_err() ); } - async fn create_delta_lake_and_save_normal_tables() -> (TempDir, DeltaLake) { + async fn create_data_folder_and_save_normal_tables() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::open_local(temp_dir.path()) + let data_folder = DataFolder::open_local(temp_dir.path()) .await .unwrap(); - delta_lake + data_folder .save_normal_table_metadata("normal_table_1") .await .unwrap(); - delta_lake + data_folder .save_normal_table_metadata("normal_table_2") .await .unwrap(); - (temp_dir, delta_lake) + (temp_dir, data_folder) } #[tokio::test] async fn test_time_series_table_metadata() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - let time_series_table_metadata = delta_lake.time_series_table_metadata().await.unwrap(); + let time_series_table_metadata = data_folder.time_series_table_metadata().await.unwrap(); assert_eq!( time_series_table_metadata.first().unwrap().name, @@ -1566,9 +1553,9 @@ mod tests { #[tokio::test] async fn test_time_series_table_metadata_for_existing_time_series_table() { - let (_temp_dir, delta_lake) = 
create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - let time_series_table_metadata = delta_lake + let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table(test::TIME_SERIES_TABLE_NAME) .await .unwrap(); @@ -1581,9 +1568,9 @@ mod tests { #[tokio::test] async fn test_time_series_table_metadata_for_missing_time_series_table() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - let time_series_table_metadata = delta_lake + let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table("missing_table") .await; @@ -1592,9 +1579,9 @@ mod tests { #[tokio::test] async fn test_error_bound() { - let (_temp_dir, delta_lake) = create_delta_lake_and_save_time_series_table().await; + let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; - let error_bounds = delta_lake + let error_bounds = data_folder .error_bounds(test::TIME_SERIES_TABLE_NAME, 4) .await .unwrap(); @@ -1614,7 +1601,7 @@ mod tests { #[tokio::test] async fn test_generated_columns() { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::open_local(temp_dir.path()) + let data_folder = DataFolder::open_local(temp_dir.path()) .await .unwrap(); @@ -1653,7 +1640,7 @@ mod tests { ) .unwrap(); - delta_lake + data_folder .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); @@ -1662,7 +1649,7 @@ mod tests { .query_schema .to_dfschema() .unwrap(); - let generated_columns = delta_lake + let generated_columns = data_folder .generated_columns("generated_columns_table", &df_schema) .await .unwrap(); @@ -1682,19 +1669,19 @@ mod tests { ); } - async fn create_delta_lake_and_save_time_series_table() -> (TempDir, DeltaLake) { + async fn create_data_folder_and_save_time_series_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = DeltaLake::open_local(temp_dir.path()) + let data_folder = DataFolder::open_local(temp_dir.path()) .await .unwrap(); // Save a time series table to the metadata Delta Lake. let time_series_table_metadata = test::time_series_table_metadata(); - delta_lake + data_folder .save_time_series_table_metadata(&time_series_table_metadata) .await .unwrap(); - (temp_dir, delta_lake) + (temp_dir, data_folder) } } diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 493a3e85b..13c0e7c55 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -16,7 +16,7 @@ //! Utility functions to register metadata tables, normal tables, and time series tables with Apache //! DataFusion and to read and write Apache Parquet files to and from an object store. 
-pub mod delta_lake; +pub mod data_folder; pub mod error; mod optimizer; pub mod parser; diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 11bb2f08c..7ed15d4fa 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -641,7 +641,7 @@ mod tests { use tempfile::TempDir; use tonic::async_trait; - use crate::delta_lake::DeltaLake; + use crate::data_folder::DataFolder; use crate::optimizer; use crate::query::grid_exec::GridExec; use crate::query::time_series_table::TimeSeriesTable; @@ -777,7 +777,7 @@ mod tests { ) -> Arc { // Setup access to data and metadata in data folder. let data_folder_path = temp_dir.path(); - let delta_lake = DeltaLake::open_local(data_folder_path) + let data_folder = DataFolder::open_local(data_folder_path) .await .unwrap(); @@ -796,7 +796,7 @@ mod tests { // Create time series table. let time_series_table_metadata = table::time_series_table_metadata_arc(); - let delta_table = delta_lake + let delta_table = data_folder .create_time_series_table(&time_series_table_metadata) .await .unwrap(); diff --git a/crates/modelardb_storage/src/query/metadata_table.rs b/crates/modelardb_storage/src/query/metadata_table.rs index 5a814070f..d93bce269 100644 --- a/crates/modelardb_storage/src/query/metadata_table.rs +++ b/crates/modelardb_storage/src/query/metadata_table.rs @@ -70,7 +70,7 @@ impl TableProvider for MetadataTable { filters: &[Expr], limit: Option, ) -> DataFusionResult> { - // Clone the Delta Lake table and update it to the latest version. self.delta_lake.load( + // Clone the Delta Lake table and update it to the latest version. self.data_folder.load( // &mut self) is not an option due to TypeProvider::scan(&self, ...). Storing the DeltaTable // in a Mutex and RwLock is also not an option since most of the methods in TypeProvider // return a reference and the locks will be dropped at the end of the method. diff --git a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index d256c4f79..c6ff4d20a 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -100,7 +100,7 @@ impl TableProvider for NormalTable { filters: &[Expr], limit: Option, ) -> DataFusionResult> { - // Clone the Delta Lake table and update it to the latest version. self.delta_lake.load( + // Clone the Delta Lake table and update it to the latest version. self.data_folder.load( // &mut self) is not an option due to TypeProvider::scan(&self, ...). Storing the DeltaTable // in a Mutex and RwLock is also not an option since most of the methods in TypeProvider // return a reference and the locks will be dropped at the end of the method. diff --git a/crates/modelardb_storage/src/query/time_series_table.rs b/crates/modelardb_storage/src/query/time_series_table.rs index afa742479..1b6392520 100644 --- a/crates/modelardb_storage/src/query/time_series_table.rs +++ b/crates/modelardb_storage/src/query/time_series_table.rs @@ -515,7 +515,7 @@ impl TableProvider for TimeSeriesTable { let query_schema = &self.time_series_table_metadata.query_schema; let generated_columns = &self.time_series_table_metadata.generated_columns; - // Clone the Delta Lake table and update it to the latest version. self.delta_lake.load( + // Clone the Delta Lake table and update it to the latest version. 
self.data_folder.load( // &mut self) is not an option due to TypeProvider::scan(&self, ...). Storing the DeltaTable // in a Mutex and RwLock is also not an option since most of the methods in TypeProvider // return a reference and the locks will be dropped at the end of the method. From 804f69a5bf3a7761edfa296248f0ee5cd6d8baba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 13:05:56 +0000 Subject: [PATCH 18/31] Remove TableMetadataManager --- crates/modelardb_storage/src/data_folder.rs | 46 +- crates/modelardb_storage/src/metadata/mod.rs | 19 - .../src/metadata/table_metadata_manager.rs | 909 ------------------ 3 files changed, 22 insertions(+), 952 deletions(-) delete mode 100644 crates/modelardb_storage/src/metadata/mod.rs delete mode 100644 crates/modelardb_storage/src/metadata/table_metadata_manager.rs diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index 0295fec82..feae3bd0b 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -46,7 +46,10 @@ use futures::{StreamExt, TryStreamExt}; use modelardb_types::flight::protocol; use modelardb_types::functions::{try_convert_bytes_to_schema, try_convert_schema_to_bytes}; use modelardb_types::schemas::{COMPRESSED_SCHEMA, FIELD_COLUMN}; -use modelardb_types::types::{ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, MAX_RETENTION_PERIOD_IN_SECONDS}; +use modelardb_types::types::{ + ArrowValue, ErrorBound, GeneratedColumn, MAX_RETENTION_PERIOD_IN_SECONDS, + TimeSeriesTableMetadata, +}; use object_store::ObjectStore; use object_store::aws::AmazonS3Builder; use object_store::local::LocalFileSystem; @@ -330,12 +333,13 @@ impl DataFolder { /// Register all normal tables and time series tables in `self` with its [`SessionContext`]. /// `data_sink` set as the [`DataSink`] for all of the tables. If the tables could not be /// registered, [`ModelarDbStorageError`] is returned. - pub async fn register_normal_and_time_series_tables(&self, data_sink: Arc) -> Result<()> { + pub async fn register_normal_and_time_series_tables( + &self, + data_sink: Arc, + ) -> Result<()> { // Register normal tables. for normal_table_name in self.normal_table_names().await? { - let delta_table = self - .delta_table(&normal_table_name) - .await?; + let delta_table = self.delta_table(&normal_table_name).await?; crate::register_normal_table( &self.session_context, @@ -467,7 +471,8 @@ impl DataFolder { /// Return the schema of the table with the name in `table_name` if it is a normal table. If the /// table does not exist or the table is not a normal table, return [`None`]. pub async fn normal_table_schema(&self, table_name: &str) -> Option { - if self.is_normal_table(table_name) + if self + .is_normal_table(table_name) .await .is_ok_and(|is_normal_table| is_normal_table) { @@ -510,7 +515,11 @@ impl DataFolder { /// the table does not exist.
pub async fn table_writer(&self, table_name: &str) -> Result { let delta_table = self.delta_table(table_name).await?; - if self.time_series_table_metadata_for_registered_time_series_table(table_name).await.is_some() { + if self + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await + .is_some() + { self.time_series_table_writer(delta_table) .await .map_err(|error| error.into()) @@ -1288,16 +1297,14 @@ mod tests { use datafusion::common::ScalarValue::Int64; use datafusion::logical_expr::Expr::Literal; use modelardb_test::table as test; - use modelardb_types::types::{ArrowTimestamp}; + use modelardb_types::types::ArrowTimestamp; use tempfile::TempDir; // Tests for DataFolder. #[tokio::test] async fn test_create_metadata_data_folder_tables() { let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()) - .await - .unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); assert!( data_folder @@ -1522,9 +1529,7 @@ mod tests { async fn create_data_folder_and_save_normal_tables() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()) - .await - .unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); data_folder .save_normal_table_metadata("normal_table_1") @@ -1601,9 +1606,7 @@ mod tests { #[tokio::test] async fn test_generated_columns() { let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()) - .await - .unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let query_schema = Arc::new(Schema::new(vec![ Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), @@ -1614,10 +1617,7 @@ mod tests { Field::new("generated_column_2", ArrowValue::DATA_TYPE, false), ])); - let error_bounds = vec![ - ErrorBound::Lossless; - query_schema.fields.len() - ]; + let error_bounds = vec![ErrorBound::Lossless; query_schema.fields.len()]; let plus_one_column = Some(GeneratedColumn { expr: col("field_1") + Literal(Int64(Some(1))), @@ -1671,9 +1671,7 @@ mod tests { async fn create_data_folder_and_save_time_series_table() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()) - .await - .unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); // Save a time series table to the metadata Delta Lake. let time_series_table_metadata = test::time_series_table_metadata(); diff --git a/crates/modelardb_storage/src/metadata/mod.rs b/crates/modelardb_storage/src/metadata/mod.rs deleted file mode 100644 index 088d50532..000000000 --- a/crates/modelardb_storage/src/metadata/mod.rs +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright 2023 The ModelarDB Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//! Implementation of [`TableMetadataManager`](table_metadata_manager::TableMetadataManager) which -//! 
provides functionality to access table related metadata in the metadata Delta Lake. - -pub mod table_metadata_manager; diff --git a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs b/crates/modelardb_storage/src/metadata/table_metadata_manager.rs deleted file mode 100644 index cbb1660d0..000000000 --- a/crates/modelardb_storage/src/metadata/table_metadata_manager.rs +++ /dev/null @@ -1,909 +0,0 @@ -/* Copyright 2024 The ModelarDB Contributors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//! Table metadata manager that includes functionality used to access both the server metadata Delta Lake -//! and the manager metadata Delta Lake. Note that the entire server metadata Delta Lake can be accessed -//! through this metadata manager, while it only supports a subset of the manager metadata Delta Lake. - -use std::sync::Arc; - -use arrow::array::{Array, BinaryArray, BooleanArray, Float32Array, Int16Array, StringArray}; -use arrow::datatypes::{ArrowPrimitiveType, DataType, Field, Schema}; -use datafusion::common::{DFSchema, ToDFSchema}; -use datafusion::logical_expr::{Expr, lit}; -use datafusion::prelude::{SessionContext, col}; -use datafusion_proto::bytes::Serializeable; -use modelardb_types::flight::protocol; -use modelardb_types::functions::{try_convert_bytes_to_schema, try_convert_schema_to_bytes}; -use modelardb_types::types::{ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata}; - -use crate::delta_lake::DeltaLake; -use crate::error::{ModelarDbStorageError, Result}; -use crate::{register_metadata_table, sql_and_concat}; - -/// Types of tables supported by ModelarDB. -enum TableType { - NormalTable, - TimeSeriesTable, -} - -/// Stores the metadata required for reading from and writing to the normal tables and time series -/// tables. The data that needs to be persisted is stored in the metadata Delta Lake. -pub struct TableMetadataManager { - /// Delta Lake with functionality to read and write to and from the metadata tables. - delta_lake: Arc, - /// Session context used to query the metadata Delta Lake tables using Apache DataFusion. - session_context: Arc, -} - -impl TableMetadataManager { - /// Create a new [`TableMetadataManager`] that saves the metadata to `delta_lake` If the - /// metadata tables could not be created, return [`ModelarDbStorageError`]. - pub async fn try_new(delta_lake: Arc) -> Result { - let table_metadata_manager = Self { - delta_lake, - session_context: Arc::new(SessionContext::new()), - }; - - table_metadata_manager - .create_and_register_metadata_delta_lake_tables() - .await?; - - Ok(table_metadata_manager) - } - - /// If they do not already exist, create the tables in the metadata Delta Lake for normal table - /// and time series table metadata and register them with the Apache DataFusion session context. - /// * The `normal_table_metadata` table contains the metadata for normal tables. - /// * The `time_series_table_metadata` table contains the main metadata for time series tables. 
- /// * The `time_series_table_field_columns` table contains the name, index, error bound value, - /// whether error bound is relative, and generation expression of the field columns in each - /// time series table. - /// - /// If the tables exist or were created, return [`Ok`], otherwise return - /// [`ModelarDbStorageError`]. - async fn create_and_register_metadata_delta_lake_tables(&self) -> Result<()> { - // Create and register the normal_table_metadata table if it does not exist. - let delta_table = self - .delta_lake - .create_metadata_table( - "normal_table_metadata", - &Schema::new(vec![Field::new("table_name", DataType::Utf8, false)]), - ) - .await?; - - register_metadata_table(&self.session_context, "normal_table_metadata", delta_table)?; - - // Create and register the time_series_table_metadata table if it does not exist. - let delta_table = self - .delta_lake - .create_metadata_table( - "time_series_table_metadata", - &Schema::new(vec![ - Field::new("table_name", DataType::Utf8, false), - Field::new("query_schema", DataType::Binary, false), - ]), - ) - .await?; - - register_metadata_table( - &self.session_context, - "time_series_table_metadata", - delta_table, - )?; - - // Create and register the time_series_table_field_columns table if it does not exist. Note - // that column_index will only use a maximum of 10 bits. generated_column_expr is NULL if - // the fields are stored as segments. - let delta_table = self - .delta_lake - .create_metadata_table( - "time_series_table_field_columns", - &Schema::new(vec![ - Field::new("table_name", DataType::Utf8, false), - Field::new("column_name", DataType::Utf8, false), - Field::new("column_index", DataType::Int16, false), - Field::new("error_bound_value", DataType::Float32, false), - Field::new("error_bound_is_relative", DataType::Boolean, false), - Field::new("generated_column_expr", DataType::Binary, true), - ]), - ) - .await?; - - register_metadata_table( - &self.session_context, - "time_series_table_field_columns", - delta_table, - )?; - - Ok(()) - } - - /// Return `true` if the table with `table_name` is a normal table, otherwise return `false`. - pub async fn is_normal_table(&self, table_name: &str) -> Result { - Ok(self - .normal_table_names() - .await? - .contains(&table_name.to_owned())) - } - - /// Return `true` if the table with `table_name` is a time series table, otherwise return `false`. - pub async fn is_time_series_table(&self, table_name: &str) -> Result { - Ok(self - .time_series_table_names() - .await? - .contains(&table_name.to_owned())) - } - - /// Return the name of each table currently in the metadata Delta Lake. If the table names - /// cannot be retrieved, [`ModelarDbStorageError`] is returned. - pub async fn table_names(&self) -> Result> { - let normal_table_names = self.normal_table_names().await?; - let time_series_table_names = self.time_series_table_names().await?; - - let mut table_names = normal_table_names; - table_names.extend(time_series_table_names); - - Ok(table_names) - } - - /// Return the name of each normal table currently in the metadata Delta Lake. Note that this - /// does not include time series tables. If the normal table names cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. - pub async fn normal_table_names(&self) -> Result> { - self.table_names_of_type(TableType::NormalTable).await - } - - /// Return the name of each time series table currently in the metadata Delta Lake. Note that - /// this does not include normal tables. 
If the time series table names cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. - pub async fn time_series_table_names(&self) -> Result> { - self.table_names_of_type(TableType::TimeSeriesTable).await - } - - /// Return the name of tables of `table_type`. Returns [`ModelarDbStorageError`] if the table - /// names cannot be retrieved. - async fn table_names_of_type(&self, table_type: TableType) -> Result> { - let table_type = match table_type { - TableType::NormalTable => "normal_table", - TableType::TimeSeriesTable => "time_series_table", - }; - - let sql = format!("SELECT table_name FROM {table_type}_metadata"); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let table_names = modelardb_types::array!(batch, 0, StringArray); - Ok(table_names.iter().flatten().map(str::to_owned).collect()) - } - - /// Save the created normal table to the metadata Delta Lake. This consists of adding a row to - /// the `normal_table_metadata` table with the `name` of the table. If the normal table metadata - /// was saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. - pub async fn save_normal_table_metadata(&self, name: &str) -> Result<()> { - self.delta_lake - .write_columns_to_metadata_table( - "normal_table_metadata", - vec![Arc::new(StringArray::from(vec![name]))], - ) - .await?; - - Ok(()) - } - - /// Save the created time series table to the metadata Delta Lake. This includes adding a row to - /// the `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` - /// table for each field column. - pub async fn save_time_series_table_metadata( - &self, - time_series_table_metadata: &TimeSeriesTableMetadata, - ) -> Result<()> { - // Convert the query schema to bytes, so it can be saved in the metadata Delta Lake. - let query_schema_bytes = - try_convert_schema_to_bytes(&time_series_table_metadata.query_schema)?; - - // Add a new row in the time_series_table_metadata table to persist the time series table. - self.delta_lake - .write_columns_to_metadata_table( - "time_series_table_metadata", - vec![ - Arc::new(StringArray::from(vec![ - time_series_table_metadata.name.clone(), - ])), - Arc::new(BinaryArray::from_vec(vec![&query_schema_bytes])), - ], - ) - .await?; - - // Add a row for each field column to the time_series_table_field_columns table. - for (query_schema_index, field) in time_series_table_metadata - .query_schema - .fields() - .iter() - .enumerate() - { - if field.data_type() == &ArrowValue::DATA_TYPE { - // Convert the generated column expression to bytes, if it exists. - let maybe_generated_column_expr = match time_series_table_metadata - .generated_columns - .get(query_schema_index) - { - Some(Some(generated_column)) => { - Some(generated_column.expr.to_bytes()?.to_vec()) - } - _ => None, - }; - - // error_bounds matches schema and not query_schema to simplify looking up the error - // bound during ingestion as it occurs far more often than creation of time series tables. - let (error_bound_value, error_bound_is_relative) = if let Ok(schema_index) = - time_series_table_metadata.schema.index_of(field.name()) - { - match time_series_table_metadata.error_bounds[schema_index] { - ErrorBound::Absolute(value) => (value, false), - ErrorBound::Relative(value) => (value, true), - ErrorBound::Lossless => (0.0, false), - } - } else { - (0.0, false) - }; - - // query_schema_index is simply cast as a time series table contains at most 32767 columns. 
- self.delta_lake - .write_columns_to_metadata_table( - "time_series_table_field_columns", - vec![ - Arc::new(StringArray::from(vec![ - time_series_table_metadata.name.clone(), - ])), - Arc::new(StringArray::from(vec![field.name().clone()])), - Arc::new(Int16Array::from(vec![query_schema_index as i16])), - Arc::new(Float32Array::from(vec![error_bound_value])), - Arc::new(BooleanArray::from(vec![error_bound_is_relative])), - Arc::new(BinaryArray::from_opt_vec(vec![ - maybe_generated_column_expr.as_deref(), - ])), - ], - ) - .await?; - } - } - - Ok(()) - } - - /// Depending on the type of the table with `table_name`, drop either the normal table - /// metadata or the time series table metadata from the metadata Delta Lake. If the table does - /// not exist or the metadata could not be dropped, [`ModelarDbStorageError`] is returned. - pub async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { - if self.is_normal_table(table_name).await? { - self.drop_normal_table_metadata(table_name).await - } else if self.is_time_series_table(table_name).await? { - self.drop_time_series_table_metadata(table_name).await - } else { - Err(ModelarDbStorageError::InvalidArgument(format!( - "Table with name '{table_name}' does not exist." - ))) - } - } - - /// Drop the metadata for the normal table with `table_name` from the `normal_table_metadata` - /// table in the metadata Delta Lake. If the metadata could not be dropped, - /// [`ModelarDbStorageError`] is returned. - async fn drop_normal_table_metadata(&self, table_name: &str) -> Result<()> { - let delta_ops = self - .delta_lake - .metadata_delta_ops("normal_table_metadata") - .await?; - - delta_ops - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - Ok(()) - } - - /// Drop the metadata for the time series table with `table_name` from the metadata Delta Lake. - /// This includes deleting a row from the `time_series_table_metadata` table and deleting a row - /// from the `time_series_table_field_columns` table for each field column. If the metadata - /// could not be dropped, [`ModelarDbStorageError`] is returned. - async fn drop_time_series_table_metadata(&self, table_name: &str) -> Result<()> { - // Delete the table metadata from the time_series_table_metadata table. - self.delta_lake - .metadata_delta_ops("time_series_table_metadata") - .await? - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - // Delete the column metadata from the time_series_table_field_columns table. - self.delta_lake - .metadata_delta_ops("time_series_table_field_columns") - .await? - .delete() - .with_predicate(col("table_name").eq(lit(table_name))) - .await?; - - Ok(()) - } - - /// Return the [`TimeSeriesTableMetadata`] of each time series table currently in the metadata - /// Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. 
- pub async fn time_series_table_metadata(&self) -> Result>> { - let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; - let batch = sql_and_concat(&self.session_context, sql).await?; - - let mut time_series_table_metadata: Vec> = vec![]; - let table_name_array = modelardb_types::array!(batch, 0, StringArray); - let query_schema_bytes_array = modelardb_types::array!(batch, 1, BinaryArray); - - for row_index in 0..batch.num_rows() { - let table_name = table_name_array.value(row_index); - let query_schema_bytes = query_schema_bytes_array.value(row_index); - - let metadata = self - .time_series_table_metadata_row_to_time_series_table_metadata( - table_name, - query_schema_bytes, - ) - .await?; - - time_series_table_metadata.push(Arc::new(metadata)) - } - - Ok(time_series_table_metadata) - } - - /// Return the [`TimeSeriesTableMetadata`] for the time series table with `table_name` in the - /// metadata Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, - /// [`ModelarDbStorageError`] is returned. - pub async fn time_series_table_metadata_for_time_series_table( - &self, - table_name: &str, - ) -> Result { - let sql = format!( - "SELECT table_name, query_schema FROM time_series_table_metadata WHERE table_name = '{table_name}'" - ); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - if batch.num_rows() == 0 { - return Err(ModelarDbStorageError::InvalidArgument(format!( - "No metadata for time series table named '{table_name}'." - ))); - } - - let table_name_array = modelardb_types::array!(batch, 0, StringArray); - let query_schema_bytes_array = modelardb_types::array!(batch, 1, BinaryArray); - - let table_name = table_name_array.value(0); - let query_schema_bytes = query_schema_bytes_array.value(0); - - self.time_series_table_metadata_row_to_time_series_table_metadata( - table_name, - query_schema_bytes, - ) - .await - } - - /// Convert a row from the table "time_series_table_metadata" to an instance of - /// [`TimeSeriesTableMetadata`]. Returns [`ModelarDbStorageError`] if a time_series table with - /// `table_name` does not exist or the bytes in `query_schema_bytes` are not a valid schema. - async fn time_series_table_metadata_row_to_time_series_table_metadata( - &self, - table_name: &str, - query_schema_bytes: &[u8], - ) -> Result { - let query_schema = try_convert_bytes_to_schema(query_schema_bytes.into())?; - - let error_bounds = self - .error_bounds(table_name, query_schema.fields().len()) - .await?; - - let df_query_schema = query_schema.clone().to_dfschema()?; - let generated_columns = self.generated_columns(table_name, &df_query_schema).await?; - - TimeSeriesTableMetadata::try_new( - table_name.to_owned(), - Arc::new(query_schema), - error_bounds, - generated_columns, - ) - .map_err(|error| error.into()) - } - - /// Return the error bounds for the columns in the time series table with `table_name`. If a - /// time series table with `table_name` does not exist, [`ModelarDbStorageError`] is returned. 
- async fn error_bounds( - &self, - table_name: &str, - query_schema_columns: usize, - ) -> Result> { - let sql = format!( - "SELECT column_index, error_bound_value, error_bound_is_relative - FROM time_series_table_field_columns - WHERE table_name = '{table_name}' - ORDER BY column_index" - ); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let mut column_to_error_bound = vec![ErrorBound::Lossless; query_schema_columns]; - - let column_index_array = modelardb_types::array!(batch, 0, Int16Array); - let error_bound_value_array = modelardb_types::array!(batch, 1, Float32Array); - let error_bound_is_relative_array = modelardb_types::array!(batch, 2, BooleanArray); - - for row_index in 0..batch.num_rows() { - let error_bound_index = column_index_array.value(row_index); - let error_bound_value = error_bound_value_array.value(row_index); - let error_bound_is_relative = error_bound_is_relative_array.value(row_index); - - if error_bound_value != 0.0 { - let error_bound = if error_bound_is_relative { - ErrorBound::try_new_relative(error_bound_value) - } else { - ErrorBound::try_new_absolute(error_bound_value) - }?; - - column_to_error_bound[error_bound_index as usize] = error_bound; - } - } - - Ok(column_to_error_bound) - } - - /// Return the generated columns for the time series table with `table_name` and `df_schema`. If - /// a time series table with `table_name` does not exist, [`ModelarDbStorageError`] is returned. - async fn generated_columns( - &self, - table_name: &str, - df_schema: &DFSchema, - ) -> Result>> { - let sql = format!( - "SELECT column_index, generated_column_expr - FROM time_series_table_field_columns - WHERE table_name = '{table_name}' - ORDER BY column_index" - ); - let batch = sql_and_concat(&self.session_context, &sql).await?; - - let mut generated_columns = vec![None; df_schema.fields().len()]; - - let column_index_array = modelardb_types::array!(batch, 0, Int16Array); - let generated_column_expr_array = modelardb_types::array!(batch, 1, BinaryArray); - - for row_index in 0..batch.num_rows() { - let generated_column_index = column_index_array.value(row_index); - let expr_bytes = generated_column_expr_array.value(row_index); - - // If generated_column_expr is null, it is saved as empty bytes in the column values. - if !expr_bytes.is_empty() { - let expr = Expr::from_bytes(expr_bytes)?; - let generated_column = GeneratedColumn::try_from_expr(expr, df_schema)?; - - generated_columns[generated_column_index as usize] = Some(generated_column); - } - } - - Ok(generated_columns) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use arrow::datatypes::{ArrowPrimitiveType, Field}; - use datafusion::arrow::datatypes::DataType; - use datafusion::common::ScalarValue::Int64; - use datafusion::logical_expr::Expr::Literal; - use modelardb_test::table::{self, TIME_SERIES_TABLE_NAME}; - use modelardb_types::types::{ArrowTimestamp, ArrowValue}; - use tempfile::TempDir; - - // Tests for TableMetadataManager. 
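// Editorial sketch (not part of the patch): with TableMetadataManager removed, the operations
// exercised by the deleted tests below are now provided directly by DataFolder, as the updated
// data_folder.rs tests above show. A minimal usage sketch, assuming DataFolder::open_local,
// save_normal_table_metadata, and table_names are public as the surrounding hunks suggest:
async fn save_and_list_metadata() {
    let temp_dir = tempfile::tempdir().unwrap();

    // Opening the local data folder also creates and registers the metadata Delta Lake tables.
    let data_folder = modelardb_storage::data_folder::DataFolder::open_local(temp_dir.path())
        .await
        .unwrap();

    data_folder
        .save_normal_table_metadata("normal_table_1")
        .await
        .unwrap();

    assert_eq!(
        data_folder.table_names().await.unwrap(),
        vec!["normal_table_1".to_owned()]
    );
}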
- #[tokio::test] - async fn test_create_metadata_delta_lake_tables() { - let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); - let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); - - assert!( - metadata_manager - .session_context - .sql("SELECT table_name FROM normal_table_metadata") - .await - .is_ok() - ); - - assert!( - metadata_manager - .session_context - .sql("SELECT table_name, query_schema FROM time_series_table_metadata") - .await - .is_ok() - ); - - assert!(metadata_manager - .session_context - .sql("SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ - generated_column_expr FROM time_series_table_field_columns") - .await - .is_ok()); - } - - #[tokio::test] - async fn test_normal_table_is_normal_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - assert!( - metadata_manager - .is_normal_table("normal_table_1") - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_time_series_table_is_not_normal_table() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - assert!( - !metadata_manager - .is_normal_table(TIME_SERIES_TABLE_NAME) - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_time_series_table_is_time_series_table() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - assert!( - metadata_manager - .is_time_series_table(TIME_SERIES_TABLE_NAME) - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_normal_table_is_not_time_series_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - assert!( - !metadata_manager - .is_time_series_table("normal_table_1") - .await - .unwrap() - ); - } - - #[tokio::test] - async fn test_table_names() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - let time_series_table_metadata = table::time_series_table_metadata(); - metadata_manager - .save_time_series_table_metadata(&time_series_table_metadata) - .await - .unwrap(); - - let table_names = metadata_manager.table_names().await.unwrap(); - assert_eq!( - table_names, - vec!["normal_table_2", "normal_table_1", TIME_SERIES_TABLE_NAME] - ); - } - - #[tokio::test] - async fn test_normal_table_names() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - let normal_table_names = metadata_manager.normal_table_names().await.unwrap(); - assert_eq!(normal_table_names, vec!["normal_table_2", "normal_table_1"]); - } - - #[tokio::test] - async fn test_time_series_table_names() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - let time_series_table_names = metadata_manager.time_series_table_names().await.unwrap(); - assert_eq!(time_series_table_names, vec![TIME_SERIES_TABLE_NAME]); - } - - #[tokio::test] - async fn test_save_normal_table_metadata() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - // Retrieve the normal table from the metadata Delta Lake. 
- let sql = "SELECT table_name FROM normal_table_metadata ORDER BY table_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - StringArray::from(vec!["normal_table_1", "normal_table_2"]) - ); - } - - #[tokio::test] - async fn test_save_time_series_table_metadata() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - // Check that a row has been added to the time_series_table_metadata table. - let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - StringArray::from(vec![TIME_SERIES_TABLE_NAME]) - ); - assert_eq!( - **batch.column(1), - BinaryArray::from_vec(vec![ - &try_convert_schema_to_bytes(&table::time_series_table_metadata().query_schema) - .unwrap() - ]) - ); - - // Check that a row has been added to the time_series_table_field_columns table for each field column. - let sql = "SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ - generated_column_expr FROM time_series_table_field_columns ORDER BY column_name"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!( - **batch.column(0), - StringArray::from(vec![TIME_SERIES_TABLE_NAME, TIME_SERIES_TABLE_NAME]) - ); - assert_eq!( - **batch.column(1), - StringArray::from(vec!["field_1", "field_2"]) - ); - assert_eq!(**batch.column(2), Int16Array::from(vec![1, 2])); - assert_eq!(**batch.column(3), Float32Array::from(vec![1.0, 5.0])); - assert_eq!(**batch.column(4), BooleanArray::from(vec![false, true])); - assert_eq!( - **batch.column(5), - BinaryArray::from_opt_vec(vec![None, None]) - ); - } - - #[tokio::test] - async fn test_drop_normal_table_metadata() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - metadata_manager - .drop_table_metadata("normal_table_2") - .await - .unwrap(); - - // Verify that normal_table_2 was deleted from the normal_table_metadata table. - let sql = "SELECT table_name FROM normal_table_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!(**batch.column(0), StringArray::from(vec!["normal_table_1"])); - } - - #[tokio::test] - async fn test_drop_time_series_table_metadata() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - metadata_manager - .drop_table_metadata(TIME_SERIES_TABLE_NAME) - .await - .unwrap(); - - // Verify that the time series table was deleted from the time_series_table_metadata table. - let sql = "SELECT table_name FROM time_series_table_metadata"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!(batch.num_rows(), 0); - - // Verify that the field columns were deleted from the time_series_table_field_columns table. 
- let sql = "SELECT table_name FROM time_series_table_field_columns"; - let batch = sql_and_concat(&metadata_manager.session_context, sql) - .await - .unwrap(); - - assert_eq!(batch.num_rows(), 0); - } - - #[tokio::test] - async fn test_drop_table_metadata_for_missing_table() { - let (_temp_dir, metadata_manager) = create_metadata_manager_and_save_normal_tables().await; - - let result = metadata_manager.drop_table_metadata("missing_table").await; - - assert_eq!( - result.unwrap_err().to_string(), - "Invalid Argument Error: Table with name 'missing_table' does not exist." - ); - } - - async fn create_metadata_manager_and_save_normal_tables() -> (TempDir, TableMetadataManager) { - let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); - let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); - - metadata_manager - .save_normal_table_metadata("normal_table_1") - .await - .unwrap(); - - metadata_manager - .save_normal_table_metadata("normal_table_2") - .await - .unwrap(); - - (temp_dir, metadata_manager) - } - - #[tokio::test] - async fn test_time_series_table_metadata() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - let time_series_table_metadata = - metadata_manager.time_series_table_metadata().await.unwrap(); - - assert_eq!( - time_series_table_metadata.first().unwrap().name, - table::time_series_table_metadata().name, - ); - } - - #[tokio::test] - async fn test_time_series_table_metadata_for_existing_time_series_table() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - let time_series_table_metadata = metadata_manager - .time_series_table_metadata_for_time_series_table(TIME_SERIES_TABLE_NAME) - .await - .unwrap(); - - assert_eq!( - time_series_table_metadata.name, - table::time_series_table_metadata().name, - ); - } - - #[tokio::test] - async fn test_time_series_table_metadata_for_missing_time_series_table() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - let result = metadata_manager - .time_series_table_metadata_for_time_series_table("missing_table") - .await; - - assert_eq!( - result.unwrap_err().to_string(), - "Invalid Argument Error: No metadata for time series table named 'missing_table'." 
- ); - } - - #[tokio::test] - async fn test_error_bound() { - let (_temp_dir, metadata_manager) = - create_metadata_manager_and_save_time_series_table().await; - - let error_bounds = metadata_manager - .error_bounds(TIME_SERIES_TABLE_NAME, 4) - .await - .unwrap(); - - let values: Vec = error_bounds - .iter() - .map(|error_bound| match error_bound { - ErrorBound::Absolute(value) => *value, - ErrorBound::Relative(value) => *value, - ErrorBound::Lossless => 0.0, - }) - .collect(); - - assert_eq!(values, &[0.0, 1.0, 5.0, 0.0]); - } - - #[tokio::test] - async fn test_generated_columns() { - let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); - let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); - - let query_schema = Arc::new(Schema::new(vec![ - Field::new("generated_column_1", ArrowValue::DATA_TYPE, false), - Field::new("timestamp", ArrowTimestamp::DATA_TYPE, false), - Field::new("field_1", ArrowValue::DATA_TYPE, false), - Field::new("field_2", ArrowValue::DATA_TYPE, false), - Field::new("generated_column_2", ArrowValue::DATA_TYPE, false), - Field::new("tag", DataType::Utf8, false), - ])); - - let error_bounds = vec![ErrorBound::Lossless; query_schema.fields.len()]; - - let plus_one_column = Some(GeneratedColumn { - expr: col("field_1") + Literal(Int64(Some(1))), - source_columns: vec![2], - }); - - let addition_column = Some(GeneratedColumn { - expr: col("field_1") + col("field_2"), - source_columns: vec![2, 3], - }); - - let mut expected_generated_columns = - vec![plus_one_column, None, None, None, addition_column, None]; - - let time_series_table_metadata = TimeSeriesTableMetadata::try_new( - "generated_columns_table".to_owned(), - query_schema, - error_bounds, - expected_generated_columns.clone(), - ) - .unwrap(); - - metadata_manager - .save_time_series_table_metadata(&time_series_table_metadata) - .await - .unwrap(); - - let df_schema = time_series_table_metadata - .query_schema - .to_dfschema() - .unwrap(); - let mut generated_columns = metadata_manager - .generated_columns("generated_columns_table", &df_schema) - .await - .unwrap(); - - let mut actual_addition_column = generated_columns.remove(4).unwrap(); - let expected_addition_column = expected_generated_columns.remove(4).unwrap(); - - // Sort the source columns to ensure the order is consistent. - actual_addition_column.source_columns.sort(); - assert_eq!(actual_addition_column, expected_addition_column); - - assert_eq!(generated_columns, expected_generated_columns); - } - - async fn create_metadata_manager_and_save_time_series_table() -> (TempDir, TableMetadataManager) - { - let temp_dir = tempfile::tempdir().unwrap(); - let delta_lake = Arc::new(DeltaLake::try_from_local_path(temp_dir.path()).unwrap()); - let metadata_manager = TableMetadataManager::try_new(delta_lake).await.unwrap(); - - // Save a time series table to the metadata Delta Lake. 
- let time_series_table_metadata = table::time_series_table_metadata(); - metadata_manager - .save_time_series_table_metadata(&time_series_table_metadata) - .await - .unwrap(); - - (temp_dir, metadata_manager) - } -} From e590530e303198356d1bbb0bcc7f2bd31c6e4b62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 13:47:31 +0000 Subject: [PATCH 19/31] Remove duplicate SessionContexts --- .../src/operations/data_folder.rs | 3 +- crates/modelardb_server/src/context.rs | 29 ++++++++++--------- crates/modelardb_server/src/remote.rs | 6 ++-- crates/modelardb_storage/src/data_folder.rs | 8 ++--- .../src/optimizer/model_simple_aggregates.rs | 14 +-------- 5 files changed, 25 insertions(+), 35 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 5d216f229..171389ab3 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -30,7 +30,6 @@ use datafusion::error::DataFusionError; use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}; use datafusion::physical_plan::metrics::MetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, common}; -use datafusion::prelude::SessionContext; use futures::TryStreamExt; use modelardb_storage::data_folder::DataFolder; @@ -391,7 +390,7 @@ impl Operations for DataFolder { // Read data to copy from source_table_name in source. let source_table = Arc::new(self.delta_table(source_table_name).await?); - let session_context = SessionContext::new(); + let session_context = modelardb_storage::create_session_context(); session_context.register_table(source_table_name, source_table)?; let df = session_context.sql(&sql).await?; diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 62a1389ad..2afccbf95 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -20,7 +20,6 @@ use std::sync::Arc; use datafusion::arrow::datatypes::Schema; use datafusion::catalog::{SchemaProvider, TableProvider}; -use datafusion::prelude::SessionContext; use modelardb_types::types::TimeSeriesTableMetadata; use tokio::sync::RwLock; use tracing::info; @@ -37,8 +36,6 @@ pub struct Context { pub data_folders: DataFolders, /// Updatable configuration of the server. pub configuration_manager: Arc>, - /// Main interface for Apache DataFusion. - pub session_context: SessionContext, /// Manages all uncompressed and compressed data in the system. pub storage_engine: Arc>, } @@ -50,8 +47,6 @@ impl Context { pub async fn try_new(data_folders: DataFolders, cluster_mode: ClusterMode) -> Result { let configuration_manager = Arc::new(RwLock::new(ConfigurationManager::new(cluster_mode))); - let session_context = modelardb_storage::create_session_context(); - let storage_engine = Arc::new(RwLock::new( StorageEngine::try_new(data_folders.clone(), &configuration_manager).await?, )); @@ -59,7 +54,6 @@ impl Context { Ok(Context { data_folders, configuration_manager, - session_context, storage_engine, }) } @@ -167,8 +161,11 @@ impl Context { /// Register the normal table with `table_name` in Apache DataFusion. If the normal table does /// not exist or could not be registered with Apache DataFusion, return - /// [`ModelarDbServerError`]. + /// [`ModelarDbServerError`]. 
[`DataFolder.register_normal_and_time_series_tables()`] is not + /// used so a unique [`NormalTableDataSink`] can be passed per table. async fn register_normal_table(&self, table_name: &str) -> Result<()> { + let session_context = self.data_folders.query_data_folder.session_context(); + let delta_table = self .data_folders .query_data_folder @@ -182,7 +179,7 @@ impl Context { )); modelardb_storage::register_normal_table( - &self.session_context, + session_context, table_name, delta_table, normal_table_data_sink, @@ -212,13 +209,16 @@ impl Context { Ok(()) } - /// Register the time series table with `time_series_table_metadata` in Apache DataFusion. If the - /// time series table does not exist or could not be registered with Apache DataFusion, return - /// [`ModelarDbServerError`]. + /// Register the time series table with `time_series_table_metadata` in Apache DataFusion. If + /// the time series table does not exist or could not be registered with Apache DataFusion, + /// return [`ModelarDbServerError`]. [`DataFolder.register_normal_and_time_series_tables()`] is + /// not used so a unique [`TimeSeriesTableDataSink`] can be passed per table. async fn register_time_series_table( &self, time_series_table_metadata: Arc, ) -> Result<()> { + let session_context = self.data_folders.query_data_folder.session_context(); + let delta_table = self .data_folders .query_data_folder @@ -231,7 +231,7 @@ impl Context { )); modelardb_storage::register_time_series_table( - &self.session_context, + session_context, delta_table, time_series_table_metadata.clone(), time_series_table_data_sink, @@ -258,7 +258,8 @@ impl Context { // Deregister the table from the Apache DataFusion session context. This is done first to // avoid data being ingested into the table while it is being deleted. - self.session_context.deregister_table(table_name)?; + let session_context = self.data_folders.query_data_folder.session_context(); + session_context.deregister_table(table_name)?; self.drop_table_from_storage_engine(table_name).await?; @@ -389,7 +390,7 @@ impl Context { /// Return the default database schema if it exists, otherwise a [`ModelarDbServerError`] /// indicating at what level the lookup failed is returned. 
pub fn default_database_schema(&self) -> Result> { - let session_context = self.session_context.clone(); + let session_context = self.data_folders.query_data_folder.session_context(); let catalog = session_context.catalog("datafusion").ok_or_else(|| { ModelarDbServerError::InvalidState("Default catalog does not exist.".to_owned()) diff --git a/crates/modelardb_server/src/remote.rs b/crates/modelardb_server/src/remote.rs index 11ef4fc0d..a4ff1c03b 100644 --- a/crates/modelardb_server/src/remote.rs +++ b/crates/modelardb_server/src/remote.rs @@ -469,13 +469,15 @@ impl FlightService for FlightServiceHandler { Ok(empty_record_batch_stream()) } ModelarDbStatement::Statement(statement) => { - modelardb_storage::execute_statement(&self.context.session_context, statement) + let session_context = self.context.data_folders.query_data_folder.session_context(); + modelardb_storage::execute_statement(session_context, statement) .await .map_err(|error| error.into()) } ModelarDbStatement::IncludeSelect(statement, addresses) => { + let session_context = self.context.data_folders.query_data_folder.session_context(); let local_sendable_record_batch_stream = - modelardb_storage::execute_statement(&self.context.session_context, statement) + modelardb_storage::execute_statement(session_context, statement) .await .map_err(error_to_status_internal)?; diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index feae3bd0b..daaafca70 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -109,7 +109,7 @@ impl DataFolder { storage_options: HashMap::new(), object_store: Arc::new(InMemory::new()), delta_table_cache: DashMap::new(), - session_context: Arc::new(SessionContext::new()), + session_context: Arc::new(crate::create_session_context()), }; data_folder.create_and_register_metadata_tables().await?; @@ -140,7 +140,7 @@ impl DataFolder { storage_options: HashMap::new(), object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - session_context: Arc::new(SessionContext::new()), + session_context: Arc::new(crate::create_session_context()), }; data_folder.create_and_register_metadata_tables().await?; @@ -225,7 +225,7 @@ impl DataFolder { storage_options, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - session_context: Arc::new(SessionContext::new()), + session_context: Arc::new(crate::create_session_context()), }; data_folder.create_and_register_metadata_tables().await?; @@ -258,7 +258,7 @@ impl DataFolder { storage_options, object_store: Arc::new(object_store), delta_table_cache: DashMap::new(), - session_context: Arc::new(SessionContext::new()), + session_context: Arc::new(crate::create_session_context()), }; data_folder.create_and_register_metadata_tables().await?; diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index 7ed15d4fa..b08b1d05d 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -628,7 +628,6 @@ mod tests { use datafusion::arrow::datatypes::Schema; use datafusion::datasource::sink::DataSink; - use datafusion::execution::session_state::SessionStateBuilder; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_plan::aggregates::AggregateExec; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; @@ -636,13 
+635,11 @@ mod tests { use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::metrics::MetricsSet; use datafusion::physical_plan::{DisplayAs, DisplayFormatType}; - use datafusion::prelude::SessionContext; use modelardb_test::table::{self, TIME_SERIES_TABLE_NAME}; use tempfile::TempDir; use tonic::async_trait; use crate::data_folder::DataFolder; - use crate::optimizer; use crate::query::grid_exec::GridExec; use crate::query::time_series_table::TimeSeriesTable; @@ -782,16 +779,7 @@ mod tests { .unwrap(); // Setup access to Apache DataFusion. - let mut session_state_builder = SessionStateBuilder::new().with_default_features(); - - // Uses the rule method instead of the rules method as the rules method replaces the built-ins. - for physical_optimizer_rule in optimizer::physical_optimizer_rules() { - session_state_builder = - session_state_builder.with_physical_optimizer_rule(physical_optimizer_rule); - } - - let session_state = session_state_builder.build(); - let session_context = SessionContext::new_with_state(session_state); + let session_context = crate::create_session_context(); // Create time series table. let time_series_table_metadata = table::time_series_table_metadata_arc(); From 8ebf233c7716482fab0d553ac08ed35d1b7258bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 13:54:33 +0000 Subject: [PATCH 20/31] Fix Clippy warnings --- crates/modelardb_embedded/src/operations/data_folder.rs | 7 +++++++ crates/modelardb_manager/src/metadata.rs | 6 +++--- crates/modelardb_storage/src/data_folder.rs | 8 ++------ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 171389ab3..2faf63e4e 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -53,6 +53,13 @@ impl DataFolderDataSink { } } +impl Default for DataFolderDataSink { + // Trait implemented to silence clippy warning. + fn default() -> Self { + Self::new() + } +} + #[async_trait] impl DataSink for DataFolderDataSink { /// Return `self` as [`Any`] so it can be downcast. diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 31b5041f1..66758ca54 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -58,7 +58,7 @@ impl ManagerMetadata for DataFolder { ) .await?; - register_metadata_table(&self.session_context(), "manager_metadata", delta_table)?; + register_metadata_table(self.session_context(), "manager_metadata", delta_table)?; // Create and register the nodes table if it does not exist. let delta_table = self @@ -71,7 +71,7 @@ impl ManagerMetadata for DataFolder { ) .await?; - register_metadata_table(&self.session_context(), "nodes", delta_table)?; + register_metadata_table(self.session_context(), "nodes", delta_table)?; Ok(()) } @@ -81,7 +81,7 @@ impl ManagerMetadata for DataFolder { /// or created, return [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). 
async fn manager_key(&self) -> Result { let sql = "SELECT key FROM manager_metadata"; - let batch = sql_and_concat(&self.session_context(), sql).await?; + let batch = sql_and_concat(self.session_context(), sql).await?; let keys = modelardb_types::array!(batch, 0, StringArray); if keys.is_empty() { diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index daaafca70..2e058ea3a 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -520,13 +520,9 @@ impl DataFolder { .await .is_some() { - self.time_series_table_writer(delta_table) - .await - .map_err(|error| error.into()) + self.time_series_table_writer(delta_table).await } else { - self.normal_or_metadata_table_writer(delta_table) - .await - .map_err(|error| error.into()) + self.normal_or_metadata_table_writer(delta_table).await } } From 361cb747bb44393ffe505f9e82deea540f454908 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 14:34:29 +0000 Subject: [PATCH 21/31] Move metadata tables to the metadata schema --- crates/modelardb_manager/src/metadata.rs | 16 ++++++------ crates/modelardb_storage/src/data_folder.rs | 28 ++++++++++----------- crates/modelardb_storage/src/lib.rs | 14 ++++++++--- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 66758ca54..a26eafc5d 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -80,7 +80,7 @@ impl ManagerMetadata for DataFolder { /// already exist, create one and save it to the Delta Lake. If a key could not be retrieved /// or created, return [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). async fn manager_key(&self) -> Result { - let sql = "SELECT key FROM manager_metadata"; + let sql = "SELECT key FROM metadata.manager_metadata"; let batch = sql_and_concat(self.session_context(), sql).await?; let keys = modelardb_types::array!(batch, 0, StringArray); @@ -139,7 +139,7 @@ impl ManagerMetadata for DataFolder { async fn nodes(&self) -> Result> { let mut nodes: Vec = vec![]; - let sql = "SELECT url, mode FROM nodes"; + let sql = "SELECT url, mode FROM metadata.nodes"; let batch = sql_and_concat(self.session_context(), sql).await?; let url_array = modelardb_types::array!(batch, 0, StringArray); @@ -174,7 +174,7 @@ mod tests { assert!( data_folder .session_context() - .sql("SELECT key FROM manager_metadata") + .sql("SELECT key FROM metadata.manager_metadata") .await .is_ok() ); @@ -182,7 +182,7 @@ mod tests { assert!( data_folder .session_context() - .sql("SELECT url, mode FROM nodes") + .sql("SELECT url, mode FROM metadata.nodes") .await .is_ok() ); @@ -195,7 +195,7 @@ mod tests { // Verify that the manager key is created and saved correctly. 
let manager_key = data_folder.manager_key().await.unwrap(); - let sql = "SELECT key FROM manager_metadata"; + let sql = "SELECT key FROM metadata.manager_metadata"; let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -214,7 +214,7 @@ mod tests { let manager_key_1 = data_folder.manager_key().await.unwrap(); let manager_key_2 = data_folder.manager_key().await.unwrap(); - let sql = "SELECT key FROM manager_metadata"; + let sql = "SELECT key FROM metadata.manager_metadata"; let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -234,7 +234,7 @@ mod tests { data_folder.save_node(node_2.clone()).await.unwrap(); // Verify that the nodes are saved correctly. - let sql = "SELECT url, mode FROM nodes"; + let sql = "SELECT url, mode FROM metadata.nodes"; let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); @@ -262,7 +262,7 @@ mod tests { data_folder.remove_node(&node_1.url).await.unwrap(); // Verify that node_1 is removed correctly. - let sql = "SELECT url, mode FROM nodes"; + let sql = "SELECT url, mode FROM metadata.nodes"; let batch = sql_and_concat(data_folder.session_context(), sql) .await .unwrap(); diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index 2e058ea3a..c7edcbca7 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -503,7 +503,7 @@ impl DataFolder { TableType::TimeSeriesTable => "time_series_table", }; - let sql = format!("SELECT table_name FROM {table_type}_metadata"); + let sql = format!("SELECT table_name FROM metadata.{table_type}_metadata"); let batch = sql_and_concat(&self.session_context, &sql).await?; let table_names = modelardb_types::array!(batch, 0, StringArray); @@ -960,7 +960,7 @@ impl DataFolder { /// Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, /// [`ModelarDbStorageError`] is returned. 
pub async fn time_series_table_metadata(&self) -> Result>> { - let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; + let sql = "SELECT table_name, query_schema FROM metadata.time_series_table_metadata"; let batch = sql_and_concat(&self.session_context, sql).await?; let mut time_series_table_metadata: Vec> = vec![]; @@ -992,7 +992,7 @@ impl DataFolder { table_name: &str, ) -> Result { let sql = format!( - "SELECT table_name, query_schema FROM time_series_table_metadata WHERE table_name = '{table_name}'" + "SELECT table_name, query_schema FROM metadata.time_series_table_metadata WHERE table_name = '{table_name}'" ); let batch = sql_and_concat(&self.session_context, &sql).await?; @@ -1060,7 +1060,7 @@ impl DataFolder { ) -> Result> { let sql = format!( "SELECT column_index, error_bound_value, error_bound_is_relative - FROM time_series_table_field_columns + FROM metadata.time_series_table_field_columns WHERE table_name = '{table_name}' ORDER BY column_index" ); @@ -1100,7 +1100,7 @@ impl DataFolder { ) -> Result>> { let sql = format!( "SELECT column_index, generated_column_expr - FROM time_series_table_field_columns + FROM metadata.time_series_table_field_columns WHERE table_name = '{table_name}' ORDER BY column_index" ); @@ -1305,7 +1305,7 @@ mod tests { assert!( data_folder .session_context - .sql("SELECT table_name FROM normal_table_metadata") + .sql("SELECT table_name FROM metadata.normal_table_metadata") .await .is_ok() ); @@ -1313,7 +1313,7 @@ mod tests { assert!( data_folder .session_context - .sql("SELECT table_name, query_schema FROM time_series_table_metadata") + .sql("SELECT table_name, query_schema FROM metadata.time_series_table_metadata") .await .is_ok() ); @@ -1321,7 +1321,7 @@ mod tests { assert!(data_folder .session_context .sql("SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ - generated_column_expr FROM time_series_table_field_columns") + generated_column_expr FROM metadata.time_series_table_field_columns") .await .is_ok()); } @@ -1407,7 +1407,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; // Retrieve the normal table from the metadata Delta Lake. - let sql = "SELECT table_name FROM normal_table_metadata ORDER BY table_name"; + let sql = "SELECT table_name FROM metadata.normal_table_metadata ORDER BY table_name"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1423,7 +1423,7 @@ mod tests { let (_temp_dir, data_folder) = create_data_folder_and_save_time_series_table().await; // Check that a row has been added to the time_series_table_metadata table. - let sql = "SELECT table_name, query_schema FROM time_series_table_metadata"; + let sql = "SELECT table_name, query_schema FROM metadata.time_series_table_metadata"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1442,7 +1442,7 @@ mod tests { // Check that a row has been added to the time_series_table_field_columns table for each field column. let sql = "SELECT table_name, column_name, column_index, error_bound_value, error_bound_is_relative, \ - generated_column_expr FROM time_series_table_field_columns ORDER BY column_name"; + generated_column_expr FROM metadata.time_series_table_field_columns ORDER BY column_name"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1477,7 +1477,7 @@ mod tests { .unwrap(); // Verify that normal_table_2 was deleted from the normal_table_metadata table. 
- let sql = "SELECT table_name FROM normal_table_metadata"; + let sql = "SELECT table_name FROM metadata.normal_table_metadata"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1495,7 +1495,7 @@ mod tests { .unwrap(); // Verify that the time series table was deleted from the time_series_table_metadata table. - let sql = "SELECT table_name FROM time_series_table_metadata"; + let sql = "SELECT table_name FROM metadata.time_series_table_metadata"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); @@ -1503,7 +1503,7 @@ mod tests { assert_eq!(batch.num_rows(), 0); // Verify that the field columns were deleted from the time_series_table_field_columns table. - let sql = "SELECT table_name FROM time_series_table_field_columns"; + let sql = "SELECT table_name FROM metadata.time_series_table_field_columns"; let batch = sql_and_concat(&data_folder.session_context, sql) .await .unwrap(); diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index 13c0e7c55..cb235db66 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -30,7 +30,7 @@ use arrow::compute; use arrow::compute::concat_batches; use arrow::datatypes::Schema; use bytes::Bytes; -use datafusion::catalog::TableProvider; +use datafusion::catalog::{MemorySchemaProvider, TableProvider}; use datafusion::datasource::sink::DataSink; use datafusion::execution::SendableRecordBatchStream; use datafusion::execution::session_state::SessionStateBuilder; @@ -44,6 +44,7 @@ use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties} use datafusion::parquet::format::SortingColumn; use datafusion::prelude::SessionContext; use datafusion::sql::parser::Statement as DFStatement; +use datafusion::sql::TableReference; use deltalake::DeltaTable; use futures::StreamExt; use modelardb_types::types::TimeSeriesTableMetadata; @@ -77,7 +78,13 @@ pub fn create_session_context() -> SessionContext { } let session_state = session_state_builder.build(); - SessionContext::new_with_state(session_state) + let session_context = SessionContext::new_with_state(session_state); + let default_catalog = session_context.catalog("datafusion") + .expect("The datafusion catalog should always exist."); + default_catalog.register_schema("metadata", Arc::new(MemorySchemaProvider::new())) + .expect("Catalog register schema should never fail."); + + session_context } /// Register the metadata table stored in `delta_table` with `table_name` in `session_context`. 
If @@ -88,8 +95,9 @@ pub fn register_metadata_table( table_name: &str, delta_table: DeltaTable, ) -> Result<()> { + let table_reference = TableReference::partial("metadata", table_name); let metadata_table = Arc::new(MetadataTable::new(delta_table)); - session_context.register_table(table_name, metadata_table)?; + session_context.register_table(table_reference, metadata_table)?; Ok(()) } From 583a6e9e1336f0ef039a936137ddce9e8d29f39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 14:58:02 +0000 Subject: [PATCH 22/31] Remove TableMetadataManager from documentation --- crates/modelardb_storage/src/query/sorted_join_exec.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/modelardb_storage/src/query/sorted_join_exec.rs b/crates/modelardb_storage/src/query/sorted_join_exec.rs index 145432a7e..890778ef3 100644 --- a/crates/modelardb_storage/src/query/sorted_join_exec.rs +++ b/crates/modelardb_storage/src/query/sorted_join_exec.rs @@ -14,11 +14,9 @@ */ //! Implementation of the Apache DataFusion execution plan [`SortedJoinExec`] and its corresponding -//! stream [`SortedJoinStream`] which joins multiple sorted array produced by -//! [`GridExecs`](crate::query::grid_exec::GridExec) streams and combines them with the time series -//! tags retrieved from the [`TableMetadataManager`](metadata::table_metadata_manager::TableMetadataManager) -//! to create the complete results containing a timestamp column, one or more field columns, and zero -//! or more tag columns. +//! stream [`SortedJoinStream`] which joins multiple sorted arrays produced by +//! [`GridExecs`](crate::query::grid_exec::GridExec) streams to create the complete results +//! containing a timestamp column, one or more field columns, and zero or more tag columns. use std::any::Any; use std::fmt::{Formatter, Result as FmtResult}; From 0468f0649f215bad9c1ac806cffbff3ce50ebc11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 15:49:35 +0000 Subject: [PATCH 23/31] Fix issues found by copilot code review --- .../modelardb_compression/src/compression.rs | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 4abb9984c..fe8c2481a 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -41,11 +41,11 @@ const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; /// resulting segments. pub fn try_compress_multivariate_record_batch( time_series_table_metadata: &TimeSeriesTableMetadata, - uncompressed_time_seires: &RecordBatch, + uncompressed_time_series: &RecordBatch, ) -> Result> { // Sort by all tags and then time to simplify splitting the data into time series. let sorted_uncompressed_data = - sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_time_seires)?; + sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_time_series)?; // Split the sorted uncompressed data into time series and compress them separately. 
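    // Editorial worked example (not part of the patch): with a single tag column, rows sorted by
    // (tag, timestamp) such as [("a", 1), ("a", 2), ("b", 1), ("b", 3)] produce two slices, rows
    // 0..2 for tag "a" and rows 2..4 for tag "b". The loop below detects each change in the tag
    // values, slices the sorted batch for the finished time series, compresses that slice on its
    // own, and finally compresses the remaining slice after the loop.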
let mut compressed_data = vec![]; @@ -438,20 +438,6 @@ mod tests { #[test] fn test_try_compress_regular_constant_time_series_within_lossless_error_bound() { - let compressed_record_batch = try_compress_univariate_arrays( - &TimestampBuilder::new().finish(), - &ValueBuilder::new().finish(), - ErrorBound::Lossless, - compressed_schema(), - vec![TAG_VALUE.to_owned()], - 0, - ) - .unwrap(); - assert_eq!(0, compressed_record_batch.num_rows()); - } - - #[test] - fn test_try_compress_regular_constant_time_series_within_losless_error_bound() { generate_compress_and_assert_known_segment( false, ValuesStructure::Constant(None), From 1fd29ac4f7e668e1ed341ca3a01734b80d6c4a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 15:57:36 +0000 Subject: [PATCH 24/31] Fix compile error after rebasing on main --- crates/modelardb_manager/src/remote.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 02cc037f8..7aa73f4b8 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -629,10 +629,7 @@ impl FlightService for FlightServiceHandler { .expect("key should not contain invalid characters.") .to_owned(), storage_configuration: Some( - self.context - .remote_data_folder - .storage_configuration - .clone(), + self.context.remote_storage_configuration.clone(), ), }; From dbc129a5a190ce6ba21ff79946f19bd38906874c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Tue, 28 Oct 2025 15:59:09 +0000 Subject: [PATCH 25/31] Format all package using cargo fmt --- crates/modelardb_bulkloader/src/main.rs | 20 +++- .../modelardb_compression/src/compression.rs | 16 +-- crates/modelardb_compression/src/lib.rs | 2 +- crates/modelardb_embedded/src/capi.rs | 2 +- crates/modelardb_embedded/src/error.rs | 4 +- .../src/operations/data_folder.rs | 104 +++++++++--------- crates/modelardb_manager/src/main.rs | 12 +- crates/modelardb_manager/src/metadata.rs | 4 +- crates/modelardb_manager/src/remote.rs | 4 +- crates/modelardb_server/src/data_folders.rs | 9 +- crates/modelardb_server/src/manager.rs | 20 +--- crates/modelardb_server/src/remote.rs | 12 +- .../src/storage/data_transfer.rs | 22 +--- crates/modelardb_storage/src/lib.rs | 8 +- .../src/optimizer/model_simple_aggregates.rs | 4 +- 15 files changed, 114 insertions(+), 129 deletions(-) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index c4da345b1..a2a5faa52 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -167,8 +167,9 @@ async fn import( data_folder.read(sql).await?; } - if let Some(time_series_table_metadata) = - data_folder.time_series_table_metadata_for_registered_time_series_table(table_name).await + if let Some(time_series_table_metadata) = data_folder + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await { import_time_series_table( input_stream, @@ -397,7 +398,9 @@ async fn import_and_clear_time_series_table_batch( let schema = current_batch[0].schema(); let uncompressed_data = compute::concat_batches(&schema, &*current_batch)?; let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( - time_series_table_metadata, &uncompressed_data)?; + time_series_table_metadata, + &uncompressed_data, + )?; delta_table_writer.write_all(&compressed_data).await?; current_batch.clear(); *current_batch_size = 0; @@ 
-496,17 +499,22 @@ async fn create_data_folder(data_folder_path: &str) -> Result { access_key_id, secret_access_key, ) - .await.map_err(|error| error.into()) + .await + .map_err(|error| error.into()) } Some(("az", container_name)) => { let account_name = env::var("AZURE_STORAGE_ACCOUNT_NAME")?; let access_key = env::var("AZURE_STORAGE_ACCESS_KEY")?; - DataFolder::open_azure(account_name, access_key, container_name.to_owned()).await.map_err(|error| error.into()) + DataFolder::open_azure(account_name, access_key, container_name.to_owned()) + .await + .map_err(|error| error.into()) } _ => { let data_folder_path = StdPath::new(data_folder_path); - DataFolder::open_local(data_folder_path).await.map_err(|error| error.into()) + DataFolder::open_local(data_folder_path) + .await + .map_err(|error| error.into()) } } } diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index fe8c2481a..dfbdde62f 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -45,7 +45,7 @@ pub fn try_compress_multivariate_record_batch( ) -> Result> { // Sort by all tags and then time to simplify splitting the data into time series. let sorted_uncompressed_data = - sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_time_series)?; + sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_time_series)?; // Split the sorted uncompressed data into time series and compress them separately. let mut compressed_data = vec![]; @@ -69,13 +69,13 @@ pub fn try_compress_multivariate_record_batch( let mut is_new_time_series = false; for tag_column_index in 0..tag_column_arrays.len() { is_new_time_series |= tag_values[tag_column_index] - != tag_column_arrays[tag_column_index].value(row_index); + != tag_column_arrays[tag_column_index].value(row_index); } if is_new_time_series { let time_series_length = row_index - row_index_start; let uncompressed_time_series = - sorted_uncompressed_data.slice(row_index_start, time_series_length); + sorted_uncompressed_data.slice(row_index_start, time_series_length); try_compress_univariate_record_batch( time_series_table_metadata, @@ -94,7 +94,7 @@ pub fn try_compress_multivariate_record_batch( let time_series_length = sorted_uncompressed_data.num_rows() - row_index_start; let uncompressed_time_series = - sorted_uncompressed_data.slice(row_index_start, time_series_length); + sorted_uncompressed_data.slice(row_index_start, time_series_length); try_compress_univariate_record_batch( time_series_table_metadata, @@ -134,7 +134,6 @@ fn sort_record_batch_by_tags_and_time( options: sort_options, }); - let indices = compute::lexsort_to_indices(&sort_columns, None)?; let sorted_columns = compute::take_arrays(uncompressed_data.columns(), &indices, None)?; RecordBatch::try_new(uncompressed_data.schema(), sorted_columns).map_err(|error| error.into()) @@ -157,11 +156,8 @@ pub fn try_compress_univariate_record_batch( ); for field_column_index in &time_series_table_metadata.field_column_indices { - let uncompressed_values = modelardb_types::array!( - uncompressed_time_series, - *field_column_index, - ValueArray - ); + let uncompressed_values = + modelardb_types::array!(uncompressed_time_series, *field_column_index, ValueArray); let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; diff --git a/crates/modelardb_compression/src/lib.rs b/crates/modelardb_compression/src/lib.rs index 8f179b04e..875c4e648 100644 --- 
a/crates/modelardb_compression/src/lib.rs +++ b/crates/modelardb_compression/src/lib.rs @@ -26,8 +26,8 @@ mod types; // Re-export the few functions and types users are meant to use. pub use compression::try_compress_multivariate_record_batch; -pub use compression::try_compress_univariate_record_batch; pub use compression::try_compress_univariate_arrays; +pub use compression::try_compress_univariate_record_batch; pub use models::grid; pub use models::is_value_within_error_bound; pub use models::len; diff --git a/crates/modelardb_embedded/src/capi.rs b/crates/modelardb_embedded/src/capi.rs index 26ea64271..1ae92e3f1 100644 --- a/crates/modelardb_embedded/src/capi.rs +++ b/crates/modelardb_embedded/src/capi.rs @@ -44,9 +44,9 @@ use modelardb_types::types::ErrorBound; use tokio::runtime::Runtime; use crate::error::{ModelarDbEmbeddedError, Result}; -use crate::operations::data_folder::DataFolderDataSink; use crate::operations::Operations; use crate::operations::client::{Client, Node}; +use crate::operations::data_folder::DataFolderDataSink; use crate::record_batch_stream_to_record_batch; use crate::{Aggregate, TableType}; diff --git a/crates/modelardb_embedded/src/error.rs b/crates/modelardb_embedded/src/error.rs index 8e84780d5..e0aff4c88 100644 --- a/crates/modelardb_embedded/src/error.rs +++ b/crates/modelardb_embedded/src/error.rs @@ -76,7 +76,9 @@ impl Display for ModelarDbEmbeddedError { Self::DeltaLake(reason) => write!(f, "Delta Lake Error: {reason}"), Self::EnvironmentVar(reason) => write!(f, "Environment Variable Error: {reason}"), Self::InvalidArgument(reason) => write!(f, "Invalid Argument Error: {reason}"), - Self::ModelarDbCompression(reason) => write!(f, "ModelarDB Compression Error: {reason}"), + Self::ModelarDbCompression(reason) => { + write!(f, "ModelarDB Compression Error: {reason}") + } Self::ModelarDbStorage(reason) => write!(f, "ModelarDB Storage Error: {reason}"), Self::ModelarDbTypes(reason) => write!(f, "ModelarDB Types Error: {reason}"), Self::ObjectStore(reason) => write!(f, "Object Store Error: {reason}"), diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 2faf63e4e..60d86dfa1 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -118,11 +118,9 @@ impl Operations for DataFolder { async fn create(&mut self, table_name: &str, table_type: TableType) -> Result<()> { match table_type { TableType::NormalTable(schema) => { - let delta_table = self.create_normal_table(table_name, &schema) - .await?; + let delta_table = self.create_normal_table(table_name, &schema).await?; - self.save_normal_table_metadata(table_name) - .await?; + self.save_normal_table_metadata(table_name).await?; let data_sink = Arc::new(DataFolderDataSink::new()); @@ -165,15 +163,15 @@ impl Operations for DataFolder { /// Returns the name of all the tables. If the table names could not be retrieved from the /// metadata Delta Lake, [`ModelarDbEmbeddedError`] is returned. async fn tables(&mut self) -> Result> { - self.table_names() - .await - .map_err(|error| error.into()) + self.table_names().await.map_err(|error| error.into()) } /// Returns the schema of the table with the name in `table_name`. If the table does not exist, /// [`ModelarDbEmbeddedError`] is returned. 
async fn schema(&mut self, table_name: &str) -> Result { - if let Some(time_series_table_metadata) = self.time_series_table_metadata_for_registered_time_series_table(table_name).await + if let Some(time_series_table_metadata) = self + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await { Ok((*time_series_table_metadata.query_schema).to_owned()) } else if let Some(normal_table_schema) = self.normal_table_schema(table_name).await { @@ -200,7 +198,9 @@ impl Operations for DataFolder { "The uncompressed data does not match the schema for the table: {table_name}." )); - if let Some(time_series_table_metadata) = self.time_series_table_metadata_for_registered_time_series_table(table_name).await + if let Some(time_series_table_metadata) = self + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await { // Time series table. if !schemas_are_compatible( @@ -211,7 +211,9 @@ impl Operations for DataFolder { } let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( - &time_series_table_metadata, &uncompressed_data)?; + &time_series_table_metadata, + &uncompressed_data, + )?; self.write_compressed_segments_to_time_series_table(table_name, compressed_data) .await?; @@ -301,8 +303,9 @@ impl Operations for DataFolder { tags: HashMap, ) -> Result>> { // DataFolder.read() interface is designed for time series tables. - let time_series_table_medata = if let Some(time_series_table_metadata) = - self.time_series_table_metadata_for_registered_time_series_table(table_name).await + let time_series_table_medata = if let Some(time_series_table_metadata) = self + .time_series_table_metadata_for_registered_time_series_table(table_name) + .await { time_series_table_metadata } else { @@ -433,7 +436,8 @@ impl Operations for DataFolder { )); if let (Some(source_time_series_table_metadata), Some(target_time_series_table_metadata)) = ( - self.time_series_table_metadata_for_registered_time_series_table(source_table_name).await, + self.time_series_table_metadata_for_registered_time_series_table(source_table_name) + .await, target_data_folder .time_series_table_metadata_for_registered_time_series_table(target_table_name) .await, @@ -572,7 +576,10 @@ fn schemas_are_compatible(source_schema: &Schema, target_schema: &Schema) -> boo mod tests { use super::*; - use arrow::array::{Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray}; + use arrow::array::{ + Array, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, + StringArray, + }; use arrow::compute::SortOptions; use arrow::datatypes::{ArrowPrimitiveType, DataType, Field}; use arrow_flight::flight_service_client::FlightServiceClient; @@ -582,7 +589,8 @@ mod tests { use datafusion::physical_plan::expressions::Column; use datafusion::physical_plan::sorts::sort; use modelardb_types::types::{ - ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, TimestampArray, ValueArray + ArrowTimestamp, ArrowValue, ErrorBound, GeneratedColumn, TimeSeriesTableMetadata, + TimestampArray, ValueArray, }; use tempfile::TempDir; use tonic::transport::Channel; @@ -626,7 +634,10 @@ mod tests { // Create a new data folder and verify that the existing normal tables are registered. 
let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); - new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); + new_data_folder + .register_normal_and_time_series_tables(data_sink) + .await + .unwrap(); assert!( new_data_folder .session_context() @@ -798,7 +809,10 @@ mod tests { // Create a new data folder and verify that the existing time series tables are registered. let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); - new_data_folder.register_normal_and_time_series_tables(data_sink).await.unwrap(); + new_data_folder + .register_normal_and_time_series_tables(data_sink) + .await + .unwrap(); assert!( new_data_folder .session_context() @@ -931,10 +945,7 @@ mod tests { #[tokio::test] async fn test_write_to_normal_table() { let (_temp_dir, mut data_folder) = create_data_folder_with_normal_table().await; - let mut delta_table = data_folder - .delta_table(NORMAL_TABLE_NAME) - .await - .unwrap(); + let mut delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_files_count(), 0); @@ -2059,12 +2070,7 @@ mod tests { ); // Verify that the normal table was dropped from the Delta Lake. - assert!( - data_folder - .delta_table(NORMAL_TABLE_NAME) - .await - .is_err() - ); + assert!(data_folder.delta_table(NORMAL_TABLE_NAME).await.is_err()); } #[tokio::test] @@ -2130,10 +2136,7 @@ mod tests { .await .unwrap(); - let mut delta_table = data_folder - .delta_table(NORMAL_TABLE_NAME) - .await - .unwrap(); + let mut delta_table = data_folder.delta_table(NORMAL_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_files_count(), 1); @@ -2281,10 +2284,7 @@ mod tests { .await .unwrap(); - let mut delta_table = source - .delta_table(NORMAL_TABLE_NAME) - .await - .unwrap(); + let mut delta_table = source.delta_table(NORMAL_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_files_count(), 1); @@ -2309,24 +2309,21 @@ mod tests { expected_schema: Schema, ) { // Verify that the normal table exists in the Delta Lake. - let delta_table = data_folder - .delta_table(table_name) - .await - .unwrap(); + let delta_table = data_folder.delta_table(table_name).await.unwrap(); let actual_schema = TableProvider::schema(&delta_table); assert_eq!(actual_schema, Arc::new(expected_schema)); // Verify that the normal table exists in the metadata Delta Lake. + assert!(data_folder.is_normal_table(table_name).await.unwrap()); + + // Verify that the normal table is registered with Apache DataFusion. assert!( data_folder - .is_normal_table(table_name) - .await + .session_context() + .table_exist(table_name) .unwrap() - ); - - // Verify that the normal table is registered with Apache DataFusion. 
- assert!(data_folder.session_context().table_exist(table_name).unwrap()) + ) } #[tokio::test] @@ -2447,10 +2444,7 @@ mod tests { .await .unwrap(); - let mut delta_table = source - .delta_table(TIME_SERIES_TABLE_NAME) - .await - .unwrap(); + let mut delta_table = source.delta_table(TIME_SERIES_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_files_count(), 2); @@ -2489,10 +2483,7 @@ mod tests { .await .unwrap(); - let mut delta_table = source - .delta_table(TIME_SERIES_TABLE_NAME) - .await - .unwrap(); + let mut delta_table = source.delta_table(TIME_SERIES_TABLE_NAME).await.unwrap(); assert_eq!(delta_table.get_files_count(), 2); @@ -2539,7 +2530,12 @@ mod tests { assert_eq!(*time_series_table_metadata.query_schema, expected_schema); // Verify that the time series table is registered with Apache DataFusion. - assert!(data_folder.session_context().table_exist(table_name).unwrap()); + assert!( + data_folder + .session_context() + .table_exist(table_name) + .unwrap() + ); time_series_table_metadata } diff --git a/crates/modelardb_manager/src/main.rs b/crates/modelardb_manager/src/main.rs index 1c4d1113b..8e0525f7d 100644 --- a/crates/modelardb_manager/src/main.rs +++ b/crates/modelardb_manager/src/main.rs @@ -69,10 +69,14 @@ async fn main() -> Result<()> { _ => print_usage_and_exit_with_error("remote_data_folder"), }; - let remote_storage_configuration = modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; - let remote_data_folder = DataFolder::open_object_store(remote_storage_configuration.clone()).await?; - - remote_data_folder.create_and_register_manager_metadata_data_folder_tables().await?; + let remote_storage_configuration = + modelardb_types::flight::argument_to_storage_configuration(remote_data_folder_str)?; + let remote_data_folder = + DataFolder::open_object_store(remote_storage_configuration.clone()).await?; + + remote_data_folder + .create_and_register_manager_metadata_data_folder_tables() + .await?; let mut cluster = Cluster::new(); let nodes = remote_data_folder.nodes().await?; diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index a26eafc5d..5bbb007be 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -295,9 +295,7 @@ mod tests { async fn create_data_folder() -> (TempDir, DataFolder) { let temp_dir = tempfile::tempdir().unwrap(); - let data_folder = DataFolder::open_local(temp_dir.path()) - .await - .unwrap(); + let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); data_folder .create_and_register_manager_metadata_data_folder_tables() diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 7aa73f4b8..853fd1b8d 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -628,9 +628,7 @@ impl FlightService for FlightServiceHandler { .to_str() .expect("key should not contain invalid characters.") .to_owned(), - storage_configuration: Some( - self.context.remote_storage_configuration.clone(), - ), + storage_configuration: Some(self.context.remote_storage_configuration.clone()), }; let protobuf_bytes = manager_metadata.encode_to_vec(); diff --git a/crates/modelardb_server/src/data_folders.rs b/crates/modelardb_server/src/data_folders.rs index 453a45396..f495eccc1 100644 --- a/crates/modelardb_server/src/data_folders.rs +++ b/crates/modelardb_server/src/data_folders.rs @@ -59,8 +59,7 @@ impl DataFolders { // Match the provided command line 
arguments to the supported inputs. match arguments { &["edge", local_data_folder_url] | &[local_data_folder_url] => { - let local_data_folder = - DataFolder::open_local_url(local_data_folder_url).await?; + let local_data_folder = DataFolder::open_local_url(local_data_folder_url).await?; Ok(( ClusterMode::SingleNode, @@ -71,8 +70,7 @@ impl DataFolders { let (manager, storage_configuration) = Manager::register_node(manager_url, ServerMode::Cloud).await?; - let local_data_folder = - DataFolder::open_local_url(local_data_folder_url).await?; + let local_data_folder = DataFolder::open_local_url(local_data_folder_url).await?; let remote_data_folder = DataFolder::open_object_store(storage_configuration).await?; @@ -91,8 +89,7 @@ impl DataFolders { let (manager, storage_configuration) = Manager::register_node(manager_url, ServerMode::Edge).await?; - let local_data_folder = - DataFolder::open_local_url(local_data_folder_url).await?; + let local_data_folder = DataFolder::open_local_url(local_data_folder_url).await?; let remote_data_folder = DataFolder::open_object_store(storage_configuration).await?; diff --git a/crates/modelardb_server/src/manager.rs b/crates/modelardb_server/src/manager.rs index 6a3b9c8e0..b1004af5c 100644 --- a/crates/modelardb_server/src/manager.rs +++ b/crates/modelardb_server/src/manager.rs @@ -96,9 +96,7 @@ impl Manager { /// retrieved from the remote data folder, or the tables could not be created, /// return [`ModelarDbServerError`]. pub(crate) async fn retrieve_and_create_tables(&self, context: &Arc) -> Result<()> { - let local_data_folder = &context - .data_folders - .local_data_folder; + let local_data_folder = &context.data_folders.local_data_folder; let remote_data_folder = &context .data_folders @@ -181,12 +179,8 @@ async fn validate_local_tables_exist_remotely( local_data_folder: &DataFolder, remote_data_folder: &DataFolder, ) -> Result<()> { - let local_table_names = local_data_folder - .table_names() - .await?; - let remote_table_names = remote_data_folder - .table_names() - .await?; + let local_table_names = local_data_folder.table_names().await?; + let remote_table_names = remote_data_folder.table_names().await?; let invalid_tables: Vec = local_table_names .iter() @@ -214,9 +208,7 @@ async fn validate_normal_tables( ) -> Result)>> { let mut missing_normal_tables = vec![]; - let remote_normal_tables = remote_data_folder - .normal_table_names() - .await?; + let remote_normal_tables = remote_data_folder.normal_table_names().await?; for table_name in remote_normal_tables { let remote_schema = normal_table_schema(remote_data_folder, &table_name).await?; @@ -253,9 +245,7 @@ async fn validate_time_series_tables( ) -> Result> { let mut missing_time_series_tables = vec![]; - let remote_time_series_tables = remote_data_folder - .time_series_table_names() - .await?; + let remote_time_series_tables = remote_data_folder.time_series_table_names().await?; for table_name in remote_time_series_tables { let remote_metadata = remote_data_folder diff --git a/crates/modelardb_server/src/remote.rs b/crates/modelardb_server/src/remote.rs index a4ff1c03b..96bdd206e 100644 --- a/crates/modelardb_server/src/remote.rs +++ b/crates/modelardb_server/src/remote.rs @@ -469,13 +469,21 @@ impl FlightService for FlightServiceHandler { Ok(empty_record_batch_stream()) } ModelarDbStatement::Statement(statement) => { - let session_context = self.context.data_folders.query_data_folder.session_context(); + let session_context = self + .context + .data_folders + .query_data_folder + .session_context(); 
modelardb_storage::execute_statement(session_context, statement) .await .map_err(|error| error.into()) } ModelarDbStatement::IncludeSelect(statement, addresses) => { - let session_context = self.context.data_folders.query_data_folder.session_context(); + let session_context = self + .context + .data_folders + .query_data_folder + .session_context(); let local_sendable_record_batch_stream = modelardb_storage::execute_statement(session_context, statement) .await diff --git a/crates/modelardb_server/src/storage/data_transfer.rs b/crates/modelardb_server/src/storage/data_transfer.rs index 4e21e96d1..08b9916e5 100644 --- a/crates/modelardb_server/src/storage/data_transfer.rs +++ b/crates/modelardb_server/src/storage/data_transfer.rs @@ -67,9 +67,7 @@ impl DataTransfer { // The size of tables is computed manually as datafusion_table_statistics() is not exact. let table_size_in_bytes = DashMap::with_capacity(table_names.len()); for table_name in table_names { - let delta_table = local_data_folder - .delta_table(&table_name) - .await?; + let delta_table = local_data_folder.delta_table(&table_name).await?; let mut table_size_in_bytes = table_size_in_bytes.entry(table_name).or_insert(0); @@ -237,10 +235,7 @@ impl DataTransfer { .expect("table_size_in_bytes should contain table_name since the table contains data.") .value(); - let local_delta_ops = self - .local_data_folder - .delta_ops(table_name) - .await?; + let local_delta_ops = self.local_data_folder.delta_ops(table_name).await?; // Read the data that is currently stored for the table with table_name. let (_table, stream) = local_delta_ops.load().await?; @@ -264,9 +259,7 @@ impl DataTransfer { } // Delete the data that has been transferred to the remote Delta Lake. - self.local_data_folder - .truncate_table(table_name) - .await?; + self.local_data_folder.truncate_table(table_name).await?; // Remove the transferred data from the in-memory tracking of compressed files. *self.table_size_in_bytes.get_mut(table_name).unwrap() -= current_size_in_bytes; @@ -534,10 +527,7 @@ mod tests { /// Return the total size of the files in the table with `table_name` in `local_data_folder`. async fn table_files_size(local_data_folder: &DataFolder, table_name: &str) -> u64 { - let delta_table = local_data_folder - .delta_table(table_name) - .await - .unwrap(); + let delta_table = local_data_folder.delta_table(table_name).await.unwrap(); let mut files_size = 0; for file_path in delta_table.get_files_iter().unwrap() { @@ -554,9 +544,7 @@ mod tests { ) -> (TempDir, DataTransfer) { let target_dir = tempfile::tempdir().unwrap(); let target_dir_url = target_dir.path().to_str().unwrap(); - let remote_data_folder = DataFolder::open_local_url(target_dir_url) - .await - .unwrap(); + let remote_data_folder = DataFolder::open_local_url(target_dir_url).await.unwrap(); // Set the transfer batch size so that data is transferred if three batches are written. 
let data_transfer = DataTransfer::try_new( diff --git a/crates/modelardb_storage/src/lib.rs b/crates/modelardb_storage/src/lib.rs index cb235db66..67f78167f 100644 --- a/crates/modelardb_storage/src/lib.rs +++ b/crates/modelardb_storage/src/lib.rs @@ -43,8 +43,8 @@ use datafusion::parquet::errors::ParquetError; use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; use datafusion::parquet::format::SortingColumn; use datafusion::prelude::SessionContext; -use datafusion::sql::parser::Statement as DFStatement; use datafusion::sql::TableReference; +use datafusion::sql::parser::Statement as DFStatement; use deltalake::DeltaTable; use futures::StreamExt; use modelardb_types::types::TimeSeriesTableMetadata; @@ -79,9 +79,11 @@ pub fn create_session_context() -> SessionContext { let session_state = session_state_builder.build(); let session_context = SessionContext::new_with_state(session_state); - let default_catalog = session_context.catalog("datafusion") + let default_catalog = session_context + .catalog("datafusion") .expect("The datafusion catalog should always exist."); - default_catalog.register_schema("metadata", Arc::new(MemorySchemaProvider::new())) + default_catalog + .register_schema("metadata", Arc::new(MemorySchemaProvider::new())) .expect("Catalog register schema should never fail."); session_context diff --git a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs index b08b1d05d..8fe12f04e 100644 --- a/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs +++ b/crates/modelardb_storage/src/optimizer/model_simple_aggregates.rs @@ -774,9 +774,7 @@ mod tests { ) -> Arc { // Setup access to data and metadata in data folder. let data_folder_path = temp_dir.path(); - let data_folder = DataFolder::open_local(data_folder_path) - .await - .unwrap(); + let data_folder = DataFolder::open_local(data_folder_path).await.unwrap(); // Setup access to Apache DataFusion. 
let session_context = crate::create_session_context(); From 5e4bb3311cc1152f876bf6cb4485c353f13f8c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 30 Oct 2025 05:55:39 +0000 Subject: [PATCH 26/31] Update based on comments from @CGodiksen --- crates/modelardb_bulkloader/src/main.rs | 2 +- .../modelardb_compression/src/compression.rs | 48 ++++---- crates/modelardb_compression/src/lib.rs | 6 +- .../modelardb_compression/src/models/swing.rs | 2 +- crates/modelardb_embedded/src/capi.rs | 42 +++---- .../src/operations/data_folder.rs | 27 +++-- crates/modelardb_manager/src/metadata.rs | 16 +-- crates/modelardb_manager/src/remote.rs | 44 ++++---- crates/modelardb_server/src/context.rs | 43 ++++---- crates/modelardb_server/src/main.rs | 10 +- crates/modelardb_server/src/manager.rs | 2 +- .../src/storage/compressed_data_manager.rs | 2 +- .../src/storage/uncompressed_data_manager.rs | 8 +- crates/modelardb_storage/src/data_folder.rs | 103 +++++++++--------- .../src/query/normal_table.rs | 2 +- .../src/query/time_series_table.rs | 2 +- 16 files changed, 178 insertions(+), 181 deletions(-) diff --git a/crates/modelardb_bulkloader/src/main.rs b/crates/modelardb_bulkloader/src/main.rs index a2a5faa52..e39746d01 100644 --- a/crates/modelardb_bulkloader/src/main.rs +++ b/crates/modelardb_bulkloader/src/main.rs @@ -397,7 +397,7 @@ async fn import_and_clear_time_series_table_batch( if *current_batch_size != 0 { let schema = current_batch[0].schema(); let uncompressed_data = compute::concat_batches(&schema, &*current_batch)?; - let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( + let compressed_data = modelardb_compression::try_compress_multivariate_time_series( time_series_table_metadata, &uncompressed_data, )?; diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index dfbdde62f..280d1a5bb 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -37,15 +37,15 @@ use crate::types::{CompressedSegmentBatchBuilder, CompressedSegmentBuilder, Mode /// that are marked as residuals are stored as separate segments to allow for efficient pruning. const RESIDUAL_VALUES_MAX_LENGTH: u8 = 255; -/// Compress the `uncompressed_data` from the table with `time_series_table_metadata` and return the -/// resulting segments. -pub fn try_compress_multivariate_record_batch( +/// Compress the `uncompressed_time_series` from the table with `time_series_table_metadata` and +/// return the resulting segments. +pub fn try_compress_multivariate_time_series( time_series_table_metadata: &TimeSeriesTableMetadata, uncompressed_time_series: &RecordBatch, ) -> Result> { // Sort by all tags and then time to simplify splitting the data into time series. let sorted_uncompressed_data = - sort_record_batch_by_tags_and_time(time_series_table_metadata, uncompressed_time_series)?; + sort_time_series_by_tags_and_time(time_series_table_metadata, uncompressed_time_series)?; // Split the sorted uncompressed data into time series and compress them separately. 
let mut compressed_data = vec![]; @@ -77,7 +77,7 @@ pub fn try_compress_multivariate_record_batch( let uncompressed_time_series = sorted_uncompressed_data.slice(row_index_start, time_series_length); - try_compress_univariate_record_batch( + try_split_and_compress_univariate_time_series( time_series_table_metadata, &uncompressed_time_series, &tag_values, @@ -96,7 +96,7 @@ pub fn try_compress_multivariate_record_batch( let uncompressed_time_series = sorted_uncompressed_data.slice(row_index_start, time_series_length); - try_compress_univariate_record_batch( + try_split_and_compress_univariate_time_series( time_series_table_metadata, &uncompressed_time_series, &tag_values, @@ -108,9 +108,9 @@ pub fn try_compress_multivariate_record_batch( /// Sort the `uncompressed_data` from the time series table with `time_series_table_metadata` /// according to its tags and then timestamps. -fn sort_record_batch_by_tags_and_time( +fn sort_time_series_by_tags_and_time( time_series_table_metadata: &TimeSeriesTableMetadata, - uncompressed_data: &RecordBatch, + uncompressed_time_series: &RecordBatch, ) -> Result { let mut sort_columns = vec![]; @@ -120,7 +120,7 @@ fn sort_record_batch_by_tags_and_time( }); for tag_column_index in &time_series_table_metadata.tag_column_indices { - let tag_column = uncompressed_data.column(*tag_column_index); + let tag_column = uncompressed_time_series.column(*tag_column_index); sort_columns.push(SortColumn { values: (*tag_column).clone(), options: sort_options, @@ -128,26 +128,26 @@ fn sort_record_batch_by_tags_and_time( } let timestamp_column_index = time_series_table_metadata.timestamp_column_index; - let timestamp_column = uncompressed_data.column(timestamp_column_index); + let timestamp_column = uncompressed_time_series.column(timestamp_column_index); sort_columns.push(SortColumn { values: (*timestamp_column).clone(), options: sort_options, }); let indices = compute::lexsort_to_indices(&sort_columns, None)?; - let sorted_columns = compute::take_arrays(uncompressed_data.columns(), &indices, None)?; - RecordBatch::try_new(uncompressed_data.schema(), sorted_columns).map_err(|error| error.into()) + let sorted_columns = compute::take_arrays(uncompressed_time_series.columns(), &indices, None)?; + RecordBatch::try_new(uncompressed_time_series.schema(), sorted_columns).map_err(|error| error.into()) } /// Compress the field columns in `uncompressed_time_series` from the table with -/// `time_series_table_metadata` using [`try_compress_univariate_arrays`] and append the result to -/// `compressed_data`. It is assumed that all data points in `uncompressed_time_series` have the +/// `time_series_table_metadata` using [`try_compress_univariate_time_series`] and append the result +/// to `compressed_data`. It is assumed that all data points in `uncompressed_time_series` have the /// same tags as in `tag_values`. 
-pub fn try_compress_univariate_record_batch( +pub fn try_split_and_compress_univariate_time_series( time_series_table_metadata: &TimeSeriesTableMetadata, uncompressed_time_series: &RecordBatch, tag_values: &[String], - compressed_data: &mut Vec, + compressed_time_series: &mut Vec, ) -> Result<()> { let uncompressed_timestamps = modelardb_types::array!( uncompressed_time_series, @@ -161,7 +161,7 @@ pub fn try_compress_univariate_record_batch( let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; - let compressed_time_series = try_compress_univariate_arrays( + let compressed_time_series = try_compress_univariate_time_series( uncompressed_timestamps, uncompressed_values, error_bound, @@ -171,7 +171,7 @@ pub fn try_compress_univariate_record_batch( ) .expect("uncompressed_timestamps and uncompressed_values should have the same length."); - compressed_data.push(compressed_time_series); + compressed_time_series.push(compressed_time_series); } Ok(()) @@ -187,7 +187,7 @@ pub fn try_compress_univariate_record_batch( /// `uncompressed_values` have different lengths or if `compressed_schema` is not a valid schema for /// compressed segments, otherwise the resulting compressed segments are returned as a /// [`RecordBatch`] with the `compressed_schema` schema. -pub fn try_compress_univariate_arrays( +pub fn try_compress_univariate_time_series( uncompressed_timestamps: &TimestampArray, uncompressed_values: &ValueArray, error_bound: ErrorBound, @@ -417,10 +417,10 @@ mod tests { const ADD_NOISE_RANGE: Option> = Some(1.0..1.05); const TRY_COMPRESS_TEST_LENGTH: usize = 50; - // Tests for try_compress(). + // Tests for try_compress_univariate_time_series(). #[test] fn test_try_compress_empty_time_series_within_lossless_error_bound() { - let compressed_record_batch = try_compress_univariate_arrays( + let compressed_record_batch = try_compress_univariate_time_series( &TimestampBuilder::new().finish(), &ValueBuilder::new().finish(), ErrorBound::Lossless, @@ -582,7 +582,7 @@ mod tests { let uncompressed_values = data_generation::generate_values(uncompressed_timestamps.values(), values_structure); - let compressed_record_batch = try_compress_univariate_arrays( + let compressed_record_batch = try_compress_univariate_time_series( &uncompressed_timestamps, &uncompressed_values, error_bound, @@ -686,7 +686,7 @@ mod tests { let uncompressed_values = uncompressed_values.finish(); assert_eq!(uncompressed_timestamps.len(), uncompressed_values.len()); - let compressed_record_batch = try_compress_univariate_arrays( + let compressed_record_batch = try_compress_univariate_time_series( &uncompressed_timestamps, &uncompressed_values, error_bound, @@ -843,7 +843,7 @@ mod tests { 100.0..200.0, ); - let compressed_record_batch = try_compress_univariate_arrays( + let compressed_record_batch = try_compress_univariate_time_series( &uncompressed_timestamps, &uncompressed_values, error_bound, diff --git a/crates/modelardb_compression/src/lib.rs b/crates/modelardb_compression/src/lib.rs index 875c4e648..8b96c60c2 100644 --- a/crates/modelardb_compression/src/lib.rs +++ b/crates/modelardb_compression/src/lib.rs @@ -25,9 +25,9 @@ mod models; mod types; // Re-export the few functions and types users are meant to use. 
-pub use compression::try_compress_multivariate_record_batch; -pub use compression::try_compress_univariate_arrays; -pub use compression::try_compress_univariate_record_batch; +pub use compression::try_compress_multivariate_time_series; +pub use compression::try_compress_univariate_time_series; +pub use compression::try_split_and_compress_univariate_time_series; pub use models::grid; pub use models::is_value_within_error_bound; pub use models::len; diff --git a/crates/modelardb_compression/src/models/swing.rs b/crates/modelardb_compression/src/models/swing.rs index 4627ad2b4..88ec217e7 100644 --- a/crates/modelardb_compression/src/models/swing.rs +++ b/crates/modelardb_compression/src/models/swing.rs @@ -749,7 +749,7 @@ mod tests { compressed_schema_fields.push(Arc::new(Field::new("tag", DataType::Utf8, false))); let compressed_schema = Arc::new(Schema::new(compressed_schema_fields)); - let segments = crate::try_compress_univariate_arrays( + let segments = crate::try_compress_univariate_time_series( ×tamps, &values, error_bound, diff --git a/crates/modelardb_embedded/src/capi.rs b/crates/modelardb_embedded/src/capi.rs index 1ae92e3f1..911bc24e8 100644 --- a/crates/modelardb_embedded/src/capi.rs +++ b/crates/modelardb_embedded/src/capi.rs @@ -87,11 +87,11 @@ pub unsafe extern "C" fn modelardb_embedded_open_memory() -> *const c_void { set_error_and_return_value_ptr(maybe_data_folder) } -/// See documentation for [`modelardb_embedded_open_memory`]. +/// See documentation for [`modelardb_embedded_open_memory()`]. fn open_memory() -> Result { let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_memory())?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + TOKIO_RUNTIME.block_on(data_folder.register_tables(data_sink))?; Ok(data_folder) } @@ -106,14 +106,14 @@ pub unsafe extern "C" fn modelardb_embedded_open_local( set_error_and_return_value_ptr(maybe_data_folder) } -/// See documentation for [`modelardb_embedded_open_local`]. +/// See documentation for [`modelardb_embedded_open_local()`]. unsafe fn open_local(data_folder_path_ptr: *const c_char) -> Result { let data_folder_str = unsafe { c_char_ptr_to_str(data_folder_path_ptr)? }; let data_folder_path = StdPath::new(data_folder_str); let data_folder = TOKIO_RUNTIME.block_on(DataFolder::open_local(data_folder_path))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + TOKIO_RUNTIME.block_on(data_folder.register_tables(data_sink))?; Ok(data_folder) } @@ -140,7 +140,7 @@ pub unsafe extern "C" fn modelardb_embedded_open_s3( set_error_and_return_value_ptr(maybe_data_folder) } -/// See documentation for [`modelardb_embedded_open_s3`]. +/// See documentation for [`modelardb_embedded_open_s3()`]. unsafe fn open_s3( endpoint_ptr: *const c_char, bucket_name_ptr: *const c_char, @@ -159,7 +159,7 @@ unsafe fn open_s3( secret_access_key.to_owned(), ))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + TOKIO_RUNTIME.block_on(data_folder.register_tables(data_sink))?; Ok(data_folder) } @@ -177,7 +177,7 @@ pub unsafe extern "C" fn modelardb_embedded_open_azure( set_error_and_return_value_ptr(maybe_data_folder) } -/// See documentation for [`modelardb_embedded_open_azure`]. +/// See documentation for [`modelardb_embedded_open_azure()`]. 
unsafe fn open_azure( account_name_ptr: *const c_char, access_key_ptr: *const c_char, @@ -193,7 +193,7 @@ unsafe fn open_azure( container_name.to_owned(), ))?; let data_sink = Arc::new(DataFolderDataSink::new()); - TOKIO_RUNTIME.block_on(data_folder.register_normal_and_time_series_tables(data_sink))?; + TOKIO_RUNTIME.block_on(data_folder.register_tables(data_sink))?; Ok(data_folder) } @@ -209,7 +209,7 @@ pub unsafe extern "C" fn modelardb_embedded_connect( set_error_and_return_value_ptr(maybe_client) } -/// See documentation for [`modelardb_embedded_connect`]. +/// See documentation for [`modelardb_embedded_connect()`]. unsafe fn connect(node_url_ptr: *const c_char, is_server_node: bool) -> Result { let node_url_str = unsafe { c_char_ptr_to_str(node_url_ptr)? }; @@ -305,7 +305,7 @@ pub unsafe extern "C" fn modelardb_embedded_create( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_create`]. +/// See documentation for [`modelardb_embedded_create()`]. #[allow(clippy::too_many_arguments)] unsafe fn create( maybe_operations_ptr: *mut c_void, @@ -402,7 +402,7 @@ pub unsafe extern "C" fn modelardb_embedded_tables( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_tables`]. +/// See documentation for [`modelardb_embedded_tables()`]. unsafe fn tables( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -449,7 +449,7 @@ pub unsafe extern "C" fn modelardb_embedded_schema( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_schema`]. +/// See documentation for [`modelardb_embedded_schema()`]. unsafe fn schema( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -501,7 +501,7 @@ pub unsafe extern "C" fn modelardb_embedded_write( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_write`]. +/// See documentation for [`modelardb_embedded_write()`]. unsafe fn write( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -545,7 +545,7 @@ pub unsafe extern "C" fn modelardb_embedded_read( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_read`]. +/// See documentation for [`modelardb_embedded_read()`]. unsafe fn read( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -596,7 +596,7 @@ pub unsafe extern "C" fn modelardb_embedded_copy( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_copy`]. +/// See documentation for [`modelardb_embedded_copy()`]. unsafe fn copy( maybe_source_operations_ptr: *mut c_void, is_data_folder: bool, @@ -662,7 +662,7 @@ pub unsafe extern "C" fn modelardb_embedded_read_time_series_table( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_read_time_series_table`]. +/// See documentation for [`modelardb_embedded_read_time_series_table()`]. #[allow(clippy::too_many_arguments)] unsafe fn read_time_series_table( maybe_operations_ptr: *mut c_void, @@ -789,7 +789,7 @@ pub unsafe extern "C" fn modelardb_embedded_copy_time_series_table( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_copy_time_series_table`]. +/// See documentation for [`modelardb_embedded_copy_time_series_table()`]. #[allow(clippy::too_many_arguments)] unsafe fn copy_time_series_table( maybe_source_operations_ptr: *mut c_void, @@ -879,7 +879,7 @@ pub unsafe extern "C" fn modelardb_embedded_move( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_move`]. 
+/// See documentation for [`modelardb_embedded_move()`]. unsafe fn r#move( maybe_source_operations_ptr: *mut c_void, is_data_folder: bool, @@ -909,7 +909,7 @@ pub unsafe extern "C" fn modelardb_embedded_truncate( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_truncate`]. +/// See documentation for [`modelardb_embedded_truncate()`]. unsafe fn truncate( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -934,7 +934,7 @@ pub unsafe extern "C" fn modelardb_embedded_drop( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_drop`]. +/// See documentation for [`modelardb_embedded_drop()`]. unsafe fn drop( maybe_operations_ptr: *mut c_void, is_data_folder: bool, @@ -971,7 +971,7 @@ pub unsafe extern "C" fn modelardb_embedded_vacuum( set_error_and_return_code(maybe_unit) } -/// See documentation for [`modelardb_embedded_vacuum`]. +/// See documentation for [`modelardb_embedded_vacuum()`]. unsafe fn vacuum( maybe_operations_ptr: *mut c_void, is_data_folder: bool, diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 60d86dfa1..a31e28b29 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -160,8 +160,8 @@ impl Operations for DataFolder { Ok(()) } - /// Returns the name of all the tables. If the table names could not be retrieved from the - /// metadata Delta Lake, [`ModelarDbEmbeddedError`] is returned. + /// Returns the name of all the tables. If the table names could not be retrieved from the Delta + /// Lake, [`ModelarDbEmbeddedError`] is returned. async fn tables(&mut self) -> Result> { self.table_names().await.map_err(|error| error.into()) } @@ -210,7 +210,7 @@ impl Operations for DataFolder { return Err(schema_mismatch_error); } - let compressed_data = modelardb_compression::try_compress_multivariate_record_batch( + let compressed_data = modelardb_compression::try_compress_multivariate_time_series( &time_series_table_metadata, &uncompressed_data, )?; @@ -505,15 +505,14 @@ impl Operations for DataFolder { } /// Drop the table with the name in `table_name` by deregistering the table from the Apache - /// Arrow DataFusion session, deleting all the table files from the data Delta Lake, and - /// deleting the table metadata from the metadata Delta Lake. If the table could not be - /// deregistered or the metadata or data could not be dropped, [`ModelarDbEmbeddedError`] is - /// returned. + /// Arrow DataFusion session, deleting all the table files from the Delta Lake, and deleting the + /// table metadata from the Delta Lake. If the table could not be deregistered or the metadata + /// or data could not be dropped, [`ModelarDbEmbeddedError`] is returned. async fn drop(&mut self, table_name: &str) -> Result<()> { // Drop the table from the Apache Arrow DataFusion session. self.session_context().deregister_table(table_name)?; - // Delete the table metadata from the metadata Delta Lake. + // Delete the table metadata from the Delta Lake. self.drop_table_metadata(table_name).await?; // Drop the table from the Delta Lake. 
@@ -635,7 +634,7 @@ mod tests { let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); new_data_folder - .register_normal_and_time_series_tables(data_sink) + .register_tables(data_sink) .await .unwrap(); assert!( @@ -810,7 +809,7 @@ mod tests { let new_data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); let data_sink = Arc::new(DataFolderDataSink::new()); new_data_folder - .register_normal_and_time_series_tables(data_sink) + .register_tables(data_sink) .await .unwrap(); assert!( @@ -2061,7 +2060,7 @@ mod tests { .unwrap() ); - // Verify that the normal table was dropped from the metadata Delta Lake. + // Verify that the normal table was dropped from the Delta Lake. assert!( !data_folder .is_normal_table(NORMAL_TABLE_NAME) @@ -2094,7 +2093,7 @@ mod tests { .unwrap() ); - // Verify that the time series table was dropped from the metadata Delta Lake. + // Verify that the time series table was dropped from the Delta Lake. assert!( !data_folder .is_time_series_table(TIME_SERIES_TABLE_NAME) @@ -2314,7 +2313,7 @@ mod tests { let actual_schema = TableProvider::schema(&delta_table); assert_eq!(actual_schema, Arc::new(expected_schema)); - // Verify that the normal table exists in the metadata Delta Lake. + // Verify that the normal table exists in the Delta Lake. assert!(data_folder.is_normal_table(table_name).await.unwrap()); // Verify that the normal table is registered with Apache DataFusion. @@ -2520,7 +2519,7 @@ mod tests { // Verify that the time series table exists in the Delta Lake. assert!(data_folder.delta_table(table_name).await.is_ok()); - // Verify that the time series table exists in the metadata Delta Lake with the correct schema. + // Verify that the time series table exists in the Delta Lake with the correct schema. let time_series_table_metadata = data_folder .time_series_table_metadata_for_time_series_table(table_name) .await diff --git a/crates/modelardb_manager/src/metadata.rs b/crates/modelardb_manager/src/metadata.rs index 5bbb007be..00fdda038 100644 --- a/crates/modelardb_manager/src/metadata.rs +++ b/crates/modelardb_manager/src/metadata.rs @@ -13,8 +13,8 @@ * limitations under the License. */ -//! Management of the metadata Delta Lake for the manager. Metadata which is unique to the manager, -//! such as metadata about registered edges, is handled here. +//! Management of the Delta Lake for the manager. Metadata which is unique to the manager, such as +//! metadata about registered edges, is handled here. use std::str::FromStr; use std::sync::Arc; @@ -30,8 +30,8 @@ use uuid::Uuid; use crate::error::Result; -/// Stores the metadata required for reading from and writing to the normal tables and time series tables -/// and persisting edges. The data that needs to be persisted is stored in the metadata Delta Lake. +/// Stores the metadata required for reading from and writing to the normal tables and time series +/// tables and persisting edges. The data that needs to be persisted is stored in the Delta Lake. pub trait ManagerMetadata { async fn create_and_register_manager_metadata_data_folder_tables(&self) -> Result<()>; async fn manager_key(&self) -> Result; @@ -104,8 +104,8 @@ impl ManagerMetadata for DataFolder { } } - /// Save the node to the metadata Delta Lake and return [`Ok`]. If the node could not be saved, - /// return [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). + /// Save the node to the Delta Lake and return [`Ok`]. 
If the node could not be saved, return + /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError). async fn save_node(&self, node: Node) -> Result<()> { self.write_columns_to_metadata_table( "nodes", @@ -133,8 +133,8 @@ impl ManagerMetadata for DataFolder { Ok(()) } - /// Return the nodes currently controlled by the manager that have been persisted to the - /// metadata Delta Lake. If the nodes could not be retrieved, + /// Return the nodes currently controlled by the manager that have been persisted to the Delta + /// Lake. If the nodes could not be retrieved, /// [`ModelarDbManagerError`](crate::error::ModelarDbManagerError) is returned. async fn nodes(&self) -> Result> { let mut nodes: Vec = vec![]; diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 853fd1b8d..139e9a5b8 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -156,9 +156,9 @@ impl FlightServiceHandler { } } - /// Create a normal table, save it to the metadata Delta Lake and create it for each node - /// controlled by the manager. If the normal table cannot be saved to the metadata Delta Lake or - /// created for each node, return [`Status`]. + /// Create a normal table, save it to the Delta Lake and create it for each node controlled by + /// the manager. If the normal table cannot be saved to the Delta Lake or created for each node, + /// return [`Status`]. async fn save_and_create_cluster_normal_table( &self, table_name: &str, @@ -171,7 +171,7 @@ impl FlightServiceHandler { .await .map_err(error_to_status_internal)?; - // Persist the new normal table to the metadata Delta Lake. + // Persist the new normal table to the Delta Lake. self.context .remote_data_folder .save_normal_table_metadata(table_name) @@ -201,9 +201,9 @@ impl FlightServiceHandler { Ok(()) } - /// Create a time series table, save it to the metadata Delta Lake and create it for each node - /// controlled by the manager. If the time series table cannot be saved to the metadata Delta - /// Lake or created for each node, return [`Status`]. + /// Create a time series table, save it to the Delta Lake and create it for each node controlled + /// by the manager. If the time series table cannot be saved to the Delta Lake or created for + /// each node, return [`Status`]. async fn save_and_create_cluster_time_series_table( &self, time_series_table_metadata: Arc, @@ -215,7 +215,7 @@ impl FlightServiceHandler { .await .map_err(error_to_status_internal)?; - // Persist the new time series table to the metadata Delta Lake. + // Persist the new time series table to the Delta Lake. self.context .remote_data_folder .save_time_series_table_metadata(&time_series_table_metadata) @@ -250,19 +250,19 @@ impl FlightServiceHandler { Ok(()) } - /// Drop the table from the metadata Delta Lake, the data Delta Lake, and from each node - /// controlled by the manager. If the table does not exist or the table cannot be dropped from - /// the remote data folder and from each node, return [`Status`]. + /// Drop the table from the Delta Lake, the Delta Lake, and from each node controlled by the + /// manager. If the table does not exist or the table cannot be dropped from the remote data + /// folder and from each node, return [`Status`]. async fn drop_cluster_table(&self, table_name: &str) -> StdResult<(), Status> { - // Drop the table from the remote data folder metadata Delta Lake. This will return an error - // if the table does not exist. 
+ // Drop the table from the remote data folder Delta Lake. This will return an error if the + // table does not exist. self.context .remote_data_folder .drop_table_metadata(table_name) .await .map_err(error_to_status_internal)?; - // Drop the table from the remote data folder data Delta lake. + // Drop the table from the remote data folder Delta lake. self.context .remote_data_folder .drop_table(table_name) @@ -291,7 +291,7 @@ impl FlightServiceHandler { ))); } - // Truncate the table in the remote data folder data Delta lake. + // Truncate the table in the remote data folder Delta lake. self.context .remote_data_folder .truncate_table(table_name) @@ -375,7 +375,7 @@ impl FlightService for FlightServiceHandler { &self, _request: Request, ) -> StdResult, Status> { - // Retrieve the table names from the metadata Delta Lake. + // Retrieve the table names from the Delta Lake. let table_names = self .context .remote_data_folder @@ -612,9 +612,9 @@ impl FlightService for FlightServiceHandler { .register_node(node.clone()) .map_err(error_to_status_internal)?; - // Use the metadata manager to persist the node to the metadata Delta Lake. Note that if - // this fails, the metadata Delta Lake and the cluster will be out of sync until the - // manager is restarted. + // Use the metadata manager to persist the node to the Delta Lake. Note that if this + // fails, the Delta Lake and the cluster will be out of sync until the manager is + // restarted. self.context .remote_data_folder .save_node(node) @@ -643,15 +643,15 @@ impl FlightService for FlightServiceHandler { let node_metadata = protocol::NodeMetadata::decode(action.body) .map_err(error_to_status_invalid_argument)?; - // Remove the node with the given url from the metadata Delta Lake. + // Remove the node with the given url from the Delta Lake. self.context .remote_data_folder .remove_node(&node_metadata.url) .await .map_err(error_to_status_internal)?; - // Remove the node with the given url from the cluster and kill it. Note that if this fails, - // the cluster and metadata Delta Lake will be out of sync until the manager is restarted. + // Remove the node with the given url from the cluster and kill it. Note that if this + // fails, the cluster and Delta Lake will be out of sync until the manager is restarted. self.context .cluster .write() diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 2afccbf95..69d57d1b2 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -126,7 +126,7 @@ impl Context { self.register_time_series_table(Arc::new(time_series_table_metadata.clone())) .await?; - // Persist the new time series table to the metadata Delta Lake. + // Persist the new time series table to the Delta Lake. self.data_folders .local_data_folder .save_time_series_table_metadata(time_series_table_metadata) @@ -140,9 +140,9 @@ impl Context { Ok(()) } - /// For each normal table saved in the metadata Delta Lake, register the normal table in Apache - /// DataFusion. If the normal tables could not be retrieved from the metadata Delta Lake or a - /// normal table could not be registered, return [`ModelarDbServerError`]. + /// For each normal table saved in the Delta Lake, register the normal table in Apache + /// DataFusion. If the normal tables could not be retrieved from the Delta Lake or a normal + /// table could not be registered, return [`ModelarDbServerError`]. 
pub async fn register_normal_tables(&self) -> Result<()> { // We register the normal tables in the local data folder to avoid registering tables that // NormalTableDataSink cannot write data to. @@ -190,9 +190,9 @@ impl Context { Ok(()) } - /// For each time series table saved in the metadata Delta Lake, register the time series table - /// in Apache DataFusion. If the time series tables could not be retrieved from the metadata - /// Delta Lake or a time series table could not be registered, return [`ModelarDbServerError`]. + /// For each time series table saved in the Delta Lake, register the time series table in Apache + /// DataFusion. If the time series tables could not be retrieved from the Delta Lake or a time + /// series table could not be registered, return [`ModelarDbServerError`]. pub async fn register_time_series_tables(&self) -> Result<()> { // We register the time series tables in the local data folder to avoid registering tables // that TimeSeriesTableDataSink cannot write data to. @@ -246,9 +246,8 @@ impl Context { } /// Drop the table with `table_name` if it exists. The table is deregistered from the Apache - /// Arrow Datafusion session context and deleted from the storage engine, metadata Delta Lake, - /// and data Delta Lake. If the table does not exist or if it could not be dropped, - /// [`ModelarDbServerError`] is returned. + /// Arrow Datafusion session context and deleted from the storage engine and Delta Lake. If the + /// table does not exist or if it could not be dropped, [`ModelarDbServerError`] is returned. pub async fn drop_table(&self, table_name: &str) -> Result<()> { // Deregistering the table from the Apache DataFusion session context and deleting the table // from the storage engine does not require the table to exist, so the table is checked first. @@ -263,7 +262,7 @@ impl Context { self.drop_table_from_storage_engine(table_name).await?; - // Drop the table metadata from the metadata Delta Lake. + // Drop the table metadata from the Delta Lake. self.data_folders .local_data_folder .drop_table_metadata(table_name) @@ -279,8 +278,8 @@ impl Context { } /// Delete all data from the table with `table_name` if it exists. The table data is deleted - /// from the storage engine and data Delta Lake. If the table does not exist or if it could not - /// be truncated, [`ModelarDbServerError`] is returned. + /// from the storage engine and Delta Lake. If the table does not exist or if it could not be + /// truncated, [`ModelarDbServerError`] is returned. pub async fn truncate_table(&self, table_name: &str) -> Result<()> { // Deleting the table from the storage engine does not require the table to exist, so the // table is checked first. @@ -290,7 +289,7 @@ impl Context { self.drop_table_from_storage_engine(table_name).await?; - // Delete the table data from the data Delta Lake. + // Delete the table data from the Delta Lake. self.data_folders .local_data_folder .truncate_table(table_name) @@ -438,7 +437,7 @@ mod tests { assert!(folder_path.exists()); - // The normal table should be saved to the metadata Delta Lake. + // The normal table should be saved to the Delta Lake. assert!( context .data_folders @@ -486,7 +485,7 @@ mod tests { .await .unwrap(); - // The time series table should be saved to the metadata Delta Lake. + // The time series table should be saved to the Delta Lake. 
let time_series_table_metadata = context .data_folders .local_data_folder @@ -531,7 +530,7 @@ mod tests { async fn test_register_normal_tables() { // The test succeeds if none of the unwrap()s fails. - // Save a normal table to the metadata Delta Lake. + // Save a normal table to the Delta Lake. let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; @@ -551,7 +550,7 @@ mod tests { async fn test_register_time_series_tables() { // The test succeeds if none of the unwrap()s fails. - // Save a time series table to the metadata Delta Lake. + // Save a time series table to the Delta Lake. let temp_dir = tempfile::tempdir().unwrap(); let context = create_context(&temp_dir).await; @@ -589,7 +588,7 @@ mod tests { .is_ok() ); - // The normal table should be deleted from the metadata Delta Lake. + // The normal table should be deleted from the Delta Lake. assert!( !context .data_folders @@ -625,7 +624,7 @@ mod tests { .is_ok() ); - // The time series table should be deleted from the metadata Delta Lake. + // The time series table should be deleted from the Delta Lake. assert!( !context .data_folders @@ -667,7 +666,7 @@ mod tests { context.truncate_table(NORMAL_TABLE_NAME).await.unwrap(); - // The normal table should not be deleted from the metadata Delta Lake. + // The normal table should not be deleted from the Delta Lake. assert!( local_data_folder .is_normal_table(NORMAL_TABLE_NAME) @@ -698,7 +697,7 @@ mod tests { .await .unwrap(); - // The time series table should not be deleted from the metadata Delta Lake. + // The time series table should not be deleted from the Delta Lake. assert!( local_data_folder .is_time_series_table(TIME_SERIES_TABLE_NAME) diff --git a/crates/modelardb_server/src/main.rs b/crates/modelardb_server/src/main.rs index 23171b7be..75f6f5833 100644 --- a/crates/modelardb_server/src/main.rs +++ b/crates/modelardb_server/src/main.rs @@ -52,11 +52,11 @@ pub enum ClusterMode { /// Setup tracing that prints to stdout, parse the command line arguments to extract /// [`DataFolders`], construct a [`Context`] with the systems components, initialize the normal -/// tables and time series tables in the metadata Delta Lake, initialize a CTRL+C handler that -/// flushes the data in memory to disk, and start the Apache Arrow Flight interface. Returns -/// [`ModelarDbServerError`](error::ModelarDbServerError) if the command line arguments -/// cannot be parsed, if the metadata cannot be read from the database, or if the Apache Arrow -/// Flight interface cannot be started. +/// tables and time series tables in the Delta Lake, initialize a CTRL+C handler that flushes the +/// data in memory to disk, and start the Apache Arrow Flight interface. Returns +/// [`ModelarDbServerError`](error::ModelarDbServerError) if the command line arguments cannot be +/// parsed, if the metadata cannot be read from the database, or if the Apache Arrow Flight +/// interface cannot be started. #[tokio::main] async fn main() -> Result<()> { // Initialize a tracing layer that logs events to stdout. diff --git a/crates/modelardb_server/src/manager.rs b/crates/modelardb_server/src/manager.rs index b1004af5c..2e00e39aa 100644 --- a/crates/modelardb_server/src/manager.rs +++ b/crates/modelardb_server/src/manager.rs @@ -14,7 +14,7 @@ */ //! Interface to connect to and interact with the manager, used if the server is started with a -//! manager and needs to interact with it to initialize the metadata Delta Lake. +//! manager and needs to interact with it to initialize the Delta Lake. 
use std::sync::Arc; use std::{env, str}; diff --git a/crates/modelardb_server/src/storage/compressed_data_manager.rs b/crates/modelardb_server/src/storage/compressed_data_manager.rs index c272aea74..38e90e949 100644 --- a/crates/modelardb_server/src/storage/compressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/compressed_data_manager.rs @@ -553,7 +553,7 @@ mod tests { COMPRESSED_RESERVED_MEMORY_IN_BYTES, )); - // Create a local data folder and save a single time series table to the metadata Delta Lake. + // Create a local data folder and save a single time series table to the Delta Lake. let temp_dir_url = temp_dir.path().to_str().unwrap(); let local_data_folder = DataFolder::open_local_url(temp_dir_url).await.unwrap(); diff --git a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs index a352159d1..a7274e815 100644 --- a/crates/modelardb_server/src/storage/uncompressed_data_manager.rs +++ b/crates/modelardb_server/src/storage/uncompressed_data_manager.rs @@ -225,9 +225,9 @@ impl UncompressedDataManager { /// Insert a single data point into the in-memory buffer with the tag hash that corresponds to /// `tag_values` if one exists. If the buffer has been spilled, read it back into memory. If no /// buffer exists for the tag hash, allocate a new buffer that will be compressed within the - /// error bound in `time_series_table_metadata`. Returns [`true`] if a buffer was spilled, [`false`] - /// if not, and [`ModelarDbServerError`](crate::error::ModelarDbServerError) if the error bound - /// cannot be retrieved from the metadata Delta Lake. + /// error bound in `time_series_table_metadata`. Returns [`true`] if a buffer was spilled, + /// [`false`] if not, and [`ModelarDbServerError`](crate::error::ModelarDbServerError) if the + /// error bound cannot be retrieved from the Delta Lake. async fn insert_data_point( &self, tag_values: Vec, @@ -591,7 +591,7 @@ impl UncompressedDataManager { .map(|(uncompressed_values, field_column_index)| { let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; - modelardb_compression::try_compress_univariate_arrays( + modelardb_compression::try_compress_univariate_time_series( uncompressed_timestamps, uncompressed_values, error_bound, diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index c7edcbca7..aab50ac4d 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -1,4 +1,4 @@ -/* Copyright 2024 The ModelarDB Contributors +/* Copyright 2025 The ModelarDB Contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -266,8 +266,8 @@ impl DataFolder { Ok(data_folder) } - /// If they do not already exist, create the tables in the metadata Delta Lake for normal table - /// and time series table metadata and register them with the Apache DataFusion session context. + /// If they do not already exist, create the tables in the Delta Lake for normal table and time + /// series table metadata and register them with the Apache DataFusion session context. /// * The `normal_table_metadata` table contains the metadata for normal tables. /// * The `time_series_table_metadata` table contains the main metadata for time series tables. 
/// * The `time_series_table_field_columns` table contains the name, index, error bound value, @@ -331,9 +331,9 @@ impl DataFolder { } /// Register all normal tables and time series tables in `self` with its [`SessionContext`]. - /// `data_sink` set as the [`DataSink`] for all of the tables. If the tables could not be + /// `data_sink` is set as the [`DataSink`] for all of the tables. If the tables could not be /// registered, [`ModelarDbStorageError`] is returned. - pub async fn register_normal_and_time_series_tables( + pub async fn register_tables( &self, data_sink: Arc, ) -> Result<()> { @@ -386,7 +386,7 @@ impl DataFolder { /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the /// table does not exist. pub async fn delta_table(&self, table_name: &str) -> Result { - let table_path = self.location_of_compressed_table(table_name); + let table_path = self.location_of_table(table_name); self.delta_table_from_path(&table_path).await } @@ -404,7 +404,7 @@ impl DataFolder { /// [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or the /// table does not exist. pub async fn delta_ops(&self, table_name: &str) -> Result { - let table_path = self.location_of_compressed_table(table_name); + let table_path = self.location_of_table(table_name); self.delta_table_from_path(&table_path) .await .map(Into::into) @@ -449,8 +449,8 @@ impl DataFolder { .contains(&table_name.to_owned())) } - /// Return the name of each table currently in the metadata Delta Lake. If the table names - /// cannot be retrieved, [`ModelarDbStorageError`] is returned. + /// Return the name of each table currently in the Delta Lake. If the table names cannot be + /// retrieved, [`ModelarDbStorageError`] is returned. pub async fn table_names(&self) -> Result> { let normal_table_names = self.normal_table_names().await?; let time_series_table_names = self.time_series_table_names().await?; @@ -461,8 +461,8 @@ impl DataFolder { Ok(table_names) } - /// Return the name of each normal table currently in the metadata Delta Lake. Note that this - /// does not include time series tables. If the normal table names cannot be retrieved, + /// Return the name of each normal table currently in the Delta Lake. Note that this does not + /// include time series tables. If the normal table names cannot be retrieved, /// [`ModelarDbStorageError`] is returned. pub async fn normal_table_names(&self) -> Result> { self.table_names_of_type(TableType::NormalTable).await @@ -478,7 +478,7 @@ impl DataFolder { { self.delta_table(table_name) .await - .expect("Delta Lake table should exist if the table is in the metadata Delta Lake.") + .expect("Delta Lake table should exist if the metadata is in the Delta Lake.") .get_schema() .expect("Delta Lake table should be loaded and metadata should be in the log.") .try_into() @@ -488,8 +488,8 @@ impl DataFolder { } } - /// Return the name of each time series table currently in the metadata Delta Lake. Note that - /// this does not include normal tables. If the time series table names cannot be retrieved, + /// Return the name of each time series table currently in the Delta Lake. Note that this does + /// not include normal tables. If the time series table names cannot be retrieved, /// [`ModelarDbStorageError`] is returned. 
pub async fn time_series_table_names(&self) -> Result> { self.table_names_of_type(TableType::TimeSeriesTable).await @@ -526,9 +526,9 @@ impl DataFolder { } } - /// Return a [`DeltaTableWriter`] for writing to the time series table with `delta_table` in the - /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be - /// established or the table does not exist. + /// Return a [`DeltaTableWriter`] for writing to the time series table corresponding to + /// `delta_table` in the Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta + /// Lake cannot be established or the table does not exist. pub async fn time_series_table_writer( &self, delta_table: DeltaTable, @@ -553,9 +553,9 @@ impl DataFolder { DeltaTableWriter::try_new(delta_table, partition_columns, writer_properties) } - /// Return a [`DeltaTableWriter`] for writing to the table with `delta_table` in the Delta Lake, - /// or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be established or - /// the table does not exist. + /// Return a [`DeltaTableWriter`] for writing to the table corresponding to `delta_table` in the + /// Delta Lake, or a [`ModelarDbStorageError`] if a connection to the Delta Lake cannot be + /// established or the table does not exist. pub async fn normal_or_metadata_table_writer( &self, delta_table: DeltaTable, @@ -599,7 +599,7 @@ impl DataFolder { table_name, schema, &[], - self.location_of_compressed_table(table_name), + self.location_of_table(table_name), SaveMode::ErrorIfExists, ) .await @@ -616,14 +616,14 @@ impl DataFolder { &time_series_table_metadata.name, &time_series_table_metadata.compressed_schema, &[FIELD_COLUMN.to_owned()], - self.location_of_compressed_table(&time_series_table_metadata.name), + self.location_of_table(&time_series_table_metadata.name), SaveMode::ErrorIfExists, ) .await } - /// Return the location of the compressed time series or normal table with `table_name`. - fn location_of_compressed_table(&self, table_name: &str) -> String { + /// Return the location of the table with `table_name. + fn location_of_table(&self, table_name: &str) -> String { format!("{}/{TABLE_FOLDER}/{table_name}", self.location) } @@ -672,11 +672,11 @@ impl DataFolder { Ok(delta_table) } - /// Drop the metadata Delta Lake table with `table_name` from the Delta Lake by deleting every - /// file related to the table. The table folder cannot be deleted directly since folders do not - /// exist in object stores and therefore cannot be operated upon. If the table was dropped - /// successfully, the paths to the deleted files are returned, otherwise a - /// [`ModelarDbStorageError`] is returned. + /// Drop the metadata table with `table_name` from the Delta Lake by deleting every file related + /// to the table. The table folder cannot be deleted directly since folders do not exist in + /// object stores and therefore cannot be operated upon. If the table was dropped successfully, + /// the paths to the deleted files are returned, otherwise a [`ModelarDbStorageError`] is + /// returned. pub async fn drop_metadata_table(&self, table_name: &str) -> Result> { let table_path = format!("{METADATA_FOLDER}/{table_name}"); self.delete_table_files(&table_path).await @@ -753,9 +753,9 @@ impl DataFolder { Ok(()) } - /// Save the created normal table to the metadata Delta Lake. This consists of adding a row to - /// the `normal_table_metadata` table with the `name` of the table. 
If the normal table metadata - /// was saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. + /// Save the created normal table to the Delta Lake. This consists of adding a row to the + /// `normal_table_metadata` table with the `name` of the table. If the normal table metadata was + /// saved, return [`Ok`], otherwise return [`ModelarDbStorageError`]. pub async fn save_normal_table_metadata(&self, name: &str) -> Result<()> { self.write_columns_to_metadata_table( "normal_table_metadata", @@ -766,14 +766,14 @@ impl DataFolder { Ok(()) } - /// Save the created time series table to the metadata Delta Lake. This includes adding a row to - /// the `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` + /// Save the created time series table to the Delta Lake. This includes adding a row to the + /// `time_series_table_metadata` table and adding a row to the `time_series_table_field_columns` /// table for each field column. pub async fn save_time_series_table_metadata( &self, time_series_table_metadata: &TimeSeriesTableMetadata, ) -> Result<()> { - // Convert the query schema to bytes, so it can be saved in the metadata Delta Lake. + // Convert the query schema to bytes, so it can be saved in the Delta Lake. let query_schema_bytes = try_convert_schema_to_bytes(&time_series_table_metadata.query_schema)?; @@ -845,9 +845,8 @@ impl DataFolder { Ok(()) } - /// Write `columns` to a metadata Delta Lake table with `table_name`. Returns an updated - /// [`DeltaTable`] version if the file was written successfully, otherwise returns - /// [`ModelarDbStorageError`]. + /// Write `columns` to a Delta Lake table with `table_name`. Returns an updated [`DeltaTable`] + /// version if the file was written successfully, otherwise returns [`ModelarDbStorageError`]. pub async fn write_columns_to_metadata_table( &self, table_name: &str, @@ -874,7 +873,7 @@ impl DataFolder { .await } - /// Write `record_batches` with segments to a Delta Lake table for a time series table with + /// Write `compressed_segments` with segments to a Delta Lake table for a time series table with /// `table_name`. Returns an updated [`DeltaTable`] if the file was written successfully, /// otherwise returns [`ModelarDbStorageError`]. pub async fn write_compressed_segments_to_time_series_table( @@ -890,7 +889,7 @@ impl DataFolder { /// Write `record_batches` to the `delta_table_writer` and commit. Returns an updated /// [`DeltaTable`] if all `record_batches` are written and committed successfully, otherwise it - /// rollback all writes done using `delta_table_writer` and returns [`ModelarDbStorageError`]. + /// rolls back all writes done using `delta_table_writer` and returns [`ModelarDbStorageError`]. async fn write_record_batches_to_table( &self, mut delta_table_writer: DeltaTableWriter, @@ -905,9 +904,9 @@ impl DataFolder { } } - /// Depending on the type of the table with `table_name`, drop either the normal table - /// metadata or the time series table metadata from the metadata Delta Lake. If the table does - /// not exist or the metadata could not be dropped, [`ModelarDbStorageError`] is returned. + /// Depending on the type of the table with `table_name`, drop either the normal table metadata + /// or the time series table metadata from the Delta Lake. If the table does not exist or the + /// metadata could not be dropped, [`ModelarDbStorageError`] is returned. pub async fn drop_table_metadata(&self, table_name: &str) -> Result<()> { if self.is_normal_table(table_name).await? 
{ self.drop_normal_table_metadata(table_name).await @@ -921,8 +920,8 @@ impl DataFolder { } /// Drop the metadata for the normal table with `table_name` from the `normal_table_metadata` - /// table in the metadata Delta Lake. If the metadata could not be dropped, - /// [`ModelarDbStorageError`] is returned. + /// table in the Delta Lake. If the metadata could not be dropped, [`ModelarDbStorageError`] is + /// returned. async fn drop_normal_table_metadata(&self, table_name: &str) -> Result<()> { let delta_ops = self.metadata_delta_ops("normal_table_metadata").await?; @@ -934,10 +933,10 @@ impl DataFolder { Ok(()) } - /// Drop the metadata for the time series table with `table_name` from the metadata Delta Lake. - /// This includes deleting a row from the `time_series_table_metadata` table and deleting a row - /// from the `time_series_table_field_columns` table for each field column. If the metadata - /// could not be dropped, [`ModelarDbStorageError`] is returned. + /// Drop the metadata for the time series table with `table_name` from the Delta Lake. This + /// includes deleting a row from the `time_series_table_metadata` table and deleting a row from + /// the `time_series_table_field_columns` table for each field column. If the metadata could not + /// be dropped, [`ModelarDbStorageError`] is returned. async fn drop_time_series_table_metadata(&self, table_name: &str) -> Result<()> { // Delete the table metadata from the time_series_table_metadata table. self.metadata_delta_ops("time_series_table_metadata") @@ -985,7 +984,7 @@ impl DataFolder { } /// Return the [`TimeSeriesTableMetadata`] for the time series table with `table_name` in the - /// metadata Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, + /// Delta Lake. If the [`TimeSeriesTableMetadata`] cannot be retrieved, /// [`ModelarDbStorageError`] is returned. pub async fn time_series_table_metadata_for_time_series_table( &self, @@ -1406,7 +1405,7 @@ mod tests { async fn test_save_normal_table_metadata() { let (_temp_dir, data_folder) = create_data_folder_and_save_normal_tables().await; - // Retrieve the normal table from the metadata Delta Lake. + // Retrieve the normal table from the Delta Lake. let sql = "SELECT table_name FROM metadata.normal_table_metadata ORDER BY table_name"; let batch = sql_and_concat(&data_folder.session_context, sql) .await @@ -1669,7 +1668,7 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let data_folder = DataFolder::open_local(temp_dir.path()).await.unwrap(); - // Save a time series table to the metadata Delta Lake. + // Save a time series table to the Delta Lake. let time_series_table_metadata = test::time_series_table_metadata(); data_folder .save_time_series_table_metadata(&time_series_table_metadata) diff --git a/crates/modelardb_storage/src/query/normal_table.rs b/crates/modelardb_storage/src/query/normal_table.rs index c6ff4d20a..a30f74d73 100644 --- a/crates/modelardb_storage/src/query/normal_table.rs +++ b/crates/modelardb_storage/src/query/normal_table.rs @@ -129,7 +129,7 @@ impl TableProvider for NormalTable { /// Create an [`ExecutionPlan`] that will insert the result of `input` into the normal table. /// Generally, [`arrow_flight::flight_service_server::FlightService::do_put()`] should be used /// instead of this method as it is more efficient. Returns a [`DataFusionError::Plan`] if the - /// necessary metadata cannot be retrieved from the metadata Delta Lake. + /// necessary metadata cannot be retrieved from the Delta Lake. 
async fn insert_into( &self, _state: &dyn Session, diff --git a/crates/modelardb_storage/src/query/time_series_table.rs b/crates/modelardb_storage/src/query/time_series_table.rs index 1b6392520..82ef41b4c 100644 --- a/crates/modelardb_storage/src/query/time_series_table.rs +++ b/crates/modelardb_storage/src/query/time_series_table.rs @@ -407,7 +407,7 @@ fn convert_logical_expr_to_physical_expr( /// Create an [`ExecutionPlan`] that will return the compressed segments that represent the data /// points for `field_column_index` in `delta_table`. Returns a [`DataFusionError`] if the necessary -/// metadata cannot be retrieved from the metadata Delta Lake. +/// metadata cannot be retrieved from the Delta Lake. fn new_data_source_exec( delta_table: &DeltaTable, partition_filters: &[PartitionFilter], From bff7eaf3a1b7ff20a3e060a6d98c5b3180fe1486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 30 Oct 2025 07:49:59 +0000 Subject: [PATCH 27/31] Fix compile error due to renaming --- crates/modelardb_compression/src/compression.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 280d1a5bb..6dd6550d5 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -161,7 +161,7 @@ pub fn try_split_and_compress_univariate_time_series( let error_bound = time_series_table_metadata.error_bounds[*field_column_index]; - let compressed_time_series = try_compress_univariate_time_series( + let compressed_segments = try_compress_univariate_time_series( uncompressed_timestamps, uncompressed_values, error_bound, @@ -171,7 +171,7 @@ pub fn try_split_and_compress_univariate_time_series( ) .expect("uncompressed_timestamps and uncompressed_values should have the same length."); - compressed_time_series.push(compressed_time_series); + compressed_time_series.push(compressed_segments); } Ok(()) From 5c33dc42e5435fc862ab1d40613f4a0729ab03ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Thu, 30 Oct 2025 08:32:30 +0000 Subject: [PATCH 28/31] Update based on comments from @CGodiksen --- crates/modelardb_compression/src/compression.rs | 6 +++--- crates/modelardb_storage/src/data_folder.rs | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/modelardb_compression/src/compression.rs b/crates/modelardb_compression/src/compression.rs index 6dd6550d5..db48ee612 100644 --- a/crates/modelardb_compression/src/compression.rs +++ b/crates/modelardb_compression/src/compression.rs @@ -140,9 +140,9 @@ fn sort_time_series_by_tags_and_time( } /// Compress the field columns in `uncompressed_time_series` from the table with -/// `time_series_table_metadata` using [`try_compress_univariate_time_series`] and append the result -/// to `compressed_data`. It is assumed that all data points in `uncompressed_time_series` have the -/// same tags as in `tag_values`. +/// `time_series_table_metadata` using [`try_compress_univariate_time_series()`] and append the +/// result to `compressed_data`. It is assumed that all data points in `uncompressed_time_series` +/// have the same tags as in `tag_values`. 
pub fn try_split_and_compress_univariate_time_series( time_series_table_metadata: &TimeSeriesTableMetadata, uncompressed_time_series: &RecordBatch, diff --git a/crates/modelardb_storage/src/data_folder.rs b/crates/modelardb_storage/src/data_folder.rs index aab50ac4d..86f2a66ec 100644 --- a/crates/modelardb_storage/src/data_folder.rs +++ b/crates/modelardb_storage/src/data_folder.rs @@ -622,7 +622,7 @@ impl DataFolder { .await } - /// Return the location of the table with `table_name. + /// Return the location of the table with `table_name`. fn location_of_table(&self, table_name: &str) -> String { format!("{}/{TABLE_FOLDER}/{table_name}", self.location) } @@ -873,9 +873,9 @@ impl DataFolder { .await } - /// Write `compressed_segments` with segments to a Delta Lake table for a time series table with - /// `table_name`. Returns an updated [`DeltaTable`] if the file was written successfully, - /// otherwise returns [`ModelarDbStorageError`]. + /// Write `compressed_segments` to a Delta Lake table for a time series table with `table_name`. + /// Returns an updated [`DeltaTable`] if the file was written successfully, otherwise returns + /// [`ModelarDbStorageError`]. pub async fn write_compressed_segments_to_time_series_table( &self, table_name: &str, From 1e99357164a04df2724a588d2abf97b1f7bbad7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Fri, 31 Oct 2025 14:26:13 +0000 Subject: [PATCH 29/31] Update based on comments from @chrthomsen --- crates/modelardb_manager/src/remote.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/modelardb_manager/src/remote.rs b/crates/modelardb_manager/src/remote.rs index 139e9a5b8..6c690126b 100644 --- a/crates/modelardb_manager/src/remote.rs +++ b/crates/modelardb_manager/src/remote.rs @@ -262,7 +262,7 @@ impl FlightServiceHandler { .await .map_err(error_to_status_internal)?; - // Drop the table from the remote data folder Delta lake. + // Drop the table from the remote data folder Delta Lake. self.context .remote_data_folder .drop_table(table_name) @@ -291,7 +291,7 @@ impl FlightServiceHandler { ))); } - // Truncate the table in the remote data folder Delta lake. + // Truncate the table in the remote data folder Delta Lake. self.context .remote_data_folder .truncate_table(table_name) @@ -318,7 +318,7 @@ impl FlightServiceHandler { table_name: &str, maybe_retention_period_in_seconds: Option, ) -> StdResult<(), Status> { - // Vacuum the table in the remote data folder Delta lake. + // Vacuum the table in the remote data folder Delta Lake. self.context .remote_data_folder .vacuum_table(table_name, maybe_retention_period_in_seconds) From 8ca91ee30e22a3c66aec77af68b0457ad67d4986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Fri, 31 Oct 2025 15:08:24 +0000 Subject: [PATCH 30/31] Fix cargo doc warnings --- crates/modelardb_embedded/src/operations/data_folder.rs | 2 +- crates/modelardb_server/src/context.rs | 8 ++++---- crates/modelardb_server/src/remote.rs | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index a31e28b29..108acf339 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -39,7 +39,7 @@ use crate::operations::{ }; use crate::{Aggregate, TableType}; -/// [`DataSink`] that rejects INSERT statements passed to [`DataFolder.read()`]. 
+/// [`DataSink`] that rejects INSERT statements passed to [`Operations.read()`]. pub struct DataFolderDataSink { /// The schema of the data sink is empty since it rejects everything. schema: Arc, diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 69d57d1b2..3be361319 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -161,8 +161,8 @@ impl Context { /// Register the normal table with `table_name` in Apache DataFusion. If the normal table does /// not exist or could not be registered with Apache DataFusion, return - /// [`ModelarDbServerError`]. [`DataFolder.register_normal_and_time_series_tables()`] is not - /// used so a unique [`NormalTableDataSink`] can be passed per table. + /// [`ModelarDbServerError`]. [`DataFolder.register_tables()`] is not used so a unique + /// [`NormalTableDataSink`] can be passed per table. async fn register_normal_table(&self, table_name: &str) -> Result<()> { let session_context = self.data_folders.query_data_folder.session_context(); @@ -211,8 +211,8 @@ impl Context { /// Register the time series table with `time_series_table_metadata` in Apache DataFusion. If /// the time series table does not exist or could not be registered with Apache DataFusion, - /// return [`ModelarDbServerError`]. [`DataFolder.register_normal_and_time_series_tables()`] is - /// not used so a unique [`TimeSeriesTableDataSink`] can be passed per table. + /// return [`ModelarDbServerError`]. [`DataFolder.register_tables()`] is not used so a unique + /// [`TimeSeriesTableDataSink`] can be passed per table. async fn register_time_series_table( &self, time_series_table_metadata: Arc, diff --git a/crates/modelardb_server/src/remote.rs b/crates/modelardb_server/src/remote.rs index 96bdd206e..e0d96ce62 100644 --- a/crates/modelardb_server/src/remote.rs +++ b/crates/modelardb_server/src/remote.rs @@ -227,8 +227,8 @@ pub fn flight_data_to_record_batch( .map_err(|error| Status::invalid_argument(error.to_string())) } -/// Return the table stored as the first element in [`FlightDescriptor.path`], otherwise a -/// [`Status`] that specifies that the table name is missing. +/// Return the table stored as the first element in `FlightDescriptor.path`, otherwise a [`Status`] +/// that specifies that the table name is missing. pub fn table_name_from_flight_descriptor( flight_descriptor: &FlightDescriptor, ) -> StdResult<&String, Status> { From ef9c74d3212d14d9292e530917fe1d61bebe7d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=B8ren=20Kejser=20Jensen?= Date: Fri, 31 Oct 2025 15:29:09 +0000 Subject: [PATCH 31/31] Import modelardb_storage::data_folder::DataFolder --- crates/modelardb_embedded/src/operations/data_folder.rs | 2 +- crates/modelardb_server/src/context.rs | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/modelardb_embedded/src/operations/data_folder.rs b/crates/modelardb_embedded/src/operations/data_folder.rs index 108acf339..1300f62d1 100644 --- a/crates/modelardb_embedded/src/operations/data_folder.rs +++ b/crates/modelardb_embedded/src/operations/data_folder.rs @@ -39,7 +39,7 @@ use crate::operations::{ }; use crate::{Aggregate, TableType}; -/// [`DataSink`] that rejects INSERT statements passed to [`Operations.read()`]. +/// [`DataSink`] that rejects INSERT statements passed to [`DataFolder::read()`]. pub struct DataFolderDataSink { /// The schema of the data sink is empty since it rejects everything. 
schema: Arc, diff --git a/crates/modelardb_server/src/context.rs b/crates/modelardb_server/src/context.rs index 3be361319..0c88e1b0a 100644 --- a/crates/modelardb_server/src/context.rs +++ b/crates/modelardb_server/src/context.rs @@ -161,8 +161,8 @@ impl Context { /// Register the normal table with `table_name` in Apache DataFusion. If the normal table does /// not exist or could not be registered with Apache DataFusion, return - /// [`ModelarDbServerError`]. [`DataFolder.register_tables()`] is not used so a unique - /// [`NormalTableDataSink`] can be passed per table. + /// [`ModelarDbServerError`]. [`modelardb_storage::data_folder::DataFolder::register_tables()`] + /// is not used so a unique [`NormalTableDataSink`] can be passed per table. async fn register_normal_table(&self, table_name: &str) -> Result<()> { let session_context = self.data_folders.query_data_folder.session_context(); @@ -211,7 +211,8 @@ impl Context { /// Register the time series table with `time_series_table_metadata` in Apache DataFusion. If /// the time series table does not exist or could not be registered with Apache DataFusion, - /// return [`ModelarDbServerError`]. [`DataFolder.register_tables()`] is not used so a unique + /// return [`ModelarDbServerError`]. + /// [`modelardb_storage::data_folder::DataFolder::register_tables()`] is not used so a unique /// [`TimeSeriesTableDataSink`] can be passed per table. async fn register_time_series_table( &self,