From 52b2dd4a2cb955c88c55fa458801d7ade355701a Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:31:54 -0500 Subject: [PATCH 1/2] some docstrings --- datafusion/datasource/src/file_scan_config.rs | 42 +++++++++---------- datafusion/datasource/src/table_schema.rs | 33 +++++++++++++++ 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index d557a99274eab..659b9f1d1834c 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -155,6 +155,11 @@ pub struct FileScanConfig { pub object_store_url: ObjectStoreUrl, /// Schema information including the file schema, table partition columns, /// and the combined table schema. + /// + /// The table schema (file schema + partition columns) is the schema exposed + /// upstream of [`FileScanConfig`] (e.g. in [`DataSourceExec`]). + /// + /// See [`TableSchema`] for more information. /// /// [`DataSourceExec`]: crate::source::DataSourceExec pub table_schema: TableSchema, @@ -244,23 +249,19 @@ pub struct FileScanConfig { #[derive(Clone)] pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, - /// Table schema before any projections or partition columns are applied. + /// Schema information including the file schema, table partition columns, + /// and the combined table schema. /// - /// This schema is used to read the files, but is **not** necessarily the - /// schema of the physical files. Rather this is the schema that the + /// This schema is used to read the files, but the file schema is **not** necessarily + /// the schema of the physical files. Rather this is the schema that the /// physical file schema will be mapped onto, and the schema that the /// [`DataSourceExec`] will return. 
/// - /// This is usually the same as the table schema as specified by the `TableProvider` minus any partition columns. - /// - /// This probably would be better named `table_schema` - /// /// [`DataSourceExec`]: crate::source::DataSourceExec - file_schema: SchemaRef, + table_schema: TableSchema, file_source: Arc<dyn FileSource>, limit: Option<usize>, projection: Option<Vec<usize>>, - table_partition_cols: Vec<FieldRef>, constraints: Option<Constraints>, file_groups: Vec<FileGroup>, statistics: Option<Statistics>, @@ -285,7 +286,7 @@ impl FileScanConfigBuilder { ) -> Self { Self { object_store_url, - file_schema, + table_schema: TableSchema::from_file_schema(file_schema), file_source, file_groups: vec![], statistics: None, @@ -294,7 +295,6 @@ impl FileScanConfigBuilder { new_lines_in_values: None, limit: None, projection: None, - table_partition_cols: vec![], constraints: None, batch_size: None, expr_adapter_factory: None, @@ -326,10 +326,13 @@ impl FileScanConfigBuilder { /// Set the partitioning columns pub fn with_table_partition_cols(mut self, table_partition_cols: Vec<Field>) -> Self { - self.table_partition_cols = table_partition_cols + let table_partition_cols: Vec<FieldRef> = table_partition_cols .into_iter() .map(|f| Arc::new(f) as FieldRef) .collect(); + self.table_schema = self + .table_schema + .with_table_partition_cols(table_partition_cols); self } @@ -427,11 +430,10 @@ impl FileScanConfigBuilder { pub fn build(self) -> FileScanConfig { let Self { object_store_url, - file_schema, + table_schema, file_source, limit, projection, - table_partition_cols, constraints, file_groups, statistics, @@ -443,19 +445,16 @@ } = self; let constraints = constraints.unwrap_or_default(); - let statistics = - statistics.unwrap_or_else(|| Statistics::new_unknown(&file_schema)); + let statistics = statistics + .unwrap_or_else(|| Statistics::new_unknown(table_schema.file_schema())); let file_source = file_source .with_statistics(statistics.clone()) - .with_schema(Arc::clone(&file_schema)); + 
.with_schema(Arc::clone(table_schema.file_schema())); let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); - // Create TableSchema from file_schema and table_partition_cols - let table_schema = TableSchema::new(file_schema, table_partition_cols); - FileScanConfig { object_store_url, table_schema, @@ -477,7 +476,7 @@ impl From<FileScanConfig> for FileScanConfigBuilder { fn from(config: FileScanConfig) -> Self { Self { object_store_url: config.object_store_url, - file_schema: Arc::clone(config.table_schema.file_schema()), + table_schema: config.table_schema, file_source: Arc::<dyn FileSource>::clone(&config.file_source), file_groups: config.file_groups, statistics: config.file_source.statistics().ok(), @@ -486,7 +485,6 @@ impl From<FileScanConfig> for FileScanConfigBuilder { new_lines_in_values: Some(config.new_lines_in_values), limit: config.limit, projection: config.projection, - table_partition_cols: config.table_schema.table_partition_cols().clone(), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index 9413bd9ef20bf..fed71d525554b 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -121,6 +121,39 @@ impl TableSchema { } } + /// Create a new TableSchema from a file schema with no partition columns. + pub fn from_file_schema(file_schema: SchemaRef) -> Self { + Self::new(file_schema, vec![]) + } + + /// Set the table partition columns and rebuild the table schema. + /// + /// This is a convenience method for constructing a TableSchema with builder-style syntax. 
+ /// + /// # Example + /// + /// ``` + /// # use std::sync::Arc; + /// # use arrow::datatypes::{Schema, Field, DataType}; + /// # use datafusion_datasource::TableSchema; + /// let file_schema = Arc::new(Schema::new(vec![ + /// Field::new("user_id", DataType::Int64, false), + /// ])); + /// + /// let table_schema = TableSchema::from_file_schema(file_schema) + /// .with_table_partition_cols(vec![ + /// Arc::new(Field::new("date", DataType::Utf8, false)), + /// ]); + /// + /// assert_eq!(table_schema.table_schema().fields().len(), 2); + /// ``` + pub fn with_table_partition_cols( + self, + table_partition_cols: Vec<FieldRef>, + ) -> TableSchema { + Self::new(self.file_schema, table_partition_cols) + } + /// Get the file schema (without partition columns). /// /// This is the schema of the actual data files on disk. From c61d53f571475fca866df15b7691258d9683df25 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:53:54 -0500 Subject: [PATCH 2/2] cleanup --- datafusion/datasource/src/file_scan_config.rs | 4 +-- datafusion/datasource/src/table_schema.rs | 25 +++---------------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 659b9f1d1834c..4dfb6a4ec3d33 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -155,10 +155,10 @@ pub struct FileScanConfig { pub object_store_url: ObjectStoreUrl, /// Schema information including the file schema, table partition columns, /// and the combined table schema. - /// + /// /// The table schema (file schema + partition columns) is the schema exposed /// upstream of [`FileScanConfig`] (e.g. in [`DataSourceExec`]). - /// + /// /// See [`TableSchema`] for more information. 
/// /// [`DataSourceExec`]: crate::source::DataSourceExec diff --git a/datafusion/datasource/src/table_schema.rs b/datafusion/datasource/src/table_schema.rs index fed71d525554b..8e95585ce873b 100644 --- a/datafusion/datasource/src/table_schema.rs +++ b/datafusion/datasource/src/table_schema.rs @@ -127,31 +127,12 @@ impl TableSchema { } /// Set the table partition columns and rebuild the table schema. - /// - /// This is a convenience method for constructing a TableSchema with builder-style syntax. - /// - /// # Example - /// - /// ``` - /// # use std::sync::Arc; - /// # use arrow::datatypes::{Schema, Field, DataType}; - /// # use datafusion_datasource::TableSchema; - /// let file_schema = Arc::new(Schema::new(vec![ - /// Field::new("user_id", DataType::Int64, false), - /// ])); - /// - /// let table_schema = TableSchema::from_file_schema(file_schema) - /// .with_table_partition_cols(vec![ - /// Arc::new(Field::new("date", DataType::Utf8, false)), - /// ]); - /// - /// assert_eq!(table_schema.table_schema().fields().len(), 2); - /// ``` pub fn with_table_partition_cols( - self, + mut self, table_partition_cols: Vec<FieldRef>, ) -> TableSchema { - Self::new(self.file_schema, table_partition_cols) + self.table_partition_cols = table_partition_cols; + self } /// Get the file schema (without partition columns).