From 96ea5ef6ad24b558e5c3bd61b414c355cfa0372e Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 6 Jun 2025 19:20:17 +0800
Subject: [PATCH 01/22] feat: Enhance schema inference to preserve provided
 schema in ListingTableConfig

---
 .../core/src/datasource/listing/table.rs      | 46 +++++++++++++++++--
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 3c87d3ee2329c..91f8b6243192f 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -216,14 +216,25 @@ impl ListingTableConfig {
     pub async fn infer_schema(self, state: &dyn Session) -> Result<Self> {
         match self.options {
             Some(options) => {
-                let schema = if let Some(url) = self.table_paths.first() {
-                    options.infer_schema(state, url).await?
-                } else {
-                    Arc::new(Schema::empty())
+                let ListingTableConfig {
+                    table_paths,
+                    file_schema,
+                    ..
+                } = self;
+
+                let schema = match file_schema {
+                    Some(schema) => schema,
+                    None => {
+                        if let Some(url) = table_paths.first() {
+                            options.infer_schema(state, url).await?
+                        } else {
+                            Arc::new(Schema::empty())
+                        }
+                    }
                 };
 
                 Ok(Self {
-                    table_paths: self.table_paths,
+                    table_paths,
                     file_schema: Some(schema),
                     options: Some(options),
                 })
@@ -2473,4 +2484,29 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn infer_preserves_provided_schema() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let testdata = crate::test_util::datafusion_test_data();
+        let filename = format!("{testdata}/aggregate_simple.csv");
+        let table_path = ListingTableUrl::parse(filename).unwrap();
+
+        let provided_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Float32, true),
+            Field::new("c2", DataType::Float64, true),
+            Field::new("c3", DataType::Boolean, true),
+            Field::new("c4", DataType::Utf8, true),
+        ]));
+
+        let config =
+            ListingTableConfig::new(table_path).with_schema(Arc::clone(&provided_schema));
+
+        let config = config.infer(&ctx.state()).await?;
+
+        assert_eq!(*config.file_schema.unwrap(), *provided_schema);
+
+        Ok(())
+    }
 }
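
From d5fd7f8b9d6fc2a8e90cf2ef12f2541200131c07 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 6 Jun 2025 21:00:51 +0800
Subject: [PATCH 02/22] feat: Add SchemaSource enum to track schema origin in
 ListingTableConfig

---

Note (kept below the `---` marker, which `git am` discards): combined with
patch 01, the enum added here lets a caller hand a schema to
`ListingTableConfig` and trust that inference will not overwrite it. A minimal
sketch of that behavior, assuming the placeholder CSV path exists; this is
illustrative usage, not code from the series:

    use std::sync::Arc;

    use datafusion::arrow::datatypes::{DataType, Field, Schema};
    use datafusion::datasource::file_format::csv::CsvFormat;
    use datafusion::datasource::listing::{
        ListingOptions, ListingTableConfig, ListingTableUrl,
    };
    use datafusion::error::Result;
    use datafusion::prelude::SessionContext;

    #[tokio::main]
    async fn main() -> Result<()> {
        let ctx = SessionContext::new();
        // Placeholder path; point this at any existing CSV file.
        let table_path = ListingTableUrl::parse("file:///tmp/data.csv")?;

        // A caller-provided schema...
        let provided = Arc::new(Schema::new(vec![
            Field::new("c1", DataType::Float32, true),
        ]));

        let config = ListingTableConfig::new(table_path)
            .with_listing_options(ListingOptions::new(Arc::new(CsvFormat::default())))
            .with_schema(Arc::clone(&provided));

        // ...survives inference (patch 01) instead of being re-read from the file.
        let config = config.infer_schema(&ctx.state()).await?;
        assert_eq!(*config.file_schema.unwrap(), *provided);
        Ok(())
    }

The resulting config can then go through `ListingTable::try_new` as usual.

 .../core/src/datasource/listing/table.rs      | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 91f8b6243192f..693a33d569cfb 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -58,6 +58,17 @@ use futures::{future, stream, Stream, StreamExt, TryStreamExt};
 use itertools::Itertools;
 use object_store::ObjectStore;
 
+/// Indicates the source of the schema for a [`ListingTable`]
+#[derive(Debug, Clone, PartialEq)]
+pub enum SchemaSource {
+    /// Schema is not yet set (initial state)
+    None,
+    /// Schema was inferred from first table_path
+    Inferred,
+    /// Schema was specified explicitly via with_schema
+    Specified,
+}
+
 /// Configuration for creating a [`ListingTable`]
 ///
 ///
@@ -74,6 +85,8 @@ pub struct ListingTableConfig {
     ///
     /// See details on [`ListingTableConfig::with_listing_options`]
     pub options: Option<ListingOptions>,
+    /// Tracks the source of the schema information
+    schema_source: SchemaSource,
 }
 
 impl ListingTableConfig {
@@ -84,6 +97,7 @@ impl ListingTableConfig {
             table_paths,
             file_schema: None,
             options: None,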
+            schema_source: SchemaSource::None,
         }
     }
 
@@ -95,8 +109,14 @@ impl ListingTableConfig {
             table_paths,
             file_schema: None,
             options: None,
+            schema_source: SchemaSource::None,
         }
     }
+
+    /// Returns the source of the schema for this configuration
+    pub fn schema_source(&self) -> &SchemaSource {
+        &self.schema_source
+    }
     /// Set the `schema` for the overall [`ListingTable`]
     ///
     /// [`ListingTable`] will automatically coerce, when possible, the schema
@@ -112,6 +132,7 @@ impl ListingTableConfig {
             table_paths: self.table_paths,
             file_schema: Some(schema),
             options: self.options,
+            schema_source: SchemaSource::Specified,
         }
     }
 
@@ -124,6 +145,7 @@ impl ListingTableConfig {
             table_paths: self.table_paths,
             file_schema: self.file_schema,
             options: Some(listing_options),
+            schema_source: self.schema_source,
         }
     }
 
@@ -203,6 +225,7 @@ impl ListingTableConfig {
             table_paths: self.table_paths,
             file_schema: self.file_schema,
             options: Some(listing_options),
+            schema_source: self.schema_source,
         })
     }
 
@@ -219,16 +242,20 @@ impl ListingTableConfig {
                 let ListingTableConfig {
                     table_paths,
                     file_schema,
-                    ..
+                    options: _,
+                    schema_source,
                 } = self;
 
-                let schema = match file_schema {
-                    Some(schema) => schema,
+                let (schema, new_schema_source) = match file_schema {
+                    Some(schema) => (schema, schema_source), // Keep existing source if schema exists
                     None => {
                         if let Some(url) = table_paths.first() {
-                            options.infer_schema(state, url).await?
+                            (
+                                options.infer_schema(state, url).await?,
+                                SchemaSource::Inferred,
+                            )
                         } else {
-                            Arc::new(Schema::empty())
+                            (Arc::new(Schema::empty()), SchemaSource::Inferred)
                         }
                     }
                 };
@@ -237,6 +264,7 @@ impl ListingTableConfig {
                     table_paths,
                     file_schema: Some(schema),
                     options: Some(options),
+                    schema_source: new_schema_source,
                 })
             }
             None => internal_err!("No `ListingOptions` set for inferring schema"),
@@ -277,6 +305,7 @@ impl ListingTableConfig {
                 table_paths: self.table_paths,
                 file_schema: self.file_schema,
                 options: Some(options),
+                schema_source: self.schema_source,
             })
         }
         None => config_err!("No `ListingOptions` set for inferring schema"),
@@ -1212,7 +1241,6 @@ impl ListingTable {
 /// * `files` - A stream of `Result<PartitionedFile>` items to process
 /// * `limit` - An optional row count limit. If provided, the function will stop collecting files
 ///   once the accumulated number of rows exceeds this limit
-/// * `collect_stats` - Whether to collect and accumulate statistics from the files
 ///
 /// # Returns
 /// A `Result` containing a `FileGroup` with the collected files

From 4512ea79d0170a9b07b706b2e91183cb15365b39 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 6 Jun 2025 21:10:42 +0800
Subject: [PATCH 03/22] feat: Add schema_source field to ListingTable to track
 schema derivation

---
 datafusion/core/src/datasource/listing/table.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 693a33d569cfb..9e18e886eed0d 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -798,6 +798,8 @@ pub struct ListingTable {
     /// - Partition columns are derived from directory paths (not stored in files)
     /// - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet`
     table_schema: SchemaRef,
+    /// Indicates how the schema was derived (inferred or explicitly specified)
+    schema_source: SchemaSource,
     options: ListingOptions,
     definition: Option<String>,
     collected_statistics: FileStatisticsCache,
@@ -810,6 +812,9 @@ impl ListingTable {
     ///
     /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
     pub fn try_new(config: ListingTableConfig) -> Result<Self> {
+        // Extract schema_source before moving other parts of the config
+        let schema_source = config.schema_source().clone();
+
         let file_schema = config
             .file_schema
             .ok_or_else(|| DataFusionError::Internal("No schema provided.".into()))?;
@@ -834,6 +839,7 @@ impl ListingTable {
             table_paths: config.table_paths,
             file_schema,
             table_schema,
+            schema_source,
             options,
             definition: None,
             collected_statistics: Arc::new(DefaultFileStatisticsCache::default()),
@@ -887,6 +893,11 @@ impl ListingTable {
         &self.options
     }
 
+    /// Get the schema source
+    pub fn schema_source(&self) -> &SchemaSource {
+        &self.schema_source
+    }
+
     /// If file_sort_order is specified, creates the appropriate physical expressions
     fn try_create_output_ordering(&self) -> Result<Vec<LexOrdering>> {
         create_ordering(&self.table_schema, &self.options.file_sort_order)
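
From a0ed876cb451cf5db3e202e2769e5a6e4adedefa Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Fri, 6 Jun 2025 21:26:25 +0800
Subject: [PATCH 04/22] feat: Add tests for schema source tracking in
 ListingTableConfig

---

Note (kept below the `---` marker, which `git am` discards): the tests added
here compare `SchemaSource` values with `assert_eq!`. A hedged sketch of a
`match`-based reader that works even without the `PartialEq` derive (a later
patch in this series removes it); it assumes `SchemaSource` is re-exported
from `datafusion::datasource::listing` next to `ListingTable`, and
`describe_schema_origin` is a hypothetical helper:

    use datafusion::datasource::listing::{ListingTable, SchemaSource};

    /// Maps a table's schema origin to a human-readable label.
    fn describe_schema_origin(table: &ListingTable) -> &'static str {
        match table.schema_source() {
            // Set by ListingTableConfig::with_schema
            SchemaSource::Specified => "schema was provided explicitly",
            // Set when infer_schema read the first table path
            SchemaSource::Inferred => "schema was inferred from the first table path",
            // Initial state: no schema attached yet
            SchemaSource::None => "schema has not been set",
        }
    }

 .../core/src/datasource/listing/table.rs      | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 9e18e886eed0d..4e1c4cbed0fa2 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -2548,4 +2548,114 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_schema_source_tracking() -> Result<()> {
+        let ctx = SessionContext::new();
+
+        let testdata = crate::test_util::datafusion_test_data();
+        let filename = format!("{testdata}/aggregate_simple.csv");
+        let table_path = ListingTableUrl::parse(filename).unwrap();
+
+        // Test default schema source
+        let config = ListingTableConfig::new(table_path.clone());
+        assert_eq!(*config.schema_source(), SchemaSource::None);
+
+        // Test schema source after setting a schema explicitly
+        let provided_schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Float32, true),
+            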
Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::Boolean, true), + Field::new("c4", DataType::Utf8, true), + ])); + + let config = config.with_schema(provided_schema.clone()); + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Test schema source after inferring schema + let format = csv::CsvFormat::default().with_schema(None); + let options = ListingOptions::new(Arc::new(format)); + let config_without_schema = + ListingTableConfig::new(table_path).with_listing_options(options); + assert_eq!(*config_without_schema.schema_source(), SchemaSource::None); + + let config_with_inferred = + config_without_schema.infer_schema(&ctx.state()).await?; + assert_eq!( + *config_with_inferred.schema_source(), + SchemaSource::Inferred + ); + + Ok(()) + } + + #[tokio::test] + async fn test_schema_source_in_listing_table() -> Result<()> { + let ctx = SessionContext::new(); + let testdata = crate::test_util::datafusion_test_data(); + let filename = format!("{testdata}/aggregate_simple.csv"); + let table_path = ListingTableUrl::parse(filename).unwrap(); + + // Create a table with specified schema + let specified_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Float32, true), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::Boolean, true), + Field::new("c4", DataType::Utf8, true), + ])); + + let format = csv::CsvFormat::default().with_schema(None); + let options = ListingOptions::new(Arc::new(format)); + + let config_specified = ListingTableConfig::new(table_path.clone()) + .with_listing_options(options.clone()) + .with_schema(specified_schema); + + let table_specified = ListingTable::try_new(config_specified)?; + assert_eq!(*table_specified.schema_source(), SchemaSource::Specified); + + // Create a table with inferred schema + let config_to_infer = + ListingTableConfig::new(table_path).with_listing_options(options); + + let config_inferred = config_to_infer.infer_schema(&ctx.state()).await?; + let table_inferred = ListingTable::try_new(config_inferred)?; + assert_eq!(*table_inferred.schema_source(), SchemaSource::Inferred); + + Ok(()) + } + + #[tokio::test] + async fn test_schema_source_preserved_through_config_operations() -> Result<()> { + let ctx = SessionContext::new(); + let testdata = crate::test_util::datafusion_test_data(); + let filename = format!("{testdata}/aggregate_simple.csv"); + let table_path = ListingTableUrl::parse(filename).unwrap(); + + // Start with a specified schema + let specified_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Float32, true), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::Boolean, true), + Field::new("c4", DataType::Utf8, true), + ])); + + let config = + ListingTableConfig::new(table_path.clone()).with_schema(specified_schema); + + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Make sure source is preserved after adding options + let format = csv::CsvFormat::default().with_schema(None); + let options = ListingOptions::new(Arc::new(format)); + let config = config.with_listing_options(options); + + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Make sure inferred schema doesn't override specified schema + let config = config.infer(&ctx.state()).await?; + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + Ok(()) + } } From 82d1bff5286ed4be5d38517fe6cc62578f68e6e7 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 6 Jun 2025 21:53:44 +0800 Subject: [PATCH 05/22] feat: Add tests for 
inferred and specified schema handling for multiple files in ListingTableConfig --- .../core/src/datasource/listing/table.rs | 252 +++++++++++++++++- 1 file changed, 247 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 4e1c4cbed0fa2..02e8114acb6a3 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1340,6 +1340,7 @@ mod tests { use datafusion_physical_plan::ExecutionPlanProperties; use crate::test::object_store::{ensure_head_concurrency, make_test_store_and_state}; + use std::io::Write; use tempfile::TempDir; use url::Url; @@ -2486,11 +2487,7 @@ mod tests { .await?; // check count - let batches = session_ctx - .sql("select * from foo") - .await? - .collect() - .await?; + let batches = session_ctx.sql("select * from t").await?.collect().await?; insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###" +-----+-----+---+ @@ -2658,4 +2655,249 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_listing_table_config_with_multiple_files_inferred() -> Result<()> { + // Test case 1: Inferred schema with multiple files having different schemas + let ctx = SessionContext::new(); + + // Create two test files with different schemas + let tmp_dir = TempDir::new()?; + let file_path1 = tmp_dir.path().join("file1.csv"); + let file_path2 = tmp_dir.path().join("file2.csv"); + + // File 1: c1,c2,c3 + let mut file1 = std::fs::File::create(&file_path1)?; + writeln!(file1, "c1,c2,c3")?; + writeln!(file1, "1,2,3")?; + writeln!(file1, "4,5,6")?; + + // File 2: c1,c2,c3,c4 + let mut file2 = std::fs::File::create(&file_path2)?; + writeln!(file2, "c1,c2,c3,c4")?; + writeln!(file2, "7,8,9,10")?; + writeln!(file2, "11,12,13,14")?; + + // Parse paths + let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; + let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + + // Create config with both paths + let config = + ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]); + assert_eq!(*config.schema_source(), SchemaSource::None); + + // Set up options + let format = csv::CsvFormat::default() + .with_schema(None) + .with_has_header(true); + let options = ListingOptions::new(Arc::new(format)); + let config = config.with_listing_options(options); + + // Infer schema (should use first file's schema) + let config = config.infer_schema(&ctx.state()).await?; + assert_eq!(*config.schema_source(), SchemaSource::Inferred); + + // Verify that the inferred schema matches the first file's schema (3 columns) + let schema = config.file_schema.unwrap(); + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(0).name(), "c1"); + assert_eq!(schema.field(1).name(), "c2"); + assert_eq!(schema.field(2).name(), "c3"); + + Ok(()) + } + + #[tokio::test] + async fn test_listing_table_config_with_multiple_files_specified_schema1( + ) -> Result<()> { + // Test case 2: Specified schema matching first file schema + let ctx = SessionContext::new(); + + // Create two test files with different schemas + let tmp_dir = TempDir::new()?; + let file_path1 = tmp_dir.path().join("file1.csv"); + let file_path2 = tmp_dir.path().join("file2.csv"); + + // File 1: c1,c2,c3 + let mut file1 = std::fs::File::create(&file_path1)?; + writeln!(file1, "c1,c2,c3")?; + writeln!(file1, "1,2,3")?; + writeln!(file1, "4,5,6")?; + + // File 2: c1,c2,c3,c4 + let mut file2 = std::fs::File::create(&file_path2)?; + 
writeln!(file2, "c1,c2,c3,c4")?; + writeln!(file2, "7,8,9,10")?; + writeln!(file2, "11,12,13,14")?; + + // Parse paths + let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; + let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + + // Create specified schema matching first file + let specified_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Utf8, true), + ])); + + // Create config with both paths and specified schema + let config = + ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]) + .with_schema(specified_schema); + + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Set up options + let format = csv::CsvFormat::default() + .with_schema(None) + .with_has_header(true); + let options = ListingOptions::new(Arc::new(format)); + let config = config.with_listing_options(options); + + // Infer should not change the schema because it's already specified + let config = config.infer_schema(&ctx.state()).await?; + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Verify that the schema is still the one we specified (3 columns) + let schema = config.file_schema.unwrap(); + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(0).name(), "c1"); + assert_eq!(schema.field(1).name(), "c2"); + assert_eq!(schema.field(2).name(), "c3"); + + // Create the ListingTable and verify it maintains the schema source + let table = ListingTable::try_new(config)?; + assert_eq!(*table.schema_source(), SchemaSource::Specified); + + Ok(()) + } + + #[tokio::test] + async fn test_listing_table_config_with_multiple_files_specified_schema2( + ) -> Result<()> { + // Test case 3: Specified schema matching second file schema + let ctx = SessionContext::new(); + + // Create two test files with different schemas + let tmp_dir = TempDir::new()?; + let file_path1 = tmp_dir.path().join("file1.csv"); + let file_path2 = tmp_dir.path().join("file2.csv"); + + // File 1: c1,c2,c3 + let mut file1 = std::fs::File::create(&file_path1)?; + writeln!(file1, "c1,c2,c3")?; + writeln!(file1, "1,2,3")?; + writeln!(file1, "4,5,6")?; + + // File 2: c1,c2,c3,c4 + let mut file2 = std::fs::File::create(&file_path2)?; + writeln!(file2, "c1,c2,c3,c4")?; + writeln!(file2, "7,8,9,10")?; + writeln!(file2, "11,12,13,14")?; + + // Parse paths + let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; + let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + + // Create specified schema matching second file (with 4 columns) + let specified_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Utf8, true), + Field::new("c4", DataType::Utf8, true), + ])); + + // Create config with both paths and specified schema + let config = + ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]) + .with_schema(specified_schema.clone()); + + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Set up options + let format = csv::CsvFormat::default() + .with_schema(None) + .with_has_header(true); + let options = ListingOptions::new(Arc::new(format)); + let config = config.with_listing_options(options); + + // Infer should not change the schema because it's already specified + let config = config.infer_schema(&ctx.state()).await?; + assert_eq!(*config.schema_source(), SchemaSource::Specified); + + // Verify 
that the schema is still the one we specified (4 columns) + let schema = config.file_schema.unwrap(); + assert_eq!(schema.fields().len(), 4); + assert_eq!(schema.field(0).name(), "c1"); + assert_eq!(schema.field(1).name(), "c2"); + assert_eq!(schema.field(2).name(), "c3"); + assert_eq!(schema.field(3).name(), "c4"); + + // Create the ListingTable and verify it maintains the schema source + let table = ListingTable::try_new(config)?; + assert_eq!(*table.schema_source(), SchemaSource::Specified); + + Ok(()) + } + + #[tokio::test] + async fn test_listing_table_config_with_multiple_files_inferred_reversed( + ) -> Result<()> { + // Test case: Inferred schema with multiple files having different schemas, + // but with the order reversed (schema2 first, then schema1) + let ctx = SessionContext::new(); + + // Create two test files with different schemas + let tmp_dir = TempDir::new()?; + let file_path1 = tmp_dir.path().join("file1.csv"); + let file_path2 = tmp_dir.path().join("file2.csv"); + + // File 1: c1,c2,c3,c4 (now schema2 with 4 columns) + let mut file1 = std::fs::File::create(&file_path1)?; + writeln!(file1, "c1,c2,c3,c4")?; + writeln!(file1, "7,8,9,10")?; + writeln!(file1, "11,12,13,14")?; + + // File 2: c1,c2,c3 (now schema1 with 3 columns) + let mut file2 = std::fs::File::create(&file_path2)?; + writeln!(file2, "c1,c2,c3")?; + writeln!(file2, "1,2,3")?; + writeln!(file2, "4,5,6")?; + + // Parse paths + let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; + let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + + // Create config with both paths, with schema2 (4-column) file first + let config = + ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]); + assert_eq!(*config.schema_source(), SchemaSource::None); + + // Set up options + let format = csv::CsvFormat::default() + .with_schema(None) + .with_has_header(true); + let options = ListingOptions::new(Arc::new(format)); + let config = config.with_listing_options(options); + + // Infer schema (should use first file's schema which is now schema2 with 4 columns) + let config = config.infer_schema(&ctx.state()).await?; + assert_eq!(*config.schema_source(), SchemaSource::Inferred); + + // Verify that the inferred schema matches the first file's schema, which now has 4 columns + let schema = config.file_schema.unwrap(); + assert_eq!(schema.fields().len(), 4); + assert_eq!(schema.field(0).name(), "c1"); + assert_eq!(schema.field(1).name(), "c2"); + assert_eq!(schema.field(2).name(), "c3"); + assert_eq!(schema.field(3).name(), "c4"); + + // Create a ListingTable and verify it maintains the schema source + let table = ListingTable::try_new(config)?; + assert_eq!(*table.schema_source(), SchemaSource::Inferred); + + Ok(()) + } } From d8d86d8e5403fedc2fd9f0320d0c9140fcf65d13 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 11:47:55 +0800 Subject: [PATCH 06/22] Fix tests --- .../core/src/datasource/listing/table.rs | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 6b375acce4c8d..fad10dfc19f87 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1333,6 +1333,7 @@ mod tests { use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; use datafusion_common::test_util::batches_to_string; + use datafusion_common::test_util::datafusion_test_data; use 
datafusion_common::{assert_contains, ScalarValue}; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::PhysicalSortExpr; @@ -2487,7 +2488,11 @@ mod tests { .await?; // check count - let batches = session_ctx.sql("select * from t").await?.collect().await?; + let batches = session_ctx + .sql("select * from foo") + .await? + .collect() + .await?; insta::allow_duplicates! {insta::assert_snapshot!(batches_to_string(&batches),@r###" +-----+-----+---+ @@ -2525,7 +2530,7 @@ mod tests { async fn infer_preserves_provided_schema() -> Result<()> { let ctx = SessionContext::new(); - let testdata = crate::test_util::datafusion_test_data(); + let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -2550,7 +2555,7 @@ mod tests { async fn test_schema_source_tracking() -> Result<()> { let ctx = SessionContext::new(); - let testdata = crate::test_util::datafusion_test_data(); + let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -2570,7 +2575,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Test schema source after inferring schema - let format = csv::CsvFormat::default().with_schema(None); + let format = CsvFormat::default(); let options = ListingOptions::new(Arc::new(format)); let config_without_schema = ListingTableConfig::new(table_path).with_listing_options(options); @@ -2589,7 +2594,7 @@ mod tests { #[tokio::test] async fn test_schema_source_in_listing_table() -> Result<()> { let ctx = SessionContext::new(); - let testdata = crate::test_util::datafusion_test_data(); + let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -2601,7 +2606,7 @@ mod tests { Field::new("c4", DataType::Utf8, true), ])); - let format = csv::CsvFormat::default().with_schema(None); + let format = CsvFormat::default(); let options = ListingOptions::new(Arc::new(format)); let config_specified = ListingTableConfig::new(table_path.clone()) @@ -2625,7 +2630,7 @@ mod tests { #[tokio::test] async fn test_schema_source_preserved_through_config_operations() -> Result<()> { let ctx = SessionContext::new(); - let testdata = crate::test_util::datafusion_test_data(); + let testdata = datafusion_test_data(); let filename = format!("{testdata}/aggregate_simple.csv"); let table_path = ListingTableUrl::parse(filename).unwrap(); @@ -2643,7 +2648,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Make sure source is preserved after adding options - let format = csv::CsvFormat::default().with_schema(None); + let format = CsvFormat::default(); let options = ListingOptions::new(Arc::new(format)); let config = config.with_listing_options(options); @@ -2688,9 +2693,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::None); // Set up options - let format = csv::CsvFormat::default() - .with_schema(None) - .with_has_header(true); + let format = CsvFormat::default().with_has_header(true); let options = ListingOptions::new(Arc::new(format)); let config = config.with_listing_options(options); @@ -2699,7 +2702,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Inferred); // Verify that the inferred schema matches the first file's schema (3 columns) - let schema = config.file_schema.unwrap(); + let schema = 
config.file_schema.as_ref().unwrap().clone(); assert_eq!(schema.fields().len(), 3); assert_eq!(schema.field(0).name(), "c1"); assert_eq!(schema.field(1).name(), "c2"); @@ -2750,9 +2753,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Set up options - let format = csv::CsvFormat::default() - .with_schema(None) - .with_has_header(true); + let format = CsvFormat::default().with_has_header(true); let options = ListingOptions::new(Arc::new(format)); let config = config.with_listing_options(options); @@ -2761,7 +2762,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Verify that the schema is still the one we specified (3 columns) - let schema = config.file_schema.unwrap(); + let schema = config.file_schema.as_ref().unwrap().clone(); assert_eq!(schema.fields().len(), 3); assert_eq!(schema.field(0).name(), "c1"); assert_eq!(schema.field(1).name(), "c2"); @@ -2817,9 +2818,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Set up options - let format = csv::CsvFormat::default() - .with_schema(None) - .with_has_header(true); + let format = CsvFormat::default().with_has_header(true); let options = ListingOptions::new(Arc::new(format)); let config = config.with_listing_options(options); @@ -2828,7 +2827,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Specified); // Verify that the schema is still the one we specified (4 columns) - let schema = config.file_schema.unwrap(); + let schema = config.file_schema.as_ref().unwrap().clone(); assert_eq!(schema.fields().len(), 4); assert_eq!(schema.field(0).name(), "c1"); assert_eq!(schema.field(1).name(), "c2"); @@ -2876,9 +2875,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::None); // Set up options - let format = csv::CsvFormat::default() - .with_schema(None) - .with_has_header(true); + let format = CsvFormat::default().with_has_header(true); let options = ListingOptions::new(Arc::new(format)); let config = config.with_listing_options(options); @@ -2887,7 +2884,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::Inferred); // Verify that the inferred schema matches the first file's schema, which now has 4 columns - let schema = config.file_schema.unwrap(); + let schema = config.file_schema.as_ref().unwrap().clone(); assert_eq!(schema.fields().len(), 4); assert_eq!(schema.field(0).name(), "c1"); assert_eq!(schema.field(1).name(), "c2"); From 7957b87123fcc7e81ca6341c803903cdd1cddfae Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 12:06:53 +0800 Subject: [PATCH 07/22] refactor: remove PartialEq from SchemaSource enum --- datafusion/core/src/datasource/listing/table.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index fad10dfc19f87..c79943b14b18f 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -59,7 +59,7 @@ use itertools::Itertools; use object_store::ObjectStore; /// Indicates the source of the schema for a [`ListingTable`] -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub enum SchemaSource { /// Schema is not yet set (initial state) None, From 47514f4bda5355752aa879710cc51eda125668d5 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 12:11:13 +0800 Subject: [PATCH 08/22] docs: add back parameter description for collect_stats in ListingTable --- 
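

Note (kept below the `---` marker, which `git am` discards): the restored doc
line describes the statistics flag that callers toggle through
`ListingOptions::with_collect_stat`, as exercised elsewhere in this series. A
minimal sketch; the parquet format choice is arbitrary and
`stats_enabled_options` is a hypothetical helper:

    use std::sync::Arc;

    use datafusion::datasource::file_format::parquet::ParquetFormat;
    use datafusion::datasource::listing::ListingOptions;

    /// Returns options that make listing gather per-file statistics;
    /// the default behavior is to skip them.
    fn stats_enabled_options() -> ListingOptions {
        ListingOptions::new(Arc::new(ParquetFormat::default()))
            .with_collect_stat(true)
    }

 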
datafusion/core/src/datasource/listing/table.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index c79943b14b18f..8ded4a9236585 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1252,6 +1252,7 @@ impl ListingTable { /// * `files` - A stream of `Result` items to process /// * `limit` - An optional row count limit. If provided, the function will stop collecting files /// once the accumulated number of rows exceeds this limit +/// * `collect_stats` - Whether to collect and accumulate statistics from the files /// /// # Returns /// A `Result` containing a `FileGroup` with the collected files From 5e3ac756c2e964769937f01cdf62ce494a51f0a3 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 12:31:46 +0800 Subject: [PATCH 09/22] test: refactor schema creation in tests to use a helper function --- .../core/src/datasource/listing/table.rs | 38 +++++++------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 8ded4a9236585..a6ca00708e878 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1346,6 +1346,16 @@ mod tests { use tempfile::TempDir; use url::Url; + /// Creates a test schema with standard field types used in tests + fn create_test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Float32, true), + Field::new("c2", DataType::Float64, true), + Field::new("c3", DataType::Boolean, true), + Field::new("c4", DataType::Utf8, true), + ])) + } + #[tokio::test] async fn read_single_file() -> Result<()> { let ctx = SessionContext::new_with_config( @@ -2535,12 +2545,7 @@ mod tests { let filename = format!("{testdata}/aggregate_simple.csv"); let table_path = ListingTableUrl::parse(filename).unwrap(); - let provided_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Float32, true), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::Boolean, true), - Field::new("c4", DataType::Utf8, true), - ])); + let provided_schema = create_test_schema(); let config = ListingTableConfig::new(table_path).with_schema(Arc::clone(&provided_schema)); @@ -2565,12 +2570,7 @@ mod tests { assert_eq!(*config.schema_source(), SchemaSource::None); // Test schema source after setting a schema explicitly - let provided_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Float32, true), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::Boolean, true), - Field::new("c4", DataType::Utf8, true), - ])); + let provided_schema = create_test_schema(); let config = config.with_schema(provided_schema.clone()); assert_eq!(*config.schema_source(), SchemaSource::Specified); @@ -2600,12 +2600,7 @@ mod tests { let table_path = ListingTableUrl::parse(filename).unwrap(); // Create a table with specified schema - let specified_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Float32, true), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::Boolean, true), - Field::new("c4", DataType::Utf8, true), - ])); + let specified_schema = create_test_schema(); let format = CsvFormat::default(); let options = ListingOptions::new(Arc::new(format)); @@ -2636,12 +2631,7 @@ mod tests { let table_path = ListingTableUrl::parse(filename).unwrap(); // Start with 
a specified schema
-        let specified_schema = Arc::new(Schema::new(vec![
-            Field::new("c1", DataType::Float32, true),
-            Field::new("c2", DataType::Float64, true),
-            Field::new("c3", DataType::Boolean, true),
-            Field::new("c4", DataType::Utf8, true),
-        ]));
+        let specified_schema = create_test_schema();
 
         let config =
             ListingTableConfig::new(table_path.clone()).with_schema(specified_schema);

From dd607e49e2b336a13c5bf19d814bbff57d5b3420 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn
Date: Sat, 7 Jun 2025 13:04:29 +0800
Subject: [PATCH 10/22] test: add comprehensive schema source tracking tests
 and replace overlapping tests

---
 .../core/src/datasource/listing/table.rs      | 1057 ++++++-----------
 1 file changed, 378 insertions(+), 679 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index a6ca00708e878..0df66bf73e038 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1253,6 +1253,7 @@ impl ListingTable {
 /// * `limit` - An optional row count limit. If provided, the function will stop collecting files
 ///   once the accumulated number of rows exceeds this limit
 /// * `collect_stats` - Whether to collect and accumulate statistics from the files
+///
 /// # Returns
 /// A `Result` containing a `FileGroup` with the collected files
@@ -1356,6 +1357,61 @@ mod tests {
         ]))
     }
 
+    #[tokio::test]
+    async fn test_schema_source_tracking_comprehensive() -> Result<()> {
+        let ctx = SessionContext::new();
+        let testdata = datafusion_test_data();
+        let filename = format!("{testdata}/aggregate_simple.csv");
+        let table_path = ListingTableUrl::parse(filename).unwrap();
+
+        // Test default schema source
+        let config = ListingTableConfig::new(table_path.clone());
+        assert_eq!(*config.schema_source(), SchemaSource::None);
+
+        // Test schema source after setting a schema explicitly
+        let provided_schema = create_test_schema();
+        let config_with_schema = config.clone().with_schema(provided_schema.clone());
+        assert_eq!(*config_with_schema.schema_source(), SchemaSource::Specified);
+
+        // Test schema source after inferring schema
+        let format = CsvFormat::default();
+        let options = ListingOptions::new(Arc::new(format));
+        let config_with_options = config.with_listing_options(options.clone());
+        assert_eq!(*config_with_options.schema_source(), SchemaSource::None);
+
+        let config_with_inferred = config_with_options.infer_schema(&ctx.state()).await?;
+        assert_eq!(
+            *config_with_inferred.schema_source(),
+            SchemaSource::Inferred
+        );
+
+        // Test schema preservation through operations
+        let config_with_schema_and_options = config_with_schema
+            .clone()
+            .with_listing_options(options.clone());
+        assert_eq!(
+            *config_with_schema_and_options.schema_source(),
+            SchemaSource::Specified
+        );
+
+        // Make sure inferred schema doesn't override specified schema
+        let config_with_schema_and_infer =
+            config_with_schema_and_options.clone().infer(&ctx.state()).await?;
+        assert_eq!(
+            *config_with_schema_and_infer.schema_source(),
+            SchemaSource::Specified
+        );
+
+        // Verify sources in actual ListingTable objects
+        let table_specified = ListingTable::try_new(config_with_schema_and_options)?;
+        assert_eq!(*table_specified.schema_source(), SchemaSource::Specified);
+
+        let table_inferred = ListingTable::try_new(config_with_inferred)?;
+        assert_eq!(*table_inferred.schema_source(), SchemaSource::Inferred);
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn read_single_file() -> Result<()> {
         let ctx = 
SessionContext::new_with_config( @@ -1385,84 +1441,8 @@ mod tests { Ok(()) } - #[cfg(feature = "parquet")] - #[tokio::test] - async fn do_not_load_table_stats_by_default() -> Result<()> { - use crate::datasource::file_format::parquet::ParquetFormat; - - let testdata = crate::test_util::parquet_test_data(); - let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); - let table_path = ListingTableUrl::parse(filename).unwrap(); - - let ctx = SessionContext::new(); - let state = ctx.state(); - - let opt = ListingOptions::new(Arc::new(ParquetFormat::default())); - let schema = opt.infer_schema(&state, &table_path).await?; - let config = ListingTableConfig::new(table_path.clone()) - .with_listing_options(opt) - .with_schema(schema); - let table = ListingTable::try_new(config)?; - - let exec = table.scan(&state, None, &[], None).await?; - assert_eq!(exec.partition_statistics(None)?.num_rows, Precision::Absent); - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - assert_eq!( - exec.partition_statistics(None)?.total_byte_size, - Precision::Absent - ); - - let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) - .with_collect_stat(true); - let schema = opt.infer_schema(&state, &table_path).await?; - let config = ListingTableConfig::new(table_path) - .with_listing_options(opt) - .with_schema(schema); - let table = ListingTable::try_new(config)?; - - let exec = table.scan(&state, None, &[], None).await?; - assert_eq!( - exec.partition_statistics(None)?.num_rows, - Precision::Exact(8) - ); - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 - assert_eq!( - exec.partition_statistics(None)?.total_byte_size, - Precision::Exact(671) - ); - - Ok(()) - } - - #[cfg(feature = "parquet")] - #[tokio::test] - async fn load_table_stats_when_no_stats() -> Result<()> { - use crate::datasource::file_format::parquet::ParquetFormat; - - let testdata = crate::test_util::parquet_test_data(); - let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); - let table_path = ListingTableUrl::parse(filename).unwrap(); - - let ctx = SessionContext::new(); - let state = ctx.state(); - - let opt = ListingOptions::new(Arc::new(ParquetFormat::default())) - .with_collect_stat(false); - let schema = opt.infer_schema(&state, &table_path).await?; - let config = ListingTableConfig::new(table_path) - .with_listing_options(opt) - .with_schema(schema); - let table = ListingTable::try_new(config)?; - - let exec = table.scan(&state, None, &[], None).await?; - assert_eq!(exec.partition_statistics(None)?.num_rows, Precision::Absent); - assert_eq!( - exec.partition_statistics(None)?.total_byte_size, - Precision::Absent - ); - - Ok(()) - } + // do_not_load_table_stats_by_default and load_table_stats_when_no_stats + // have been replaced by test_table_stats_behaviors #[cfg(feature = "parquet")] #[tokio::test] @@ -1598,263 +1578,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_assert_list_files_for_scan_grouping() -> Result<()> { - // more expected partitions than files - assert_list_files_for_scan_grouping( - &[ - "bucket/key-prefix/file0", - "bucket/key-prefix/file1", - "bucket/key-prefix/file2", - "bucket/key-prefix/file3", - "bucket/key-prefix/file4", - ], - "test:///bucket/key-prefix/", - 12, - 5, - Some(""), - ) - .await?; - - // as many expected partitions as files - assert_list_files_for_scan_grouping( - &[ - "bucket/key-prefix/file0", - "bucket/key-prefix/file1", - "bucket/key-prefix/file2", - "bucket/key-prefix/file3", - ], - 
"test:///bucket/key-prefix/", - 4, - 4, - Some(""), - ) - .await?; - - // more files as expected partitions - assert_list_files_for_scan_grouping( - &[ - "bucket/key-prefix/file0", - "bucket/key-prefix/file1", - "bucket/key-prefix/file2", - "bucket/key-prefix/file3", - "bucket/key-prefix/file4", - ], - "test:///bucket/key-prefix/", - 2, - 2, - Some(""), - ) - .await?; - - // no files => no groups - assert_list_files_for_scan_grouping( - &[], - "test:///bucket/key-prefix/", - 2, - 0, - Some(""), - ) - .await?; - - // files that don't match the prefix - assert_list_files_for_scan_grouping( - &[ - "bucket/key-prefix/file0", - "bucket/key-prefix/file1", - "bucket/other-prefix/roguefile", - ], - "test:///bucket/key-prefix/", - 10, - 2, - Some(""), - ) - .await?; - - // files that don't match the prefix or the default file extention - assert_list_files_for_scan_grouping( - &[ - "bucket/key-prefix/file0.json", - "bucket/key-prefix/file1.parquet", - "bucket/other-prefix/roguefile.json", - ], - "test:///bucket/key-prefix/", - 10, - 1, - None, - ) - .await?; - Ok(()) - } - - #[tokio::test] - async fn test_assert_list_files_for_multi_path() -> Result<()> { - // more expected partitions than files - assert_list_files_for_multi_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - "bucket/key3/file5", - ], - &["test:///bucket/key1/", "test:///bucket/key2/"], - 12, - 5, - Some(""), - ) - .await?; - - // as many expected partitions as files - assert_list_files_for_multi_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - "bucket/key3/file5", - ], - &["test:///bucket/key1/", "test:///bucket/key2/"], - 5, - 5, - Some(""), - ) - .await?; - - // more files as expected partitions - assert_list_files_for_multi_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - "bucket/key3/file5", - ], - &["test:///bucket/key1/"], - 2, - 2, - Some(""), - ) - .await?; - - // no files => no groups - assert_list_files_for_multi_paths(&[], &["test:///bucket/key1/"], 2, 0, Some("")) - .await?; - - // files that don't match the prefix - assert_list_files_for_multi_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - "bucket/key3/file5", - ], - &["test:///bucket/key3/"], - 2, - 1, - Some(""), - ) - .await?; - - // files that don't match the prefix or the default file ext - assert_list_files_for_multi_paths( - &[ - "bucket/key1/file0.json", - "bucket/key1/file1.csv", - "bucket/key1/file2.json", - "bucket/key2/file3.csv", - "bucket/key2/file4.json", - "bucket/key3/file5.csv", - ], - &["test:///bucket/key1/", "test:///bucket/key3/"], - 2, - 2, - None, - ) - .await?; - Ok(()) - } - - #[tokio::test] - async fn test_assert_list_files_for_exact_paths() -> Result<()> { - // more expected partitions than files - assert_list_files_for_exact_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - ], - 12, - 5, - Some(""), - ) - .await?; - - // more files than meta_fetch_concurrency (32) - let files: Vec = - (0..64).map(|i| format!("bucket/key1/file{i}")).collect(); - // Collect references to each string - let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect(); - assert_list_files_for_exact_paths(file_refs.as_slice(), 5, 5, Some("")).await?; - - // as many 
expected partitions as files - assert_list_files_for_exact_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - ], - 5, - 5, - Some(""), - ) - .await?; - - // more files as expected partitions - assert_list_files_for_exact_paths( - &[ - "bucket/key1/file0", - "bucket/key1/file1", - "bucket/key1/file2", - "bucket/key2/file3", - "bucket/key2/file4", - ], - 2, - 2, - Some(""), - ) - .await?; - - // no files => no groups - assert_list_files_for_exact_paths(&[], 2, 0, Some("")).await?; - - // files that don't match the default file ext - assert_list_files_for_exact_paths( - &[ - "bucket/key1/file0.json", - "bucket/key1/file1.csv", - "bucket/key1/file2.json", - "bucket/key2/file3.csv", - "bucket/key2/file4.json", - "bucket/key3/file5.csv", - ], - 2, - 2, - None, - ) - .await?; - Ok(()) - } - async fn load_table( ctx: &SessionContext, name: &str, @@ -1957,10 +1680,10 @@ mod tests { .execution .meta_fetch_concurrency; let expected_concurrency = files.len().min(meta_fetch_concurrency); - let head_blocking_store = ensure_head_concurrency(store, expected_concurrency); + let head_concurrency_store = ensure_head_concurrency(store, expected_concurrency); let url = Url::parse("test://").unwrap(); - ctx.register_object_store(&url, head_blocking_store.clone()); + ctx.register_object_store(&url, head_concurrency_store.clone()); let format = JsonFormat::default(); @@ -1987,79 +1710,11 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_insert_into_append_new_json_files() -> Result<()> { - let mut config_map: HashMap = HashMap::new(); - config_map.insert("datafusion.execution.batch_size".into(), "10".into()); - config_map.insert( - "datafusion.execution.soft_max_rows_per_output_file".into(), - "10".into(), - ); - helper_test_append_new_files_to_table( - JsonFormat::default().get_ext(), - FileCompressionType::UNCOMPRESSED, - Some(config_map), - 2, - ) - .await?; - Ok(()) - } - - #[tokio::test] - async fn test_insert_into_append_new_csv_files() -> Result<()> { - let mut config_map: HashMap = HashMap::new(); - config_map.insert("datafusion.execution.batch_size".into(), "10".into()); - config_map.insert( - "datafusion.execution.soft_max_rows_per_output_file".into(), - "10".into(), - ); - helper_test_append_new_files_to_table( - CsvFormat::default().get_ext(), - FileCompressionType::UNCOMPRESSED, - Some(config_map), - 2, - ) - .await?; - Ok(()) - } - - #[cfg(feature = "parquet")] - #[tokio::test] - async fn test_insert_into_append_2_new_parquet_files_defaults() -> Result<()> { - let mut config_map: HashMap = HashMap::new(); - config_map.insert("datafusion.execution.batch_size".into(), "10".into()); - config_map.insert( - "datafusion.execution.soft_max_rows_per_output_file".into(), - "10".into(), - ); - helper_test_append_new_files_to_table( - ParquetFormat::default().get_ext(), - FileCompressionType::UNCOMPRESSED, - Some(config_map), - 2, - ) - .await?; - Ok(()) - } - - #[cfg(feature = "parquet")] - #[tokio::test] - async fn test_insert_into_append_1_new_parquet_files_defaults() -> Result<()> { - let mut config_map: HashMap = HashMap::new(); - config_map.insert("datafusion.execution.batch_size".into(), "20".into()); - config_map.insert( - "datafusion.execution.soft_max_rows_per_output_file".into(), - "20".into(), - ); - helper_test_append_new_files_to_table( - ParquetFormat::default().get_ext(), - FileCompressionType::UNCOMPRESSED, - Some(config_map), - 1, - ) - .await?; - Ok(()) - } + // The following tests have been replaced by 
test_insert_into_parameterized: + // - test_insert_into_append_new_json_files + // - test_insert_into_append_new_csv_files + // - test_insert_into_append_2_new_parquet_files_defaults + // - test_insert_into_append_1_new_parquet_files_defaults #[tokio::test] async fn test_insert_into_sql_csv_defaults() -> Result<()> { @@ -2400,7 +2055,7 @@ mod tests { // Read the records in the table let batches = session_ctx - .sql("select count(*) as count from t") + .sql("select count(*) as count from foo") .await? .collect() .await?; @@ -2557,107 +2212,13 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_schema_source_tracking() -> Result<()> { - let ctx = SessionContext::new(); - - let testdata = datafusion_test_data(); - let filename = format!("{testdata}/aggregate_simple.csv"); - let table_path = ListingTableUrl::parse(filename).unwrap(); - - // Test default schema source - let config = ListingTableConfig::new(table_path.clone()); - assert_eq!(*config.schema_source(), SchemaSource::None); - - // Test schema source after setting a schema explicitly - let provided_schema = create_test_schema(); - - let config = config.with_schema(provided_schema.clone()); - assert_eq!(*config.schema_source(), SchemaSource::Specified); - - // Test schema source after inferring schema - let format = CsvFormat::default(); - let options = ListingOptions::new(Arc::new(format)); - let config_without_schema = - ListingTableConfig::new(table_path).with_listing_options(options); - assert_eq!(*config_without_schema.schema_source(), SchemaSource::None); - - let config_with_inferred = - config_without_schema.infer_schema(&ctx.state()).await?; - assert_eq!( - *config_with_inferred.schema_source(), - SchemaSource::Inferred - ); - - Ok(()) - } - - #[tokio::test] - async fn test_schema_source_in_listing_table() -> Result<()> { - let ctx = SessionContext::new(); - let testdata = datafusion_test_data(); - let filename = format!("{testdata}/aggregate_simple.csv"); - let table_path = ListingTableUrl::parse(filename).unwrap(); - - // Create a table with specified schema - let specified_schema = create_test_schema(); - - let format = CsvFormat::default(); - let options = ListingOptions::new(Arc::new(format)); - - let config_specified = ListingTableConfig::new(table_path.clone()) - .with_listing_options(options.clone()) - .with_schema(specified_schema); - - let table_specified = ListingTable::try_new(config_specified)?; - assert_eq!(*table_specified.schema_source(), SchemaSource::Specified); - - // Create a table with inferred schema - let config_to_infer = - ListingTableConfig::new(table_path).with_listing_options(options); - - let config_inferred = config_to_infer.infer_schema(&ctx.state()).await?; - let table_inferred = ListingTable::try_new(config_inferred)?; - assert_eq!(*table_inferred.schema_source(), SchemaSource::Inferred); - - Ok(()) - } + // test_schema_source_tracking was replaced by test_schema_source_tracking_comprehensive #[tokio::test] - async fn test_schema_source_preserved_through_config_operations() -> Result<()> { + async fn test_listing_table_config_with_multiple_files_comprehensive() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion_test_data(); - let filename = format!("{testdata}/aggregate_simple.csv"); - let table_path = ListingTableUrl::parse(filename).unwrap(); - - // Start with a specified schema - let specified_schema = create_test_schema(); - let config = - ListingTableConfig::new(table_path.clone()).with_schema(specified_schema); - - assert_eq!(*config.schema_source(), 
SchemaSource::Specified); - - // Make sure source is preserved after adding options - let format = CsvFormat::default(); - let options = ListingOptions::new(Arc::new(format)); - let config = config.with_listing_options(options); - - assert_eq!(*config.schema_source(), SchemaSource::Specified); - - // Make sure inferred schema doesn't override specified schema - let config = config.infer(&ctx.state()).await?; - assert_eq!(*config.schema_source(), SchemaSource::Specified); - - Ok(()) - } - - #[tokio::test] - async fn test_listing_table_config_with_multiple_files_inferred() -> Result<()> { - // Test case 1: Inferred schema with multiple files having different schemas - let ctx = SessionContext::new(); - - // Create two test files with different schemas + // Create test files with different schemas let tmp_dir = TempDir::new()?; let file_path1 = tmp_dir.path().join("file1.csv"); let file_path2 = tmp_dir.path().join("file2.csv"); @@ -2678,213 +2239,351 @@ mod tests { let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; - // Create config with both paths - let config = - ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]); - assert_eq!(*config.schema_source(), SchemaSource::None); - - // Set up options + // Create format and options let format = CsvFormat::default().with_has_header(true); let options = ListingOptions::new(Arc::new(format)); - let config = config.with_listing_options(options); - - // Infer schema (should use first file's schema) - let config = config.infer_schema(&ctx.state()).await?; - assert_eq!(*config.schema_source(), SchemaSource::Inferred); - - // Verify that the inferred schema matches the first file's schema (3 columns) - let schema = config.file_schema.as_ref().unwrap().clone(); - assert_eq!(schema.fields().len(), 3); - assert_eq!(schema.field(0).name(), "c1"); - assert_eq!(schema.field(1).name(), "c2"); - assert_eq!(schema.field(2).name(), "c3"); - - Ok(()) - } - - #[tokio::test] - async fn test_listing_table_config_with_multiple_files_specified_schema1( - ) -> Result<()> { - // Test case 2: Specified schema matching first file schema - let ctx = SessionContext::new(); - - // Create two test files with different schemas - let tmp_dir = TempDir::new()?; - let file_path1 = tmp_dir.path().join("file1.csv"); - let file_path2 = tmp_dir.path().join("file2.csv"); - - // File 1: c1,c2,c3 - let mut file1 = std::fs::File::create(&file_path1)?; - writeln!(file1, "c1,c2,c3")?; - writeln!(file1, "1,2,3")?; - writeln!(file1, "4,5,6")?; - // File 2: c1,c2,c3,c4 - let mut file2 = std::fs::File::create(&file_path2)?; - writeln!(file2, "c1,c2,c3,c4")?; - writeln!(file2, "7,8,9,10")?; - writeln!(file2, "11,12,13,14")?; + // Test case 1: Infer schema using first file's schema + let config1 = ListingTableConfig::new_with_multi_paths(vec![ + table_path1.clone(), + table_path2.clone(), + ]) + .with_listing_options(options.clone()); + let config1 = config1.infer_schema(&ctx.state()).await?; + assert_eq!(*config1.schema_source(), SchemaSource::Inferred); + + // Verify schema matches first file + let schema1 = config1.file_schema.as_ref().unwrap().clone(); + assert_eq!(schema1.fields().len(), 3); + assert_eq!(schema1.field(0).name(), "c1"); + assert_eq!(schema1.field(1).name(), "c2"); + assert_eq!(schema1.field(2).name(), "c3"); + + // Test case 2: Use specified schema with 3 columns + let schema_3cols = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Utf8, true), + 
Field::new("c2", DataType::Utf8, true), + Field::new("c3", DataType::Utf8, true), + ])); - // Parse paths - let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; - let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + let config2 = ListingTableConfig::new_with_multi_paths(vec![ + table_path1.clone(), + table_path2.clone(), + ]) + .with_schema(schema_3cols) + .with_listing_options(options.clone()); + let config2 = config2.infer_schema(&ctx.state()).await?; + assert_eq!(*config2.schema_source(), SchemaSource::Specified); - // Create specified schema matching first file - let specified_schema = Arc::new(Schema::new(vec![ + // Verify that the schema is still the one we specified (3 columns) + let schema2 = config2.file_schema.as_ref().unwrap().clone(); + assert_eq!(schema2.fields().len(), 3); + assert_eq!(schema2.field(0).name(), "c1"); + assert_eq!(schema2.field(1).name(), "c2"); + assert_eq!(schema2.field(2).name(), "c3"); + + // Test case 3: Use specified schema with 4 columns + let schema_4cols = Arc::new(Schema::new(vec![ Field::new("c1", DataType::Utf8, true), Field::new("c2", DataType::Utf8, true), Field::new("c3", DataType::Utf8, true), + Field::new("c4", DataType::Utf8, true), ])); - // Create config with both paths and specified schema - let config = - ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]) - .with_schema(specified_schema); - - assert_eq!(*config.schema_source(), SchemaSource::Specified); + let config3 = ListingTableConfig::new_with_multi_paths(vec![ + table_path1.clone(), + table_path2.clone(), + ]) + .with_schema(schema_4cols) + .with_listing_options(options.clone()); + let config3 = config3.infer_schema(&ctx.state()).await?; + assert_eq!(*config3.schema_source(), SchemaSource::Specified); - // Set up options - let format = CsvFormat::default().with_has_header(true); - let options = ListingOptions::new(Arc::new(format)); - let config = config.with_listing_options(options); + // Verify that the schema is still the one we specified (4 columns) + let schema3 = config3.file_schema.as_ref().unwrap().clone(); + assert_eq!(schema3.fields().len(), 4); + assert_eq!(schema3.field(0).name(), "c1"); + assert_eq!(schema3.field(1).name(), "c2"); + assert_eq!(schema3.field(2).name(), "c3"); + assert_eq!(schema3.field(3).name(), "c4"); + + // Test case 4: Verify order matters when inferring schema + let config4 = ListingTableConfig::new_with_multi_paths(vec![ + table_path2.clone(), + table_path1.clone(), + ]) + .with_listing_options(options); + let config4 = config4.infer_schema(&ctx.state()).await?; + + // Should use first file's schema, which now has 4 columns + let schema4 = config4.file_schema.as_ref().unwrap().clone(); + assert_eq!(schema4.fields().len(), 4); + assert_eq!(schema4.field(0).name(), "c1"); + assert_eq!(schema4.field(1).name(), "c2"); + assert_eq!(schema4.field(2).name(), "c3"); + assert_eq!(schema4.field(3).name(), "c4"); - // Infer should not change the schema because it's already specified - let config = config.infer_schema(&ctx.state()).await?; - assert_eq!(*config.schema_source(), SchemaSource::Specified); + Ok(()) + } - // Verify that the schema is still the one we specified (3 columns) - let schema = config.file_schema.as_ref().unwrap().clone(); - assert_eq!(schema.fields().len(), 3); - assert_eq!(schema.field(0).name(), "c1"); - assert_eq!(schema.field(1).name(), "c2"); - assert_eq!(schema.field(2).name(), "c3"); + #[tokio::test] + async fn test_list_files_configurations() -> Result<()> { + // 
Define common test cases as (description, files, paths, target_partitions, expected_partitions, file_ext)
+        let test_cases = vec![
+            // Single path cases
+            (
+                "Single path, more partitions than files",
+                vec![
+                    "bucket/key-prefix/file0",
+                    "bucket/key-prefix/file1",
+                    "bucket/key-prefix/file2",
+                    "bucket/key-prefix/file3",
+                    "bucket/key-prefix/file4",
+                ],
+                vec!["test:///bucket/key-prefix/"],
+                12,
+                5,
+                Some(""),
+            ),
+            (
+                "Single path, equal partitions and files",
+                vec![
+                    "bucket/key-prefix/file0",
+                    "bucket/key-prefix/file1",
+                    "bucket/key-prefix/file2",
+                    "bucket/key-prefix/file3",
+                ],
+                vec!["test:///bucket/key-prefix/"],
+                4,
+                4,
+                Some(""),
+            ),
+            (
+                "Single path, more files than partitions",
+                vec![
+                    "bucket/key-prefix/file0",
+                    "bucket/key-prefix/file1",
+                    "bucket/key-prefix/file2",
+                    "bucket/key-prefix/file3",
+                    "bucket/key-prefix/file4",
+                ],
+                vec!["test:///bucket/key-prefix/"],
+                2,
+                2,
+                Some(""),
+            ),
+            // Multi path cases
+            (
+                "Multi path, more partitions than files",
+                vec![
+                    "bucket/key1/file0",
+                    "bucket/key1/file1",
+                    "bucket/key1/file2",
+                    "bucket/key2/file3",
+                    "bucket/key2/file4",
+                    "bucket/key3/file5",
+                ],
+                vec!["test:///bucket/key1/", "test:///bucket/key2/"],
+                12,
+                5,
+                Some(""),
+            ),
+            // No files case
+            (
+                "No files",
+                vec![],
+                vec!["test:///bucket/key-prefix/"],
+                2,
+                0,
+                Some(""),
+            ),
+            // Exact path cases
+            (
+                "Exact paths test",
+                vec![
+                    "bucket/key1/file0",
+                    "bucket/key1/file1",
+                    "bucket/key1/file2",
+                    "bucket/key2/file3",
+                    "bucket/key2/file4",
+                ],
+                vec![
+                    "test:///bucket/key1/file0",
+                    "test:///bucket/key1/file1",
+                    "test:///bucket/key1/file2",
+                    "test:///bucket/key2/file3",
+                    "test:///bucket/key2/file4",
+                ],
+                12,
+                5,
+                Some(""),
+            ),
+        ];

-        // Create the ListingTable and verify it maintains the schema source
-        let table = ListingTable::try_new(config)?;
-        assert_eq!(*table.schema_source(), SchemaSource::Specified);
+        // Run each test case
+        for (test_name, files, paths, target_partitions, expected_partitions, file_ext) in
+            test_cases
+        {
+            println!("Running test: {}", test_name);
+
+            if files.is_empty() {
+                // Test empty files case
+                assert_list_files_for_multi_paths(
+                    &[],
+                    &paths.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else if paths.len() == 1 {
+                // Test using single path API
+                assert_list_files_for_scan_grouping(
+                    &files.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                    paths[0],
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else if paths[0].contains("test:///bucket/key") {
+                // Test using multi path API
+                assert_list_files_for_multi_paths(
+                    &files.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                    &paths.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            } else {
+                // Test using exact path API for specific cases
+                let file_strs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
+                assert_list_files_for_exact_paths(
+                    &file_strs,
+                    target_partitions,
+                    expected_partitions,
+                    file_ext,
+                )
+                .await?;
+            }
+        }

         Ok(())
     }

+    #[cfg(feature = "parquet")]
     #[tokio::test]
-    async fn test_listing_table_config_with_multiple_files_specified_schema2(
-    ) -> Result<()> {
-        // Test case 3: Specified schema matching second file schema
-        let ctx = SessionContext::new();
-
-        // Create two test files with different schemas
-        let tmp_dir = TempDir::new()?;
-        let file_path1 = tmp_dir.path().join("file1.csv");
-        let file_path2 = tmp_dir.path().join("file2.csv");
-
-        // File 1: 
c1,c2,c3 - let mut file1 = std::fs::File::create(&file_path1)?; - writeln!(file1, "c1,c2,c3")?; - writeln!(file1, "1,2,3")?; - writeln!(file1, "4,5,6")?; + async fn test_table_stats_behaviors() -> Result<()> { + use crate::datasource::file_format::parquet::ParquetFormat; - // File 2: c1,c2,c3,c4 - let mut file2 = std::fs::File::create(&file_path2)?; - writeln!(file2, "c1,c2,c3,c4")?; - writeln!(file2, "7,8,9,10")?; - writeln!(file2, "11,12,13,14")?; + let testdata = crate::test_util::parquet_test_data(); + let filename = format!("{}/{}", testdata, "alltypes_plain.parquet"); + let table_path = ListingTableUrl::parse(filename).unwrap(); - // Parse paths - let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; - let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; + let ctx = SessionContext::new(); + let state = ctx.state(); - // Create specified schema matching second file (with 4 columns) - let specified_schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::Utf8, true), - Field::new("c3", DataType::Utf8, true), - Field::new("c4", DataType::Utf8, true), - ])); + // Test 1: Default behavior - stats not collected + let opt_default = ListingOptions::new(Arc::new(ParquetFormat::default())); + let schema_default = opt_default.infer_schema(&state, &table_path).await?; + let config_default = ListingTableConfig::new(table_path.clone()) + .with_listing_options(opt_default) + .with_schema(schema_default); + let table_default = ListingTable::try_new(config_default)?; - // Create config with both paths and specified schema - let config = - ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]) - .with_schema(specified_schema.clone()); + let exec_default = table_default.scan(&state, None, &[], None).await?; + assert_eq!( + exec_default.partition_statistics(None)?.num_rows, + Precision::Absent + ); + assert_eq!( + exec_default.partition_statistics(None)?.total_byte_size, + Precision::Absent + ); - assert_eq!(*config.schema_source(), SchemaSource::Specified); + // Test 2: Explicitly disable stats + let opt_disabled = ListingOptions::new(Arc::new(ParquetFormat::default())) + .with_collect_stat(false); + let schema_disabled = opt_disabled.infer_schema(&state, &table_path).await?; + let config_disabled = ListingTableConfig::new(table_path.clone()) + .with_listing_options(opt_disabled) + .with_schema(schema_disabled); + let table_disabled = ListingTable::try_new(config_disabled)?; - // Set up options - let format = CsvFormat::default().with_has_header(true); - let options = ListingOptions::new(Arc::new(format)); - let config = config.with_listing_options(options); + let exec_disabled = table_disabled.scan(&state, None, &[], None).await?; + assert_eq!( + exec_disabled.partition_statistics(None)?.num_rows, + Precision::Absent + ); + assert_eq!( + exec_disabled.partition_statistics(None)?.total_byte_size, + Precision::Absent + ); - // Infer should not change the schema because it's already specified - let config = config.infer_schema(&ctx.state()).await?; - assert_eq!(*config.schema_source(), SchemaSource::Specified); + // Test 3: Explicitly enable stats + let opt_enabled = ListingOptions::new(Arc::new(ParquetFormat::default())) + .with_collect_stat(true); + let schema_enabled = opt_enabled.infer_schema(&state, &table_path).await?; + let config_enabled = ListingTableConfig::new(table_path) + .with_listing_options(opt_enabled) + .with_schema(schema_enabled); + let table_enabled = 
ListingTable::try_new(config_enabled)?; - // Verify that the schema is still the one we specified (4 columns) - let schema = config.file_schema.as_ref().unwrap().clone(); - assert_eq!(schema.fields().len(), 4); - assert_eq!(schema.field(0).name(), "c1"); - assert_eq!(schema.field(1).name(), "c2"); - assert_eq!(schema.field(2).name(), "c3"); - assert_eq!(schema.field(3).name(), "c4"); - - // Create the ListingTable and verify it maintains the schema source - let table = ListingTable::try_new(config)?; - assert_eq!(*table.schema_source(), SchemaSource::Specified); + let exec_enabled = table_enabled.scan(&state, None, &[], None).await?; + assert_eq!( + exec_enabled.partition_statistics(None)?.num_rows, + Precision::Exact(8) + ); + assert_eq!( + exec_enabled.partition_statistics(None)?.total_byte_size, + Precision::Exact(671) + ); Ok(()) } #[tokio::test] - async fn test_listing_table_config_with_multiple_files_inferred_reversed( - ) -> Result<()> { - // Test case: Inferred schema with multiple files having different schemas, - // but with the order reversed (schema2 first, then schema1) - let ctx = SessionContext::new(); - - // Create two test files with different schemas - let tmp_dir = TempDir::new()?; - let file_path1 = tmp_dir.path().join("file1.csv"); - let file_path2 = tmp_dir.path().join("file2.csv"); - - // File 1: c1,c2,c3,c4 (now schema2 with 4 columns) - let mut file1 = std::fs::File::create(&file_path1)?; - writeln!(file1, "c1,c2,c3,c4")?; - writeln!(file1, "7,8,9,10")?; - writeln!(file1, "11,12,13,14")?; - - // File 2: c1,c2,c3 (now schema1 with 3 columns) - let mut file2 = std::fs::File::create(&file_path2)?; - writeln!(file2, "c1,c2,c3")?; - writeln!(file2, "1,2,3")?; - writeln!(file2, "4,5,6")?; - - // Parse paths - let table_path1 = ListingTableUrl::parse(file_path1.to_str().unwrap())?; - let table_path2 = ListingTableUrl::parse(file_path2.to_str().unwrap())?; - - // Create config with both paths, with schema2 (4-column) file first - let config = - ListingTableConfig::new_with_multi_paths(vec![table_path1, table_path2]); - assert_eq!(*config.schema_source(), SchemaSource::None); + async fn test_insert_into_parameterized() -> Result<()> { + let test_cases = vec![ + // (file_format, batch_size, soft_max_rows, expected_files) + ("json", 10, 10, 2), + ("csv", 10, 10, 2), + #[cfg(feature = "parquet")] + ("parquet", 10, 10, 2), + #[cfg(feature = "parquet")] + ("parquet", 20, 20, 1), + ]; - // Set up options - let format = CsvFormat::default().with_has_header(true); - let options = ListingOptions::new(Arc::new(format)); - let config = config.with_listing_options(options); + for (format, batch_size, soft_max_rows, expected_files) in test_cases { + println!("Testing insert with format: {format}, batch_size: {batch_size}, expected files: {expected_files}"); - // Infer schema (should use first file's schema which is now schema2 with 4 columns) - let config = config.infer_schema(&ctx.state()).await?; - assert_eq!(*config.schema_source(), SchemaSource::Inferred); + let mut config_map = HashMap::new(); + config_map.insert( + "datafusion.execution.batch_size".into(), + batch_size.to_string(), + ); + config_map.insert( + "datafusion.execution.soft_max_rows_per_output_file".into(), + soft_max_rows.to_string(), + ); - // Verify that the inferred schema matches the first file's schema, which now has 4 columns - let schema = config.file_schema.as_ref().unwrap().clone(); - assert_eq!(schema.fields().len(), 4); - assert_eq!(schema.field(0).name(), "c1"); - assert_eq!(schema.field(1).name(), "c2"); 
- assert_eq!(schema.field(2).name(), "c3"); - assert_eq!(schema.field(3).name(), "c4"); + let file_extension = match format { + "json" => JsonFormat::default().get_ext(), + "csv" => CsvFormat::default().get_ext(), + #[cfg(feature = "parquet")] + "parquet" => ParquetFormat::default().get_ext(), + _ => unreachable!("Unsupported format"), + }; - // Create a ListingTable and verify it maintains the schema source - let table = ListingTable::try_new(config)?; - assert_eq!(*table.schema_source(), SchemaSource::Inferred); + helper_test_append_new_files_to_table( + file_extension, + FileCompressionType::UNCOMPRESSED, + Some(config_map), + expected_files, + ) + .await?; + } Ok(()) } From 29fe6e9a041eb60e7a3f72dee07957647225f58d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 13:18:09 +0800 Subject: [PATCH 11/22] refactor: remove outdated comments and consolidate test cases in ListingTable --- datafusion/core/src/datasource/listing/table.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 0df66bf73e038..138a7c57531ad 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1253,7 +1253,6 @@ impl ListingTable { /// * `limit` - An optional row count limit. If provided, the function will stop collecting files /// once the accumulated number of rows exceeds this limit /// * `collect_stats` - Whether to collect and accumulate statistics from the files - /// /// # Returns /// A `Result` containing a `FileGroup` with the collected files @@ -1441,9 +1440,6 @@ mod tests { Ok(()) } - // do_not_load_table_stats_by_default and load_table_stats_when_no_stats - // have been replaced by test_table_stats_behaviors - #[cfg(feature = "parquet")] #[tokio::test] async fn test_try_create_output_ordering() { @@ -1710,12 +1706,6 @@ mod tests { Ok(()) } - // The following tests have been replaced by test_insert_into_parameterized: - // - test_insert_into_append_new_json_files - // - test_insert_into_append_new_csv_files - // - test_insert_into_append_2_new_parquet_files_defaults - // - test_insert_into_append_1_new_parquet_files_defaults - #[tokio::test] async fn test_insert_into_sql_csv_defaults() -> Result<()> { helper_test_insert_into_sql("csv", FileCompressionType::UNCOMPRESSED, "", None) @@ -2212,8 +2202,6 @@ mod tests { Ok(()) } - // test_schema_source_tracking was replaced by test_schema_source_tracking_comprehensive - #[tokio::test] async fn test_listing_table_config_with_multiple_files_comprehensive() -> Result<()> { let ctx = SessionContext::new(); From 517818baa6d1fa647ba5e3543bc712cb5920e9e4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 13:22:10 +0800 Subject: [PATCH 12/22] fix: add PartialEq derivation to SchemaSource enum for comparison support --- .../core/src/datasource/listing/table.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 138a7c57531ad..dc3209aacc152 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -59,7 +59,7 @@ use itertools::Itertools; use object_store::ObjectStore; /// Indicates the source of the schema for a [`ListingTable`] -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum SchemaSource { /// Schema is not yet set (initial state) None, @@ -1394,8 
+1394,10 @@ mod tests {
         );
 
         // Make sure inferred schema doesn't override specified schema
-        let config_with_schema_and_infer =
-            config_with_schema_and_options.infer(&ctx.state()).await?;
+        let config_with_schema_and_infer = config_with_schema_and_options
+            .clone()
+            .infer(&ctx.state())
+            .await?;
         assert_eq!(
             *config_with_schema_and_infer.schema_source(),
             SchemaSource::Specified
@@ -2418,7 +2420,7 @@ mod tests {
             // Test empty files case
             assert_list_files_for_multi_paths(
                 &[],
-                &paths.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                &paths,
                 target_partitions,
                 expected_partitions,
                 file_ext,
@@ -2427,7 +2429,7 @@ mod tests {
         } else if paths.len() == 1 {
             // Test using single path API
             assert_list_files_for_scan_grouping(
-                &files.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                &files,
                 paths[0],
                 target_partitions,
                 expected_partitions,
@@ -2437,8 +2439,8 @@ mod tests {
         } else if paths[0].contains("test:///bucket/key") {
             // Test using multi path API
             assert_list_files_for_multi_paths(
-                &files.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
-                &paths.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
+                &files,
+                &paths,
                 target_partitions,
                 expected_partitions,
                 file_ext,
@@ -2446,9 +2448,8 @@ mod tests {
             .await?;
         } else {
             // Test using exact path API for specific cases
-            let file_strs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
             assert_list_files_for_exact_paths(
-                &file_strs,
+                &files,
                 target_partitions,
                 expected_partitions,
                 file_ext,

From feb2f093f875f3099badd31958d677e92e606f10 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn 
Date: Sat, 7 Jun 2025 14:22:46 +0800
Subject: [PATCH 13/22] fix: update SQL query in tests to reference correct table name t

---
 datafusion/core/src/datasource/listing/table.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index dc3209aacc152..8ac133eef8891 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -2047,7 +2047,7 @@ mod tests {
 
         // Read the records in the table
         let batches = session_ctx
-            .sql("select count(*) as count from foo")
+            .sql("select count(*) as count from t")
            .await?
            .collect()
            .await?;

From 46c26464f5863f35243eb746a36afa0938936e92 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn 
Date: Sat, 7 Jun 2025 14:24:04 +0800
Subject: [PATCH 14/22] refactor: reorganize imports for clarity and consistency in table.rs

---
 .../core/src/datasource/listing/table.rs      | 56 +++++++++----------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 8ac133eef8891..db5cae233fec2 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -17,46 +17,42 @@
 //! The table implementation.
-use super::helpers::{expr_applicable_for_cols, pruned_partition_list}; -use super::{ListingTableUrl, PartitionedFile}; -use std::collections::HashMap; -use std::{any::Any, str::FromStr, sync::Arc}; - -use crate::datasource::{ - create_ordering, - file_format::{file_compression_type::FileCompressionType, FileFormat}, - physical_plan::FileSinkConfig, +use super::{ + helpers::{expr_applicable_for_cols, pruned_partition_list}, + ListingTableUrl, PartitionedFile, +}; +use crate::{ + datasource::file_format::{file_compression_type::FileCompressionType, FileFormat}, + datasource::{create_ordering, physical_plan::FileSinkConfig}, + execution::context::SessionState, }; -use crate::execution::context::SessionState; -use datafusion_catalog::TableProvider; -use datafusion_common::{config_err, DataFusionError, Result}; -use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; -use datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory; -use datafusion_execution::config::SessionConfig; -use datafusion_expr::dml::InsertOp; -use datafusion_expr::{Expr, TableProviderFilterPushDown}; -use datafusion_expr::{SortExpr, TableType}; -use datafusion_physical_plan::empty::EmptyExec; -use datafusion_physical_plan::{ExecutionPlan, Statistics}; - use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; +use async_trait::async_trait; +use datafusion_catalog::{Session, TableProvider}; use datafusion_common::{ - config_datafusion_err, internal_err, plan_err, project_schema, Constraints, SchemaExt, + config_datafusion_err, config_err, internal_err, plan_err, project_schema, + stats::Precision, Constraints, DataFusionError, Result, SchemaExt, +}; +use datafusion_datasource::{ + compute_all_files_statistics, + file_groups::FileGroup, + file_scan_config::{FileScanConfig, FileScanConfigBuilder}, + schema_adapter::DefaultSchemaAdapterFactory, }; -use datafusion_execution::cache::{ - cache_manager::FileStatisticsCache, cache_unit::DefaultFileStatisticsCache, +use datafusion_execution::{ + cache::{cache_manager::FileStatisticsCache, cache_unit::DefaultFileStatisticsCache}, + config::SessionConfig, +}; +use datafusion_expr::{ + dml::InsertOp, Expr, SortExpr, TableProviderFilterPushDown, TableType, }; use datafusion_physical_expr::{LexOrdering, PhysicalSortRequirement}; - -use async_trait::async_trait; -use datafusion_catalog::Session; -use datafusion_common::stats::Precision; -use datafusion_datasource::compute_all_files_statistics; -use datafusion_datasource::file_groups::FileGroup; use datafusion_physical_expr_common::sort_expr::LexRequirement; +use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; use futures::{future, stream, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::ObjectStore; +use std::{any::Any, collections::HashMap, str::FromStr, sync::Arc}; /// Indicates the source of the schema for a [`ListingTable`] #[derive(Debug, Clone, PartialEq, Eq)] From a69642a6ed553c9d1123afe8f6154061b7b2b3ff Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 14:53:56 +0800 Subject: [PATCH 15/22] test: add helper functions to generate test file paths for improved flexibility --- .../core/src/datasource/listing/table.rs | 77 +++++++++---------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index db5cae233fec2..6dd38974d88d8 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ 
b/datafusion/core/src/datasource/listing/table.rs
@@ -55,7 +55,8 @@ use object_store::ObjectStore;
 use std::{any::Any, collections::HashMap, str::FromStr, sync::Arc};
 
 /// Indicates the source of the schema for a [`ListingTable`]
-#[derive(Debug, Clone, PartialEq, Eq)]
+/// PartialEq required for assert_eq! in tests
+#[derive(Debug, Clone, PartialEq)]
 pub enum SchemaSource {
     /// Schema is not yet set (initial state)
     None,
@@ -1352,6 +1353,22 @@ mod tests {
         ]))
     }
 
+    /// Helper function to generate test file paths with given prefix and count, starting at index 0
+    fn generate_test_files(prefix: &str, count: usize) -> Vec<String> {
+        generate_test_files_with_start(prefix, count, 0)
+    }
+
+    /// Helper function to generate test file paths with given prefix, count, and start index
+    fn generate_test_files_with_start(
+        prefix: &str,
+        count: usize,
+        start_index: usize,
+    ) -> Vec<String> {
+        (start_index..start_index + count)
+            .map(|i| format!("{prefix}/file{i}"))
+            .collect()
+    }
+
     #[tokio::test]
     async fn test_schema_source_tracking_comprehensive() -> Result<()> {
         let ctx = SessionContext::new();
@@ -2319,13 +2336,7 @@ mod tests {
             // Single path cases
             (
                 "Single path, more partitions than files",
-                vec![
-                    "bucket/key-prefix/file0",
-                    "bucket/key-prefix/file1",
-                    "bucket/key-prefix/file2",
-                    "bucket/key-prefix/file3",
-                    "bucket/key-prefix/file4",
-                ],
+                generate_test_files("bucket/key-prefix", 5),
                 vec!["test:///bucket/key-prefix/"],
                 12,
                 5,
@@ -2333,12 +2344,7 @@ mod tests {
             ),
             (
                 "Single path, equal partitions and files",
-                vec![
-                    "bucket/key-prefix/file0",
-                    "bucket/key-prefix/file1",
-                    "bucket/key-prefix/file2",
-                    "bucket/key-prefix/file3",
-                ],
+                generate_test_files("bucket/key-prefix", 4),
                 vec!["test:///bucket/key-prefix/"],
                 4,
                 4,
@@ -2346,13 +2352,7 @@ mod tests {
             ),
             (
                 "Single path, more files than partitions",
-                vec![
-                    "bucket/key-prefix/file0",
-                    "bucket/key-prefix/file1",
-                    "bucket/key-prefix/file2",
-                    "bucket/key-prefix/file3",
-                    "bucket/key-prefix/file4",
-                ],
+                generate_test_files("bucket/key-prefix", 5),
                 vec!["test:///bucket/key-prefix/"],
                 2,
                 2,
@@ -2361,14 +2361,12 @@ mod tests {
             // Multi path cases
             (
                 "Multi path, more partitions than files",
-                vec![
-                    "bucket/key1/file0",
-                    "bucket/key1/file1",
-                    "bucket/key1/file2",
-                    "bucket/key2/file3",
-                    "bucket/key2/file4",
-                    "bucket/key3/file5",
-                ],
+                {
+                    let mut files = generate_test_files("bucket/key1", 3);
+                    files.extend(generate_test_files_with_start("bucket/key2", 2, 3));
+                    files.extend(generate_test_files_with_start("bucket/key3", 1, 5));
+                    files
+                },
                 vec!["test:///bucket/key1/", "test:///bucket/key2/"],
                 12,
                 5,
@@ -2386,13 +2384,11 @@ mod tests {
             // Exact path cases
             (
                 "Exact paths test",
-                vec![
-                    "bucket/key1/file0",
-                    "bucket/key1/file1",
-                    "bucket/key1/file2",
-                    "bucket/key2/file3",
-                    "bucket/key2/file4",
-                ],
+                {
+                    let mut files = generate_test_files("bucket/key1", 3);
+                    files.extend(generate_test_files_with_start("bucket/key2", 2, 3));
+                    files
+                },
                 vec![
                     "test:///bucket/key1/file0",
                     "test:///bucket/key1/file1",
                     "test:///bucket/key1/file2",
                     "test:///bucket/key2/file3",
                     "test:///bucket/key2/file4",
                 ],
@@ -2424,8 +2420,9 @@ mod tests {
                 .await?;
             } else if paths.len() == 1 {
                 // Test using single path API
+                let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
                 assert_list_files_for_scan_grouping(
-                    &files,
+                    &file_refs,
                     paths[0],
                     target_partitions,
                     expected_partitions,
@@ -2434,8 +2431,9 @@ mod tests {
                 .await?;
             } else if paths[0].contains("test:///bucket/key") {
                 // Test using multi path API
+                let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect();
                 assert_list_files_for_multi_paths(
-                    &files,
+                    &file_refs,
&paths, target_partitions, expected_partitions, @@ -2444,8 +2442,9 @@ mod tests { .await?; } else { // Test using exact path API for specific cases + let file_refs: Vec<&str> = files.iter().map(|s| s.as_str()).collect(); assert_list_files_for_exact_paths( - &files, + &file_refs, target_partitions, expected_partitions, file_ext, From 890b5e2f2d3368b465e92c9c57c527d1f92cfd9c Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 15:44:46 +0800 Subject: [PATCH 16/22] refactor: reorganize test imports for improved clarity and consistency --- .../core/src/datasource/listing/table.rs | 39 +++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index ae6cb1cb0d182..98c33459af7c3 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -26,7 +26,7 @@ use crate::{ datasource::{create_ordering, physical_plan::FileSinkConfig}, execution::context::SessionState, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaBuilder, SchemaRef}; +use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; use arrow_schema::Schema; use async_trait::async_trait; use datafusion_catalog::{Session, TableProvider}; @@ -47,14 +47,12 @@ use datafusion_execution::{ use datafusion_expr::{ dml::InsertOp, Expr, SortExpr, TableProviderFilterPushDown, TableType, }; -use datafusion_physical_expr::{LexOrdering, PhysicalSortRequirement}; -use datafusion_physical_expr_common::sort_expr::{LexOrdering, LexRequirement}; +use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; use futures::{future, stream, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::ObjectStore; use std::{any::Any, collections::HashMap, str::FromStr, sync::Arc}; - /// Indicates the source of the schema for a [`ListingTable`] /// PartialEq required for assert_eq! 
in tests #[derive(Debug, Clone, PartialEq)] @@ -1303,29 +1301,30 @@ async fn get_files_with_limit( #[cfg(test)] mod tests { use super::*; - use crate::datasource::file_format::csv::CsvFormat; - use crate::datasource::file_format::json::JsonFormat; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; - use crate::datasource::{provider_as_source, DefaultTableSource, MemTable}; - use crate::execution::options::ArrowReadOptions; use crate::prelude::*; - use crate::test::columns; - use crate::test::object_store::{ - ensure_head_concurrency, make_test_store_and_state, register_test_store, + use crate::{ + datasource::{ + file_format::csv::CsvFormat, file_format::json::JsonFormat, + provider_as_source, DefaultTableSource, MemTable, + }, + execution::options::ArrowReadOptions, + test::{ + columns, object_store::ensure_head_concurrency, + object_store::make_test_store_and_state, object_store::register_test_store, + }, + }; + use arrow::{compute::SortOptions, record_batch::RecordBatch}; + use datafusion_common::{ + assert_contains, + stats::Precision, + test_util::{batches_to_string, datafusion_test_data}, + ScalarValue, }; - - use arrow::compute::SortOptions; - use arrow::record_batch::RecordBatch; - use datafusion_common::stats::Precision; - use datafusion_common::test_util::batches_to_string; - use datafusion_common::test_util::datafusion_test_data; - use datafusion_common::{assert_contains, ScalarValue}; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_plan::{collect, ExecutionPlanProperties}; - - use crate::test::object_store::{ensure_head_concurrency, make_test_store_and_state}; use std::io::Write; use tempfile::TempDir; use url::Url; From 712c5723c2d32b30483778140b6d4c38ff624ece Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 17:07:33 +0800 Subject: [PATCH 17/22] fix clippy errors --- datafusion/core/src/datasource/listing/table.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 98c33459af7c3..3e79fdcb12b13 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2388,7 +2388,7 @@ mod tests { for (test_name, files, paths, target_partitions, expected_partitions, file_ext) in test_cases { - println!("Running test: {}", test_name); + println!("Running test: {test_name}"); if files.is_empty() { // Test empty files case From 22d38c0a19c40a61f559dff312bbcc0a17a08ac1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 7 Jun 2025 23:48:52 +0800 Subject: [PATCH 18/22] test: add TODO comment to correct byte size in partition statistics assertion --- datafusion/core/src/datasource/listing/table.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3e79fdcb12b13..cd6ab13cb2f6f 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2463,6 +2463,7 @@ mod tests { exec_default.partition_statistics(None)?.num_rows, Precision::Absent ); + // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_default.partition_statistics(None)?.total_byte_size, Precision::Absent @@ -2482,6 +2483,7 @@ mod tests { exec_disabled.partition_statistics(None)?.num_rows, Precision::Absent ); + // TODO 
correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_disabled.partition_statistics(None)?.total_byte_size, Precision::Absent @@ -2501,6 +2503,7 @@ mod tests { exec_enabled.partition_statistics(None)?.num_rows, Precision::Exact(8) ); + // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_enabled.partition_statistics(None)?.total_byte_size, Precision::Exact(671) From a40a448305b501d1c67b63668d694f31c31d5ecf Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 8 Jun 2025 00:02:07 +0800 Subject: [PATCH 19/22] test: remove TODO comment for byte size correction in partition statistics assertion from Test 1 --- datafusion/core/src/datasource/listing/table.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index cd6ab13cb2f6f..3e2dc92f574fc 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2463,7 +2463,6 @@ mod tests { exec_default.partition_statistics(None)?.num_rows, Precision::Absent ); - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_default.partition_statistics(None)?.total_byte_size, Precision::Absent From 89923f02023ab4d58ac5b74fec5d1006758921c0 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 8 Jun 2025 11:06:12 +0800 Subject: [PATCH 20/22] Move todo comment to case 1 --- datafusion/core/src/datasource/listing/table.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3e2dc92f574fc..90ee5df4f8c94 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -2463,6 +2463,8 @@ mod tests { exec_default.partition_statistics(None)?.num_rows, Precision::Absent ); + + // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_default.partition_statistics(None)?.total_byte_size, Precision::Absent @@ -2482,7 +2484,6 @@ mod tests { exec_disabled.partition_statistics(None)?.num_rows, Precision::Absent ); - // TODO correct byte size: https://github.com/apache/datafusion/issues/14936 assert_eq!( exec_disabled.partition_statistics(None)?.total_byte_size, Precision::Absent From 21ad8fbc3a7728fa04c943c9bedd01acd9766b01 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Mon, 9 Jun 2025 21:22:50 +0800 Subject: [PATCH 21/22] fix: simplify schema source access in ListingTable and ListingTableConfig --- .../core/src/datasource/listing/table.rs | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 90ee5df4f8c94..07ee1d31cf3b6 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -54,8 +54,8 @@ use itertools::Itertools; use object_store::ObjectStore; use std::{any::Any, collections::HashMap, str::FromStr, sync::Arc}; /// Indicates the source of the schema for a [`ListingTable`] -/// PartialEq required for assert_eq! in tests -#[derive(Debug, Clone, PartialEq)] +// PartialEq required for assert_eq! 
in tests +#[derive(Debug, Clone, Copy, PartialEq)] pub enum SchemaSource { /// Schema is not yet set (initial state) None, @@ -110,8 +110,8 @@ impl ListingTableConfig { } /// Returns the source of the schema for this configuration - pub fn schema_source(&self) -> &SchemaSource { - &self.schema_source + pub fn schema_source(&self) -> SchemaSource { + self.schema_source } /// Set the `schema` for the overall [`ListingTable`] /// @@ -890,8 +890,8 @@ impl ListingTable { } /// Get the schema source - pub fn schema_source(&self) -> &SchemaSource { - &self.schema_source + pub fn schema_source(&self) -> SchemaSource { + self.schema_source } /// If file_sort_order is specified, creates the appropriate physical expressions @@ -1364,31 +1364,28 @@ mod tests { // Test default schema source let config = ListingTableConfig::new(table_path.clone()); - assert_eq!(*config.schema_source(), SchemaSource::None); + assert_eq!(config.schema_source(), SchemaSource::None); // Test schema source after setting a schema explicitly let provided_schema = create_test_schema(); let config_with_schema = config.clone().with_schema(provided_schema.clone()); - assert_eq!(*config_with_schema.schema_source(), SchemaSource::Specified); + assert_eq!(config_with_schema.schema_source(), SchemaSource::Specified); // Test schema source after inferring schema let format = CsvFormat::default(); let options = ListingOptions::new(Arc::new(format)); let config_with_options = config.with_listing_options(options.clone()); - assert_eq!(*config_with_options.schema_source(), SchemaSource::None); + assert_eq!(config_with_options.schema_source(), SchemaSource::None); let config_with_inferred = config_with_options.infer_schema(&ctx.state()).await?; - assert_eq!( - *config_with_inferred.schema_source(), - SchemaSource::Inferred - ); + assert_eq!(config_with_inferred.schema_source(), SchemaSource::Inferred); // Test schema preservation through operations let config_with_schema_and_options = config_with_schema .clone() .with_listing_options(options.clone()); assert_eq!( - *config_with_schema_and_options.schema_source(), + config_with_schema_and_options.schema_source(), SchemaSource::Specified ); @@ -1398,16 +1395,16 @@ mod tests { .infer(&ctx.state()) .await?; assert_eq!( - *config_with_schema_and_infer.schema_source(), + config_with_schema_and_infer.schema_source(), SchemaSource::Specified ); // Verify sources in actual ListingTable objects let table_specified = ListingTable::try_new(config_with_schema_and_options)?; - assert_eq!(*table_specified.schema_source(), SchemaSource::Specified); + assert_eq!(table_specified.schema_source(), SchemaSource::Specified); let table_inferred = ListingTable::try_new(config_with_inferred)?; - assert_eq!(*table_inferred.schema_source(), SchemaSource::Inferred); + assert_eq!(table_inferred.schema_source(), SchemaSource::Inferred); Ok(()) } @@ -2235,7 +2232,7 @@ mod tests { ]) .with_listing_options(options.clone()); let config1 = config1.infer_schema(&ctx.state()).await?; - assert_eq!(*config1.schema_source(), SchemaSource::Inferred); + assert_eq!(config1.schema_source(), SchemaSource::Inferred); // Verify schema matches first file let schema1 = config1.file_schema.as_ref().unwrap().clone(); @@ -2258,7 +2255,7 @@ mod tests { .with_schema(schema_3cols) .with_listing_options(options.clone()); let config2 = config2.infer_schema(&ctx.state()).await?; - assert_eq!(*config2.schema_source(), SchemaSource::Specified); + assert_eq!(config2.schema_source(), SchemaSource::Specified); // Verify that the schema is still the one 
we specified (3 columns)
         let schema2 = config2.file_schema.as_ref().unwrap().clone();
         assert_eq!(schema2.fields().len(), 3);
         assert_eq!(schema2.field(0).name(), "c1");
@@ -2282,7 +2279,7 @@ mod tests {
         .with_schema(schema_4cols)
         .with_listing_options(options.clone());
         let config3 = config3.infer_schema(&ctx.state()).await?;
-        assert_eq!(*config3.schema_source(), SchemaSource::Specified);
+        assert_eq!(config3.schema_source(), SchemaSource::Specified);
 
         // Verify that the schema is still the one we specified (4 columns)
         let schema3 = config3.file_schema.as_ref().unwrap().clone();

From 5485976ac7f5dccb49959b9b3a6010951081b649 Mon Sep 17 00:00:00 2001
From: Siew Kam Onn 
Date: Mon, 9 Jun 2025 22:18:26 +0800
Subject: [PATCH 22/22] Fix clippy error

---
 datafusion/core/src/datasource/listing/table.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index 07ee1d31cf3b6..3ddf1c85e241b 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -809,7 +809,7 @@ impl ListingTable {
     /// See documentation and example on [`ListingTable`] and [`ListingTableConfig`]
     pub fn try_new(config: ListingTableConfig) -> Result<Self> {
         // Extract schema_source before moving other parts of the config
-        let schema_source = config.schema_source().clone();
+        let schema_source = config.schema_source();
         let file_schema = config
            .file_schema