From ec5498916bc09ad98194251455e277d7b60ff16e Mon Sep 17 00:00:00 2001 From: kysshsy Date: Sun, 28 Dec 2025 14:36:33 +0800 Subject: [PATCH 1/4] fix: enhance CTE resolution with identifier normalization --- datafusion/sql/src/resolve.rs | 88 ++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs index 148e886161fc..d2b84eece1bf 100644 --- a/datafusion/sql/src/resolve.rs +++ b/datafusion/sql/src/resolve.rs @@ -20,7 +20,7 @@ use std::collections::BTreeSet; use std::ops::ControlFlow; use crate::parser::{CopyToSource, CopyToStatement, Statement as DFStatement}; -use crate::planner::object_name_to_table_reference; +use crate::planner::{IdentNormalizer, object_name_to_table_reference}; use sqlparser::ast::*; // following constants are used in `resolve_table_references` @@ -49,13 +49,31 @@ struct RelationVisitor { relations: BTreeSet, all_ctes: BTreeSet, ctes_in_scope: Vec, + normalizer: IdentNormalizer, } impl RelationVisitor { + fn normalize_object_name(&self, name: &ObjectName) -> ObjectName { + let parts: Vec = name + .0 + .iter() + .cloned() + .map(|part| match part { + ObjectNamePart::Identifier(ident) => ObjectNamePart::Identifier( + Ident::new(self.normalizer.normalize(ident)), + ), + other => other, + }) + .collect(); + ObjectName(parts) + } + /// Record the reference to `relation`, if it's not a CTE reference. fn insert_relation(&mut self, relation: &ObjectName) { - if !self.relations.contains(relation) && !self.ctes_in_scope.contains(relation) { - self.relations.insert(relation.clone()); + let relation = self.normalize_object_name(relation); + if !self.relations.contains(&relation) && !self.ctes_in_scope.contains(&relation) + { + self.relations.insert(relation); } } } @@ -81,7 +99,9 @@ impl Visitor for RelationVisitor { let _ = cte.visit(self); } self.ctes_in_scope - .push(ObjectName::from(vec![cte.alias.name.clone()])); + .push(self.normalize_object_name(&ObjectName::from(vec![ + cte.alias.name.clone(), + ]))); } } ControlFlow::Continue(()) @@ -136,7 +156,7 @@ fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) { let _ = s.as_ref().visit(visitor); } DFStatement::CreateExternalTable(table) => { - visitor.relations.insert(table.name.clone()); + visitor.insert_relation(&table.name); } DFStatement::CopyTo(CopyToStatement { source, .. }) => match source { CopyToSource::Relation(table_name) => { @@ -193,6 +213,7 @@ pub fn resolve_table_references( relations: BTreeSet::new(), all_ctes: BTreeSet::new(), ctes_in_scope: vec![], + normalizer: IdentNormalizer::new(enable_ident_normalization), }; visit_statement(statement, &mut visitor); @@ -200,12 +221,12 @@ pub fn resolve_table_references( let table_refs = visitor .relations .into_iter() - .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) + .map(|x| object_name_to_table_reference(x, false)) .collect::>()?; let ctes = visitor .all_ctes .into_iter() - .map(|x| object_name_to_table_reference(x, enable_ident_normalization)) + .map(|x| object_name_to_table_reference(x, false)) .collect::>()?; Ok((table_refs, ctes)) } @@ -270,4 +291,57 @@ mod tests { assert_eq!(ctes.len(), 1); assert_eq!(ctes[0].to_string(), "nodes"); } + + #[test] + fn resolve_table_references_cte_with_quoted_reference() { + use crate::parser::DFParser; + + let query = r#"with barbaz as (select 1) select * from "barbaz""#; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "barbaz"); + // Quoted reference should still resolve to the CTE when normalization is on + assert_eq!(table_refs.len(), 0); + } + + #[test] + fn resolve_table_references_cte_with_quoted_reference_normalization_off() { + use crate::parser::DFParser; + + let query = r#"with barbaz as (select 1) select * from "barbaz""#; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap(); + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "barbaz"); + // Even with normalization off, quoted reference matches same-case CTE name + assert_eq!(table_refs.len(), 0); + } + + #[test] + fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_on() { + use crate::parser::DFParser; + + let query = r#"with FOObar as (select 1) select * from "FOObar""#; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap(); + // CTE name is normalized to lowercase, quoted reference preserves case, so they differ + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "foobar"); + assert_eq!(table_refs.len(), 1); + assert_eq!(table_refs[0].to_string(), "FOObar"); + } + + #[test] + fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_off() { + use crate::parser::DFParser; + + let query = r#"with FOObar as (select 1) select * from "FOObar""#; + let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap(); + let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap(); + // Without normalization, cases match exactly, so quoted reference resolves to the CTE + assert_eq!(ctes.len(), 1); + assert_eq!(ctes[0].to_string(), "FOObar"); + assert_eq!(table_refs.len(), 0); + } } From 86476df5d1968fc36cc31463cae784cd25c7f706 Mon Sep 17 00:00:00 2001 From: kysshsy Date: Sun, 11 Jan 2026 22:13:32 +0800 Subject: [PATCH 2/4] fix: improve table reference handling with enhanced normalization and error management --- datafusion/sql/src/resolve.rs | 136 +++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs index d2b84eece1bf..50fd0f051d76 100644 --- a/datafusion/sql/src/resolve.rs +++ b/datafusion/sql/src/resolve.rs @@ -15,12 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::TableReference; use std::collections::BTreeSet; use std::ops::ControlFlow; +use datafusion_common::{DataFusionError, Result}; + +use crate::TableReference; use crate::parser::{CopyToSource, CopyToStatement, Statement as DFStatement}; -use crate::planner::{IdentNormalizer, object_name_to_table_reference}; +use crate::planner::object_name_to_table_reference; use sqlparser::ast::*; // following constants are used in `resolve_table_references` @@ -45,45 +47,40 @@ const INFORMATION_SCHEMA_TABLES: &[&str] = &[ PARAMETERS, ]; +// Collect table/CTE references as `TableReference`s and normalize them during traversal. +// This avoids a second normalization/conversion pass after visiting the AST. struct RelationVisitor { - relations: BTreeSet, - all_ctes: BTreeSet, - ctes_in_scope: Vec, - normalizer: IdentNormalizer, + relations: BTreeSet, + all_ctes: BTreeSet, + ctes_in_scope: Vec, + enable_ident_normalization: bool, } impl RelationVisitor { - fn normalize_object_name(&self, name: &ObjectName) -> ObjectName { - let parts: Vec = name - .0 - .iter() - .cloned() - .map(|part| match part { - ObjectNamePart::Identifier(ident) => ObjectNamePart::Identifier( - Ident::new(self.normalizer.normalize(ident)), - ), - other => other, - }) - .collect(); - ObjectName(parts) - } - /// Record the reference to `relation`, if it's not a CTE reference. - fn insert_relation(&mut self, relation: &ObjectName) { - let relation = self.normalize_object_name(relation); - if !self.relations.contains(&relation) && !self.ctes_in_scope.contains(&relation) - { - self.relations.insert(relation); + fn insert_relation(&mut self, relation: &ObjectName) -> ControlFlow { + match object_name_to_table_reference( + relation.clone(), + self.enable_ident_normalization, + ) { + Ok(relation) => { + if !self.relations.contains(&relation) + && !self.ctes_in_scope.contains(&relation) + { + self.relations.insert(relation); + } + ControlFlow::Continue(()) + } + Err(e) => ControlFlow::Break(e), } } } impl Visitor for RelationVisitor { - type Break = (); + type Break = DataFusionError; - fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<()> { - self.insert_relation(relation); - ControlFlow::Continue(()) + fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow { + self.insert_relation(relation) } fn pre_visit_query(&mut self, q: &Query) -> ControlFlow { @@ -96,12 +93,18 @@ impl Visitor for RelationVisitor { if !with.recursive { // This is a bit hackish as the CTE will be visited again as part of visiting `q`, // but thankfully `insert_relation` is idempotent. - let _ = cte.visit(self); + if let Some(err) = cte.visit(self).break_value() { + return ControlFlow::Break(err); + } + } + let cte_name = ObjectName::from(vec![cte.alias.name.clone()]); + match object_name_to_table_reference( + cte_name, + self.enable_ident_normalization, + ) { + Ok(cte_ref) => self.ctes_in_scope.push(cte_ref), + Err(e) => return ControlFlow::Break(e), } - self.ctes_in_scope - .push(self.normalize_object_name(&ObjectName::from(vec![ - cte.alias.name.clone(), - ]))); } } ControlFlow::Continue(()) @@ -117,13 +120,15 @@ impl Visitor for RelationVisitor { ControlFlow::Continue(()) } - fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<()> { + fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow { if let Statement::ShowCreate { obj_type: ShowCreateObject::Table | ShowCreateObject::View, obj_name, } = statement { - self.insert_relation(obj_name) + if let Some(err) = self.insert_relation(obj_name).break_value() { + return ControlFlow::Break(err); + } } // SHOW statements will later be rewritten into a SELECT from the information_schema @@ -140,35 +145,53 @@ impl Visitor for RelationVisitor { ); if requires_information_schema { for s in INFORMATION_SCHEMA_TABLES { - self.relations.insert(ObjectName::from(vec![ + // Information schema references are synthesized here, so convert directly. + let obj = ObjectName::from(vec![ Ident::new(INFORMATION_SCHEMA), Ident::new(*s), - ])); + ]); + match object_name_to_table_reference(obj, self.enable_ident_normalization) + { + Ok(tbl_ref) => { + self.relations.insert(tbl_ref); + } + Err(e) => return ControlFlow::Break(e), + } } } ControlFlow::Continue(()) } } -fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) { +fn control_flow_to_result(flow: ControlFlow) -> Result<()> { + match flow { + ControlFlow::Continue(()) => Ok(()), + ControlFlow::Break(err) => Err(err), + } +} + +fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) -> Result<()> { match statement { DFStatement::Statement(s) => { - let _ = s.as_ref().visit(visitor); + control_flow_to_result(s.as_ref().visit(visitor))?; } DFStatement::CreateExternalTable(table) => { - visitor.insert_relation(&table.name); + control_flow_to_result(visitor.insert_relation(&table.name))?; } DFStatement::CopyTo(CopyToStatement { source, .. }) => match source { CopyToSource::Relation(table_name) => { - visitor.insert_relation(table_name); + control_flow_to_result(visitor.insert_relation(table_name))?; } CopyToSource::Query(query) => { - let _ = query.visit(visitor); + control_flow_to_result(query.visit(visitor))?; } }, - DFStatement::Explain(explain) => visit_statement(&explain.statement, visitor), + DFStatement::Explain(explain) => { + visit_statement(&explain.statement, visitor)?; + } DFStatement::Reset(_) => {} } + Ok(()) } /// Collects all tables and views referenced in the SQL statement. CTEs are collected separately. @@ -208,27 +231,20 @@ fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) { pub fn resolve_table_references( statement: &crate::parser::Statement, enable_ident_normalization: bool, -) -> datafusion_common::Result<(Vec, Vec)> { +) -> Result<(Vec, Vec)> { let mut visitor = RelationVisitor { relations: BTreeSet::new(), all_ctes: BTreeSet::new(), ctes_in_scope: vec![], - normalizer: IdentNormalizer::new(enable_ident_normalization), + enable_ident_normalization, }; - visit_statement(statement, &mut visitor); - - let table_refs = visitor - .relations - .into_iter() - .map(|x| object_name_to_table_reference(x, false)) - .collect::>()?; - let ctes = visitor - .all_ctes - .into_iter() - .map(|x| object_name_to_table_reference(x, false)) - .collect::>()?; - Ok((table_refs, ctes)) + visit_statement(statement, &mut visitor)?; + + Ok(( + visitor.relations.into_iter().collect(), + visitor.all_ctes.into_iter().collect(), + )) } #[cfg(test)] From 84e11d7ad936ff4f8e9bcbfd82a76781e14add4b Mon Sep 17 00:00:00 2001 From: kysshsy Date: Sun, 11 Jan 2026 23:46:23 +0800 Subject: [PATCH 3/4] test(sqllogictest): add strict provider regression for #18932 Add `cte_quoted_reference.slt` and a strict `CatalogProvider`/`SchemaProvider` used only for that file. The provider panics on unexpected table lookups so we can observe when a CTE reference is maybe incorrectly treated as a catalog lookup. Refs: https://github.com/apache/datafusion/issues/18932 --- datafusion/sqllogictest/src/test_context.rs | 104 +++++++++++++++++- .../test_files/cte_quoted_reference.slt | 70 ++++++++++++ 2 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 datafusion/sqllogictest/test_files/cte_quoted_reference.slt diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 9ec085b41eec..a9aa3baa2463 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -30,7 +30,7 @@ use arrow::buffer::ScalarBuffer; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit, UnionFields}; use arrow::record_batch::RecordBatch; use datafusion::catalog::{ - CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session, + CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, Session, }; use datafusion::common::{DataFusionError, Result, not_impl_err}; use datafusion::functions::math::abs; @@ -96,6 +96,10 @@ impl TestContext { let file_name = relative_path.file_name().unwrap().to_str().unwrap(); match file_name { + "cte_quoted_reference.slt" => { + info!("Registering strict catalog provider for CTE tests"); + register_strict_orders_catalog(test_ctx.session_ctx()); + } "information_schema_table_types.slt" => { info!("Registering local temporary table"); register_temp_table(test_ctx.session_ctx()).await; @@ -171,6 +175,104 @@ impl TestContext { } } +// ============================================================================== +// Strict Catalog / Schema Provider (sqllogictest-only) +// ============================================================================== +// +// The goal of `cte_quoted_reference.slt` is to exercise end-to-end query planning +// while detecting *unexpected* catalog lookups. +// +// Specifically, if DataFusion incorrectly treats a CTE reference (e.g. `"barbaz"`) +// as a real table reference, the planner will attempt to resolve it through the +// schema provider. The types below deliberately `panic!` on any lookup other than +// the one table we expect (`orders`). +// +// This makes the "extra provider lookup" bug observable in an end-to-end test, +// rather than being silently ignored by default providers that return `Ok(None)` +// for unknown tables. + +#[derive(Debug)] +struct StrictOrdersCatalog { + schema: Arc, +} + +impl CatalogProvider for StrictOrdersCatalog { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec!["public".to_string()] + } + + fn schema(&self, name: &str) -> Option> { + (name == "public").then(|| Arc::clone(&self.schema)) + } +} + +#[derive(Debug)] +struct StrictOrdersSchema { + orders: Arc, +} + +#[async_trait] +impl SchemaProvider for StrictOrdersSchema { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + vec!["orders".to_string()] + } + + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + match name { + "orders" => Ok(Some(Arc::clone(&self.orders))), + other => panic!( + "unexpected table lookup: {other}. This maybe indicates a CTE reference was \ + incorrectly treated as a catalog table reference." + ), + } + } + + fn table_exist(&self, name: &str) -> bool { + name == "orders" + } +} + +fn register_strict_orders_catalog(ctx: &SessionContext) { + let schema = Arc::new(Schema::new(vec![Field::new( + "order_id", + DataType::Int32, + false, + )])); + + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from(vec![1, 2]))], + ) + .expect("record batch should be valid"); + + let orders = + MemTable::try_new(schema, vec![vec![batch]]).expect("memtable should be valid"); + + let schema_provider: Arc = Arc::new(StrictOrdersSchema { + orders: Arc::new(orders), + }); + + // Override the default "datafusion" catalog for this test file so that any + // unexpected lookup is caught immediately. + ctx.register_catalog( + "datafusion", + Arc::new(StrictOrdersCatalog { + schema: schema_provider, + }), + ); +} + #[cfg(feature = "avro")] pub async fn register_avro_tables(ctx: &mut TestContext) { use datafusion::prelude::AvroReadOptions; diff --git a/datafusion/sqllogictest/test_files/cte_quoted_reference.slt b/datafusion/sqllogictest/test_files/cte_quoted_reference.slt new file mode 100644 index 000000000000..6142157e5ec8 --- /dev/null +++ b/datafusion/sqllogictest/test_files/cte_quoted_reference.slt @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## CTE Reference Resolution +########## + +# These tests exercise CTE reference resolution with and without identifier +# normalization. The session is configured with a strict catalog/schema provider +# (see `datafusion/sqllogictest/src/test_context.rs`) that only provides the +# `orders` table and panics on any unexpected table lookup. +# +# This makes it observable if DataFusion incorrectly treats a CTE reference as a +# catalog lookup. +# +# Refs: https://github.com/apache/datafusion/issues/18932 +# +# NOTE: This test relies on a strict catalog/schema provider registered in +# `datafusion/sqllogictest/src/test_context.rs` that provides only the `orders` +# table and panics on unexpected lookups. + +statement ok +set datafusion.sql_parser.enable_ident_normalization = true; + +query I +with barbaz as (select * from orders) select * from "barbaz"; +---- +1 +2 + +query I +with BarBaz as (select * from orders) select * from "barbaz"; +---- +1 +2 + +query I +with barbaz as (select * from orders) select * from barbaz; +---- +1 +2 + +statement ok +set datafusion.sql_parser.enable_ident_normalization = false; + +query I +with barbaz as (select * from orders) select * from "barbaz"; +---- +1 +2 + +query I +with barbaz as (select * from orders) select * from barbaz; +---- +1 +2 From 168baba43cab261d036da8509b29ceb92bb756fa Mon Sep 17 00:00:00 2001 From: jonahgao Date: Mon, 12 Jan 2026 16:05:03 +0800 Subject: [PATCH 4/4] Fix ci failures --- datafusion/sql/src/resolve.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/sql/src/resolve.rs b/datafusion/sql/src/resolve.rs index 50fd0f051d76..955dbb86602a 100644 --- a/datafusion/sql/src/resolve.rs +++ b/datafusion/sql/src/resolve.rs @@ -93,9 +93,7 @@ impl Visitor for RelationVisitor { if !with.recursive { // This is a bit hackish as the CTE will be visited again as part of visiting `q`, // but thankfully `insert_relation` is idempotent. - if let Some(err) = cte.visit(self).break_value() { - return ControlFlow::Break(err); - } + cte.visit(self)?; } let cte_name = ObjectName::from(vec![cte.alias.name.clone()]); match object_name_to_table_reference( @@ -126,9 +124,7 @@ impl Visitor for RelationVisitor { obj_name, } = statement { - if let Some(err) = self.insert_relation(obj_name).break_value() { - return ControlFlow::Break(err); - } + self.insert_relation(obj_name)?; } // SHOW statements will later be rewritten into a SELECT from the information_schema