From b176c79c624b327fda5fd94e21ef05a744968852 Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Tue, 16 May 2023 15:04:47 +0700 Subject: [PATCH 1/5] Adding stack overflow test --- datafusion/sql/Cargo.toml | 1 + datafusion/sql/src/expr/mod.rs | 121 +++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index 7257da6e344f8..6b37b155b80c7 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -47,4 +47,5 @@ sqlparser = "0.33" [dev-dependencies] ctor = "0.2.0" env_logger = "0.10" +paste = "^1.0" rstest = "0.17" diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index a120b3400ad8f..dca102a18639d 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -566,3 +566,124 @@ fn plan_indexed(expr: Expr, mut keys: Vec) -> Result { plan_key(key)?, ))) } + +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::HashMap; + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field, Schema}; + use sqlparser::dialect::GenericDialect; + use sqlparser::parser::Parser; + + use datafusion_common::config::ConfigOptions; + use datafusion_expr::logical_plan::builder::LogicalTableSource; + use datafusion_expr::{AggregateUDF, ScalarUDF, TableSource}; + + use crate::TableReference; + + struct TestSchemaProvider { + options: ConfigOptions, + tables: HashMap>, + } + + impl TestSchemaProvider { + pub fn new() -> Self { + let mut tables = HashMap::new(); + tables.insert( + "table1".to_string(), + create_table_source(vec![Field::new( + "column1".to_string(), + DataType::Utf8, + false, + )]), + ); + + Self { + options: Default::default(), + tables, + } + } + } + + impl ContextProvider for TestSchemaProvider { + fn get_table_provider( + &self, + name: TableReference, + ) -> Result> { + match self.tables.get(name.table()) { + Some(table) => Ok(table.clone()), + _ => Err(DataFusionError::Plan(format!( + "Table not found: {}", + name.table() + ))), + } + } + + fn get_function_meta(&self, _name: &str) -> Option> { + None + } + + fn get_aggregate_meta(&self, _name: &str) -> Option> { + None + } + + fn get_variable_type(&self, _variable_names: &[String]) -> Option { + None + } + + fn options(&self) -> &ConfigOptions { + &self.options + } + } + + fn create_table_source(fields: Vec) -> Arc { + Arc::new(LogicalTableSource::new(Arc::new( + Schema::new_with_metadata(fields, HashMap::new()), + ))) + } + + macro_rules! test_stack_overflow { + ($num_expr:expr) => { + paste::item! { + #[test] + fn []() { + let schema = DFSchema::empty(); + let mut planner_context = PlannerContext::default(); + + let expr_str = (0..$num_expr) + .map(|i| format!("column1 = 'value{:?}'", i)) + .collect::>() + .join(" OR "); + + let dialect = GenericDialect{}; + let mut parser = Parser::new(&dialect) + .try_with_sql(expr_str.as_str()) + .unwrap(); + let sql_expr = parser.parse_expr().unwrap(); + + let schema_provider = TestSchemaProvider::new(); + let sql_to_rel = SqlToRel::new(&schema_provider); + + // Should not stack overflow + sql_to_rel.sql_expr_to_logical_expr( + sql_expr, + &schema, + &mut planner_context, + ).unwrap(); + } + } + }; + } + + test_stack_overflow!(64); + test_stack_overflow!(128); + test_stack_overflow!(256); + test_stack_overflow!(512); + test_stack_overflow!(1024); + test_stack_overflow!(2048); + test_stack_overflow!(4096); + test_stack_overflow!(8192); +} From 59099ea81ad6c163400fe48be835ae9d9b005766 Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Tue, 16 May 2023 16:15:27 +0700 Subject: [PATCH 2/5] Implement heap based non-recursive visitor --- datafusion/sql/src/expr/binary_op.rs | 27 +++-------- datafusion/sql/src/expr/mod.rs | 67 ++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs index af545c7371d94..c5d2238ac0b77 100644 --- a/datafusion/sql/src/expr/binary_op.rs +++ b/datafusion/sql/src/expr/binary_op.rs @@ -15,21 +15,14 @@ // specific language governing permissions and limitations // under the License. -use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{DFSchema, DataFusionError, Result}; -use datafusion_expr::{BinaryExpr, Expr, Operator}; -use sqlparser::ast::{BinaryOperator, Expr as SQLExpr}; +use crate::planner::{ContextProvider, SqlToRel}; +use datafusion_common::{DataFusionError, Result}; +use datafusion_expr::Operator; +use sqlparser::ast::BinaryOperator; impl<'a, S: ContextProvider> SqlToRel<'a, S> { - pub(crate) fn parse_sql_binary_op( - &self, - left: SQLExpr, - op: BinaryOperator, - right: SQLExpr, - schema: &DFSchema, - planner_context: &mut PlannerContext, - ) -> Result { - let operator = match op { + pub(crate) fn parse_sql_binary_op(&self, op: BinaryOperator) -> Result { + match op { BinaryOperator::Gt => Ok(Operator::Gt), BinaryOperator::GtEq => Ok(Operator::GtEq), BinaryOperator::Lt => Ok(Operator::Lt), @@ -56,12 +49,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { _ => Err(DataFusionError::NotImplemented(format!( "Unsupported SQL binary operator {op:?}" ))), - }?; - - Ok(Expr::BinaryExpr(BinaryExpr::new( - Box::new(self.sql_expr_to_logical_expr(left, schema, planner_context)?), - operator, - Box::new(self.sql_expr_to_logical_expr(right, schema, planner_context)?), - ))) + } } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index dca102a18639d..615bd145ebc02 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -46,27 +46,56 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { - // Workaround for https://github.com/apache/arrow-datafusion/issues/4065 - // - // Minimize stack space required in debug builds to plan - // deeply nested binary operators by keeping the stack space - // needed for sql_expr_to_logical_expr minimal for BinaryOp - // - // The reason this reduces stack size in debug builds is - // explained in the "Technical Backstory" heading of - // https://github.com/apache/arrow-datafusion/pull/1047 - // - // A likely better way to support deeply nested expressions - // would be to avoid recursion all together and use an - // iterative algorithm. - match sql { - SQLExpr::BinaryOp { left, op, right } => { - self.parse_sql_binary_op(*left, op, *right, schema, planner_context) + enum StackEntry { + SQLExpr(Box), + Operator(Operator), + } + + // Virtual stack machine to convert SQLExpr to Expr + // This allows visiting the expr tree in a depth-first manner which + // produces expressions in postfix notations, i.e. `a + b` => `a b +`. + let mut stack: Box> = + Box::new(vec![StackEntry::SQLExpr(Box::new(sql))]); + let mut eval_stack = Box::new(vec![]); + + while let Some(entry) = stack.pop() { + match entry { + StackEntry::SQLExpr(sql_expr) => { + match *sql_expr { + SQLExpr::BinaryOp { left, op, right } => { + // Note the order that we push the entries to the stack + // is important. We want to visit the left node first. + let op = self.parse_sql_binary_op(op)?; + stack.push(StackEntry::Operator(op)); + stack.push(StackEntry::SQLExpr(right)); + stack.push(StackEntry::SQLExpr(left)); + } + _ => { + let expr = self.sql_expr_to_logical_expr_internal( + *sql_expr, + schema, + planner_context, + )?; + eval_stack.push(expr); + } + } + } + StackEntry::Operator(op) => { + let right = eval_stack.pop().unwrap(); + let left = eval_stack.pop().unwrap(); + let expr = Expr::BinaryExpr(BinaryExpr::new( + Box::new(left), + op, + Box::new(right), + )); + eval_stack.push(expr); + } } - // since this function requires more space per frame - // avoid calling it for binary ops - _ => self.sql_expr_to_logical_expr_internal(sql, schema, planner_context), } + + assert_eq!(1, eval_stack.len()); + let expr = eval_stack.pop().unwrap(); + Ok(expr) } /// Generate a relational expression from a SQL expression From c50b62be1d03e811cc75ba4421820aa849736ff2 Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Tue, 16 May 2023 17:06:47 +0700 Subject: [PATCH 3/5] Fix clippy --- datafusion/sql/src/expr/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 615bd145ebc02..fba6dda084aed 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -56,7 +56,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // produces expressions in postfix notations, i.e. `a + b` => `a b +`. let mut stack: Box> = Box::new(vec![StackEntry::SQLExpr(Box::new(sql))]); - let mut eval_stack = Box::new(vec![]); + let mut eval_stack = Box::>::default(); while let Some(entry) = stack.pop() { match entry { From e38636c45cf62c9b5dfaf18f081fec15f7e3f74c Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Thu, 18 May 2023 10:27:17 +0700 Subject: [PATCH 4/5] Update datafusion/sql/src/expr/mod.rs Co-authored-by: Andrew Lamb --- datafusion/sql/src/expr/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index fba6dda084aed..59c52d30fd569 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -54,6 +54,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Virtual stack machine to convert SQLExpr to Expr // This allows visiting the expr tree in a depth-first manner which // produces expressions in postfix notations, i.e. `a + b` => `a b +`. + // See https://github.com/apache/arrow-datafusion/issues/1444 let mut stack: Box> = Box::new(vec![StackEntry::SQLExpr(Box::new(sql))]); let mut eval_stack = Box::>::default(); From 6a93b69dc5f4ca42e3576e87b16e180b11320cb2 Mon Sep 17 00:00:00 2001 From: Armin Primadi Date: Thu, 18 May 2023 11:27:30 +0700 Subject: [PATCH 5/5] Update datafusion/sql/src/expr/mod.rs Co-authored-by: Andrew Lamb --- datafusion/sql/src/expr/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 59c52d30fd569..d3230c411900b 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -55,9 +55,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // This allows visiting the expr tree in a depth-first manner which // produces expressions in postfix notations, i.e. `a + b` => `a b +`. // See https://github.com/apache/arrow-datafusion/issues/1444 - let mut stack: Box> = - Box::new(vec![StackEntry::SQLExpr(Box::new(sql))]); - let mut eval_stack = Box::>::default(); + let mut stack = vec![StackEntry::SQLExpr(Box::new(sql))]; + let mut eval_stack = vec![]; while let Some(entry) = stack.pop() { match entry {