diff --git a/Cargo.toml b/Cargo.toml index 1cc7aa6eb3b88..7a349735b64cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ members = [ "datafusion/data-access", "datafusion/expr", "datafusion/jit", + "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/row", diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 79a48cf1593f5..a598b65d30554 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -63,6 +63,7 @@ datafusion-common = { path = "../common", version = "8.0.0", features = ["parque datafusion-data-access = { path = "../data-access", version = "8.0.0" } datafusion-expr = { path = "../expr", version = "8.0.0" } datafusion-jit = { path = "../jit", version = "8.0.0", optional = true } +datafusion-optimizer = { path = "../optimizer", version = "8.0.0" } datafusion-physical-expr = { path = "../physical-expr", version = "8.0.0" } datafusion-row = { path = "../row", version = "8.0.0" } datafusion-sql = { path = "../sql", version = "8.0.0" } diff --git a/datafusion/core/src/optimizer/mod.rs b/datafusion/core/src/optimizer/mod.rs index b274ab645f54c..cf6412db9ab2f 100644 --- a/datafusion/core/src/optimizer/mod.rs +++ b/datafusion/core/src/optimizer/mod.rs @@ -19,14 +19,10 @@ //! some simple rules to a logical plan, such as "Projection Push Down" and "Type Coercion". #![allow(clippy::module_inception)] -pub mod common_subexpr_eliminate; -pub mod eliminate_filter; -pub mod eliminate_limit; -pub mod filter_push_down; -pub mod limit_push_down; -pub mod optimizer; -pub mod projection_push_down; pub mod simplify_expressions; -pub mod single_distinct_to_groupby; -pub mod subquery_filter_to_join; -pub mod utils; + +pub use datafusion_optimizer::{ + common_subexpr_eliminate, eliminate_filter, eliminate_limit, filter_push_down, + limit_push_down, optimizer, projection_push_down, single_distinct_to_groupby, + subquery_filter_to_join, utils, +}; diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index dd00a50286e09..304c1b376f69c 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -24,7 +24,7 @@ use crate::error::Result; use crate::from_slice::FromSlice; use crate::logical_plan::LogicalPlan; use crate::physical_plan::file_format::{CsvExec, FileScanConfig}; -use crate::test_util::{aggr_test_schema, scan_empty}; +use crate::test_util::aggr_test_schema; use array::{Array, ArrayRef}; use arrow::array::{self, DecimalBuilder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; @@ -128,21 +128,6 @@ pub fn partitioned_csv_config( }) } -/// some tests share a common table with different names -pub fn test_table_scan_with_name(name: &str) -> Result { - let schema = Schema::new(vec![ - Field::new("a", DataType::UInt32, false), - Field::new("b", DataType::UInt32, false), - Field::new("c", DataType::UInt32, false), - ]); - scan_empty(Some(name), &schema, None)?.build() -} - -/// some tests share a common table -pub fn test_table_scan() -> Result { - test_table_scan_with_name("test") -} - pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { let actual: Vec = plan .schema() @@ -259,5 +244,4 @@ fn create_batch(schema: &Schema) -> RecordBatch { pub mod exec; pub mod object_store; -pub mod user_defined; pub mod variable; diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml new file mode 100644 index 0000000000000..4d024f4c51b0a --- /dev/null +++ b/datafusion/optimizer/Cargo.toml @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-optimizer" +description = "DataFusion Query Optimizer" +version = "8.0.0" +homepage = "https://github.com/apache/arrow-datafusion" +repository = "https://github.com/apache/arrow-datafusion" +readme = "README.md" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = [ "datafusion", "query", "optimizer" ] +edition = "2021" +rust-version = "1.59" + +[lib] +name = "datafusion_optimizer" +path = "src/lib.rs" + +[features] +default = ["unicode_expressions"] +unicode_expressions = [] + +[dependencies] +arrow = { version = "15.0.0", features = ["prettyprint"] } +async-trait = "0.1.41" +chrono = { version = "0.4", default-features = false } +datafusion-common = { path = "../common", version = "8.0.0" } +datafusion-expr = { path = "../expr", version = "8.0.0" } +hashbrown = { version = "0.12", features = ["raw"] } +log = "^0.4" diff --git a/datafusion/optimizer/README.md b/datafusion/optimizer/README.md new file mode 100644 index 0000000000000..39d28a8fae37a --- /dev/null +++ b/datafusion/optimizer/README.md @@ -0,0 +1,26 @@ + + +# DataFusion Query Optimizer Rules + +[DataFusion](df) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. + +This crate is a submodule of DataFusion that provides query optimizer rules. + +[df]: https://crates.io/crates/datafusion diff --git a/datafusion/core/src/optimizer/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs similarity index 99% rename from datafusion/core/src/optimizer/common_subexpr_eliminate.rs rename to datafusion/optimizer/src/common_subexpr_eliminate.rs index 916e99713d4a6..bc635c215e2e4 100644 --- a/datafusion/core/src/optimizer/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -17,7 +17,7 @@ //! Eliminate common sub-expression. -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::DataType; use datafusion_common::{DFField, DFSchema, Result}; use datafusion_expr::{ diff --git a/datafusion/core/src/optimizer/eliminate_filter.rs b/datafusion/optimizer/src/eliminate_filter.rs similarity index 98% rename from datafusion/core/src/optimizer/eliminate_filter.rs rename to datafusion/optimizer/src/eliminate_filter.rs index 4bbc2c4019e44..86cd3bb8d7184 100644 --- a/datafusion/core/src/optimizer/eliminate_filter.rs +++ b/datafusion/optimizer/src/eliminate_filter.rs @@ -18,6 +18,7 @@ //! Optimizer rule to replace `where false` on a plan with an empty relation. //! This saves time in planning and executing the query. //! Note that this rule should be applied after simplify expressions optimizer rule. +use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ logical_plan::{EmptyRelation, Filter, LogicalPlan}, @@ -25,8 +26,6 @@ use datafusion_expr::{ Expr, }; -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; - /// Optimization rule that elimanate the scalar value (true/false) filter with an [LogicalPlan::EmptyRelation] #[derive(Default)] pub struct EliminateFilter; diff --git a/datafusion/core/src/optimizer/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs similarity index 98% rename from datafusion/core/src/optimizer/eliminate_limit.rs rename to datafusion/optimizer/src/eliminate_limit.rs index 27e5fab1720d6..f6d3b847288c8 100644 --- a/datafusion/core/src/optimizer/eliminate_limit.rs +++ b/datafusion/optimizer/src/eliminate_limit.rs @@ -17,7 +17,7 @@ //! Optimizer rule to replace `LIMIT 0` on a plan with an empty relation. //! This saves time in planning and executing the query. -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::Result; use datafusion_expr::{ logical_plan::{EmptyRelation, Limit, LogicalPlan}, diff --git a/datafusion/core/src/optimizer/filter_push_down.rs b/datafusion/optimizer/src/filter_push_down.rs similarity index 99% rename from datafusion/core/src/optimizer/filter_push_down.rs rename to datafusion/optimizer/src/filter_push_down.rs index a1a50a3788259..96abdc4c5dedc 100644 --- a/datafusion/core/src/optimizer/filter_push_down.rs +++ b/datafusion/optimizer/src/filter_push_down.rs @@ -14,10 +14,7 @@ //! Filter Push Down optimizer rule ensures that filters are applied as early as possible in the plan -use crate::optimizer::{ - optimizer::{OptimizerConfig, OptimizerRule}, - utils, -}; +use crate::{utils, OptimizerConfig, OptimizerRule}; use datafusion_common::{Column, DFSchema, Result}; use datafusion_expr::{ col, diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs new file mode 100644 index 0000000000000..9a51300508789 --- /dev/null +++ b/datafusion/optimizer/src/lib.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub mod common_subexpr_eliminate; +pub mod eliminate_filter; +pub mod eliminate_limit; +pub mod filter_push_down; +pub mod limit_push_down; +pub mod optimizer; +pub mod projection_push_down; +pub mod single_distinct_to_groupby; +pub mod subquery_filter_to_join; +pub mod utils; + +#[cfg(test)] +pub mod test; + +pub use optimizer::{OptimizerConfig, OptimizerRule}; diff --git a/datafusion/core/src/optimizer/limit_push_down.rs b/datafusion/optimizer/src/limit_push_down.rs similarity index 99% rename from datafusion/core/src/optimizer/limit_push_down.rs rename to datafusion/optimizer/src/limit_push_down.rs index 41fb7cc5ff8a0..91f9760015e02 100644 --- a/datafusion/core/src/optimizer/limit_push_down.rs +++ b/datafusion/optimizer/src/limit_push_down.rs @@ -17,7 +17,7 @@ //! Optimizer rule to push down LIMIT in the query plan //! It will push down through projection, limits (taking the smaller limit) -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ logical_plan::{ diff --git a/datafusion/core/src/optimizer/optimizer.rs b/datafusion/optimizer/src/optimizer.rs similarity index 100% rename from datafusion/core/src/optimizer/optimizer.rs rename to datafusion/optimizer/src/optimizer.rs diff --git a/datafusion/core/src/optimizer/projection_push_down.rs b/datafusion/optimizer/src/projection_push_down.rs similarity index 99% rename from datafusion/core/src/optimizer/projection_push_down.rs rename to datafusion/optimizer/src/projection_push_down.rs index b99b81f5259fb..cd26d886c6f2a 100644 --- a/datafusion/core/src/optimizer/projection_push_down.rs +++ b/datafusion/optimizer/src/projection_push_down.rs @@ -18,7 +18,7 @@ //! Projection Push Down optimizer rule ensures that only referenced columns are //! loaded into memory -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use arrow::datatypes::{Field, Schema}; use arrow::error::Result as ArrowResult; use datafusion_common::{ @@ -530,7 +530,6 @@ mod tests { use super::*; use crate::test::*; - use crate::test_util::scan_empty; use arrow::datatypes::DataType; use datafusion_expr::{ col, lit, diff --git a/datafusion/core/src/optimizer/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs similarity index 98% rename from datafusion/core/src/optimizer/single_distinct_to_groupby.rs rename to datafusion/optimizer/src/single_distinct_to_groupby.rs index d29a2477b125d..c508b9772c34f 100644 --- a/datafusion/core/src/optimizer/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -17,7 +17,7 @@ //! single distinct to group by optimizer rule -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DFSchema, Result}; use datafusion_expr::{ col, @@ -201,10 +201,10 @@ impl OptimizerRule for SingleDistinctToGroupBy { #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::aggregates; use crate::test::*; use datafusion_expr::{ col, count, count_distinct, lit, logical_plan::builder::LogicalPlanBuilder, max, + AggregateFunction, }; fn assert_optimized_plan_eq(plan: &LogicalPlan, expected: &str) { @@ -314,7 +314,7 @@ mod tests { vec![ count_distinct(col("b")), Expr::AggregateFunction { - fun: aggregates::AggregateFunction::Max, + fun: AggregateFunction::Max, distinct: true, args: vec![col("b")], }, diff --git a/datafusion/core/src/optimizer/subquery_filter_to_join.rs b/datafusion/optimizer/src/subquery_filter_to_join.rs similarity index 99% rename from datafusion/core/src/optimizer/subquery_filter_to_join.rs rename to datafusion/optimizer/src/subquery_filter_to_join.rs index bcbd9ae8a73a7..f2621e19098a0 100644 --- a/datafusion/core/src/optimizer/subquery_filter_to_join.rs +++ b/datafusion/optimizer/src/subquery_filter_to_join.rs @@ -26,10 +26,7 @@ //! WHERE t1.f IN (SELECT f FROM t2) OR t2.f = 'x' //! ``` //! won't -use crate::optimizer::{ - optimizer::{OptimizerConfig, OptimizerRule}, - utils, -}; +use crate::{utils, OptimizerConfig, OptimizerRule}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ logical_plan::{ diff --git a/datafusion/optimizer/src/test/mod.rs b/datafusion/optimizer/src/test/mod.rs new file mode 100644 index 0000000000000..86e12bc30c09a --- /dev/null +++ b/datafusion/optimizer/src/test/mod.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion_common::Result; +use datafusion_expr::{logical_plan::table_scan, LogicalPlan, LogicalPlanBuilder}; + +pub mod user_defined; + +/// some tests share a common table with different names +pub fn test_table_scan_with_name(name: &str) -> Result { + let schema = Schema::new(vec![ + Field::new("a", DataType::UInt32, false), + Field::new("b", DataType::UInt32, false), + Field::new("c", DataType::UInt32, false), + ]); + table_scan(Some(name), &schema, None)?.build() +} + +/// some tests share a common table +pub fn test_table_scan() -> Result { + test_table_scan_with_name("test") +} + +/// Scan an empty data source, mainly used in tests +pub fn scan_empty( + name: Option<&str>, + table_schema: &Schema, + projection: Option>, +) -> Result { + table_scan(name, table_schema, projection) +} + +pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { + let actual: Vec = plan + .schema() + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + assert_eq!(actual, expected); +} diff --git a/datafusion/core/src/test/user_defined.rs b/datafusion/optimizer/src/test/user_defined.rs similarity index 93% rename from datafusion/core/src/test/user_defined.rs rename to datafusion/optimizer/src/test/user_defined.rs index 19ed0580b91c5..c9993568c4373 100644 --- a/datafusion/core/src/test/user_defined.rs +++ b/datafusion/optimizer/src/test/user_defined.rs @@ -17,15 +17,17 @@ //! Simple user defined logical plan node for testing +use datafusion_common::DFSchemaRef; +use datafusion_expr::{ + logical_plan::{Extension, UserDefinedLogicalNode}, + Expr, LogicalPlan, +}; use std::{ any::Any, fmt::{self, Debug}, sync::Arc, }; -use crate::logical_plan::plan::Extension; -use crate::logical_plan::{DFSchemaRef, Expr, LogicalPlan, UserDefinedLogicalNode}; - /// Create a new user defined plan node, for testing pub fn new(input: LogicalPlan) -> LogicalPlan { let node = Arc::new(TestUserDefinedPlanNode { input }); diff --git a/datafusion/core/src/optimizer/utils.rs b/datafusion/optimizer/src/utils.rs similarity index 99% rename from datafusion/core/src/optimizer/utils.rs rename to datafusion/optimizer/src/utils.rs index 863536972f735..35414f5f87231 100644 --- a/datafusion/core/src/optimizer/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -17,7 +17,7 @@ //! Collection of utility functions that are leveraged by the query optimizer rules -use crate::optimizer::optimizer::{OptimizerConfig, OptimizerRule}; +use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::{ and,