From aecc83ef182dff76ff9bc84d84873e51cafe62df Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 28 Aug 2023 07:01:55 -0400
Subject: [PATCH 01/15] Split out `datafusion-physical-plan` crate

---
 Cargo.toml                                     |  2 +-
 datafusion/physical-plan/Cargo.toml            | 69 +++++++++++++++++++
 .../src}/aggregates/group_values/mod.rs        |  0
 .../src}/aggregates/group_values/primitive.rs  |  0
 .../src}/aggregates/group_values/row.rs        |  0
 .../src}/aggregates/mod.rs                     |  0
 .../src}/aggregates/no_grouping.rs             |  0
 .../src}/aggregates/order/full.rs              |  0
 .../src}/aggregates/order/mod.rs               |  0
 .../src}/aggregates/order/partial.rs           |  0
 .../src}/aggregates/row_hash.rs                |  0
 .../src}/analyze.rs                            |  0
 .../src}/coalesce_batches.rs                   |  0
 .../src}/coalesce_partitions.rs                |  0
 .../src}/common.rs                             |  0
 .../src}/display.rs                            |  0
 .../src}/empty.rs                              |  0
 .../src}/explain.rs                            |  0
 .../src}/filter.rs                             |  0
 .../src}/insert.rs                             |  0
 .../src}/joins/cross_join.rs                   |  0
 .../src}/joins/hash_join.rs                    |  0
 .../src}/joins/hash_join_utils.rs              |  0
 .../src}/joins/mod.rs                          |  0
 .../src}/joins/nested_loop_join.rs             |  0
 .../src}/joins/sort_merge_join.rs              |  0
 .../src}/joins/symmetric_hash_join.rs          |  0
 .../src}/joins/test_utils.rs                   |  0
 .../src}/joins/utils.rs                        |  0
 .../mod.rs => physical-plan/src/lib.rs}        |  0
 .../src}/limit.rs                              |  0
 .../src}/memory.rs                             |  0
 .../src}/metrics/baseline.rs                   |  0
 .../src}/metrics/builder.rs                    |  0
 .../src}/metrics/mod.rs                        |  0
 .../src}/metrics/value.rs                      |  0
 .../src}/projection.rs                         |  0
 .../src}/repartition/distributor_channels.rs   |  0
 .../src}/repartition/mod.rs                    |  0
 .../src}/sorts/builder.rs                      |  0
 .../src}/sorts/cursor.rs                       |  0
 .../src}/sorts/index.rs                        |  0
 .../src}/sorts/merge.rs                        |  0
 .../src}/sorts/mod.rs                          |  0
 .../src}/sorts/sort.rs                         |  0
 .../src}/sorts/sort_preserving_merge.rs        |  0
 .../src}/sorts/stream.rs                       |  0
 .../src}/stream.rs                             |  0
 .../src}/streaming.rs                          |  0
 .../src}/tree_node.rs                          |  0
 .../src}/udaf.rs                               |  0
 .../src}/union.rs                              |  0
 .../src}/unnest.rs                             |  0
 .../src}/values.rs                             |  0
 .../src}/visitor.rs                            |  0
 .../src}/windows/bounded_window_agg_exec.rs    |  0
 .../src}/windows/mod.rs                        |  0
 .../src}/windows/window_agg_exec.rs            |  0
 58 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 datafusion/physical-plan/Cargo.toml
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/group_values/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/group_values/primitive.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/group_values/row.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/no_grouping.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/order/full.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/order/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/order/partial.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/aggregates/row_hash.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/analyze.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/coalesce_batches.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/coalesce_partitions.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/common.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/display.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/empty.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/explain.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/filter.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/insert.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/cross_join.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/hash_join.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/hash_join_utils.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/nested_loop_join.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/sort_merge_join.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/symmetric_hash_join.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/test_utils.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/joins/utils.rs (100%)
 rename datafusion/{core/src/physical_plan/mod.rs => physical-plan/src/lib.rs} (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/limit.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/memory.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/metrics/baseline.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/metrics/builder.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/metrics/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/metrics/value.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/projection.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/repartition/distributor_channels.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/repartition/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/builder.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/cursor.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/index.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/merge.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/sort.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/sort_preserving_merge.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/sorts/stream.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/stream.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/streaming.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/tree_node.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/udaf.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/union.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/unnest.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/values.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/visitor.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/windows/bounded_window_agg_exec.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/windows/mod.rs (100%)
 rename datafusion/{core/src/physical_plan => physical-plan/src}/windows/window_agg_exec.rs (100%)

diff --git a/Cargo.toml b/Cargo.toml
index 1dae101d2f8fc..ae3ce0bf6cbf6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,7 +17,7 @@
 [workspace]
 exclude = ["datafusion-cli"]
-members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
+members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/physical-plan", "datafusion/proto", "datafusion/proto/gen", "datafusion/sql", "datafusion/sqllogictest", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
 ]
 resolver = "2"
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
new file mode 100644
index 0000000000000..ebf4781d223f6
--- /dev/null
+++ b/datafusion/physical-plan/Cargo.toml
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "datafusion-physical-plan"
+description = "Physical (ExecutionPlan) implementations for DataFusion query engine"
+keywords = ["arrow", "query", "sql"]
+version = { workspace = true }
+edition = { workspace = true }
+readme = { workspace = true }
+homepage = { workspace = true }
+repository = { workspace = true }
+license = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+
+[lib]
+name = "datafusion_physical_plan"
+path = "src/lib.rs"
+
+[features]
+
+[dependencies]
+#ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
+arrow = { workspace = true }
+arrow-array = { workspace = true }
+arrow-buffer = { workspace = true }
+arrow-schema = { workspace = true }
+#base64 = { version = "0.21", optional = true }
+#blake2 = { version = "^0.10.2", optional = true }
+#blake3 = { version = "1.0", optional = true }
+#chrono = { version = "0.4.23", default-features = false }
+datafusion-common = { path = "../common", version = "30.0.0" }
+datafusion-expr = { path = "../expr", version = "30.0.0" }
+datafusion-physical-expr = { path = "../physical-expr", version = "30.0.0" }
+#half = { version = "2.1", default-features = false }
+#hashbrown = { version = "0.14", features = ["raw"] }
+#hex = { version = "0.4", optional = true }
+#indexmap = "2.0.0"
+#itertools = { version = "0.11", features = ["use_std"] }
+#libc = "0.2.140"
+#log = "^0.4"
+#md-5 = { version = "^0.10.0", optional = true }
+#paste = "^1.0"
+#petgraph = "0.6.2"
+#rand = "0.8"
+#regex = { version = "1.8", optional = true }
+#sha2 = { version = "^0.10.1", optional = true }
+#unicode-segmentation = { version = "^1.7.1", optional = true }
+#uuid = { version = "^1.2", features = ["v4"] }
+
+#[dev-dependencies]
+#criterion = "0.5"
+#rand = "0.8"
+#rstest = "0.18.0"
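The new manifest compiles the moved files as a standalone library, but nothing in this patch touches the core crate's public API. For existing `use datafusion::physical_plan::...` imports to keep resolving, core can re-export the new crate under the old module path. Such a shim is not part of this patch; the following is a minimal sketch, assuming the split otherwise leaves the module tree unchanged:

    // Hypothetical addition to datafusion/core/src/lib.rs (not in this diff):
    // expose the extracted crate under the legacy path so that
    // `use datafusion::physical_plan::ExecutionPlan;` still compiles.
    pub mod physical_plan {
        pub use datafusion_physical_plan::*;
    }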
diff --git a/datafusion/core/src/physical_plan/aggregates/group_values/mod.rs b/datafusion/physical-plan/src/aggregates/group_values/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/group_values/mod.rs
rename to datafusion/physical-plan/src/aggregates/group_values/mod.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/group_values/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/primitive.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/group_values/primitive.rs
rename to datafusion/physical-plan/src/aggregates/group_values/primitive.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/group_values/row.rs
rename to datafusion/physical-plan/src/aggregates/group_values/row.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/mod.rs
rename to datafusion/physical-plan/src/aggregates/mod.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/physical-plan/src/aggregates/no_grouping.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/no_grouping.rs
rename to datafusion/physical-plan/src/aggregates/no_grouping.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/order/full.rs b/datafusion/physical-plan/src/aggregates/order/full.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/order/full.rs
rename to datafusion/physical-plan/src/aggregates/order/full.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/order/mod.rs b/datafusion/physical-plan/src/aggregates/order/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/order/mod.rs
rename to datafusion/physical-plan/src/aggregates/order/mod.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/order/partial.rs
rename to datafusion/physical-plan/src/aggregates/order/partial.rs
diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/aggregates/row_hash.rs
rename to datafusion/physical-plan/src/aggregates/row_hash.rs
diff --git a/datafusion/core/src/physical_plan/analyze.rs b/datafusion/physical-plan/src/analyze.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/analyze.rs
rename to datafusion/physical-plan/src/analyze.rs
diff --git a/datafusion/core/src/physical_plan/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/coalesce_batches.rs
rename to datafusion/physical-plan/src/coalesce_batches.rs
diff --git a/datafusion/core/src/physical_plan/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/coalesce_partitions.rs
rename to datafusion/physical-plan/src/coalesce_partitions.rs
diff --git a/datafusion/core/src/physical_plan/common.rs b/datafusion/physical-plan/src/common.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/common.rs
rename to datafusion/physical-plan/src/common.rs
diff --git a/datafusion/core/src/physical_plan/display.rs b/datafusion/physical-plan/src/display.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/display.rs
rename to datafusion/physical-plan/src/display.rs
diff --git a/datafusion/core/src/physical_plan/empty.rs b/datafusion/physical-plan/src/empty.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/empty.rs
rename to datafusion/physical-plan/src/empty.rs
diff --git a/datafusion/core/src/physical_plan/explain.rs b/datafusion/physical-plan/src/explain.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/explain.rs
rename to datafusion/physical-plan/src/explain.rs
diff --git a/datafusion/core/src/physical_plan/filter.rs b/datafusion/physical-plan/src/filter.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/filter.rs
rename to datafusion/physical-plan/src/filter.rs
diff --git a/datafusion/core/src/physical_plan/insert.rs b/datafusion/physical-plan/src/insert.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/insert.rs
rename to datafusion/physical-plan/src/insert.rs
diff --git a/datafusion/core/src/physical_plan/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/cross_join.rs
rename to datafusion/physical-plan/src/joins/cross_join.rs
diff --git a/datafusion/core/src/physical_plan/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/hash_join.rs
rename to datafusion/physical-plan/src/joins/hash_join.rs
diff --git a/datafusion/core/src/physical_plan/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/hash_join_utils.rs
rename to datafusion/physical-plan/src/joins/hash_join_utils.rs
diff --git a/datafusion/core/src/physical_plan/joins/mod.rs b/datafusion/physical-plan/src/joins/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/mod.rs
rename to datafusion/physical-plan/src/joins/mod.rs
diff --git a/datafusion/core/src/physical_plan/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/nested_loop_join.rs
rename to datafusion/physical-plan/src/joins/nested_loop_join.rs
diff --git a/datafusion/core/src/physical_plan/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/sort_merge_join.rs
rename to datafusion/physical-plan/src/joins/sort_merge_join.rs
diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs
rename to datafusion/physical-plan/src/joins/symmetric_hash_join.rs
diff --git a/datafusion/core/src/physical_plan/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/test_utils.rs
rename to datafusion/physical-plan/src/joins/test_utils.rs
diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/joins/utils.rs
rename to datafusion/physical-plan/src/joins/utils.rs
diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/physical-plan/src/lib.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/mod.rs
rename to datafusion/physical-plan/src/lib.rs
diff --git a/datafusion/core/src/physical_plan/limit.rs b/datafusion/physical-plan/src/limit.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/limit.rs
rename to datafusion/physical-plan/src/limit.rs
diff --git a/datafusion/core/src/physical_plan/memory.rs b/datafusion/physical-plan/src/memory.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/memory.rs
rename to datafusion/physical-plan/src/memory.rs
diff --git a/datafusion/core/src/physical_plan/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/metrics/baseline.rs
rename to datafusion/physical-plan/src/metrics/baseline.rs
diff --git a/datafusion/core/src/physical_plan/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/metrics/builder.rs
rename to datafusion/physical-plan/src/metrics/builder.rs
diff --git a/datafusion/core/src/physical_plan/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/metrics/mod.rs
rename to datafusion/physical-plan/src/metrics/mod.rs
diff --git a/datafusion/core/src/physical_plan/metrics/value.rs b/datafusion/physical-plan/src/metrics/value.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/metrics/value.rs
rename to datafusion/physical-plan/src/metrics/value.rs
diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/physical-plan/src/projection.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/projection.rs
rename to datafusion/physical-plan/src/projection.rs
diff --git a/datafusion/core/src/physical_plan/repartition/distributor_channels.rs b/datafusion/physical-plan/src/repartition/distributor_channels.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/repartition/distributor_channels.rs
rename to datafusion/physical-plan/src/repartition/distributor_channels.rs
diff --git a/datafusion/core/src/physical_plan/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/repartition/mod.rs
rename to datafusion/physical-plan/src/repartition/mod.rs
diff --git a/datafusion/core/src/physical_plan/sorts/builder.rs b/datafusion/physical-plan/src/sorts/builder.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/builder.rs
rename to datafusion/physical-plan/src/sorts/builder.rs
diff --git a/datafusion/core/src/physical_plan/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/cursor.rs
rename to datafusion/physical-plan/src/sorts/cursor.rs
diff --git a/datafusion/core/src/physical_plan/sorts/index.rs b/datafusion/physical-plan/src/sorts/index.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/index.rs
rename to datafusion/physical-plan/src/sorts/index.rs
diff --git a/datafusion/core/src/physical_plan/sorts/merge.rs b/datafusion/physical-plan/src/sorts/merge.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/merge.rs
rename to datafusion/physical-plan/src/sorts/merge.rs
diff --git a/datafusion/core/src/physical_plan/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/mod.rs
rename to datafusion/physical-plan/src/sorts/mod.rs
diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/sort.rs
rename to datafusion/physical-plan/src/sorts/sort.rs
diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs
rename to datafusion/physical-plan/src/sorts/sort_preserving_merge.rs
diff --git a/datafusion/core/src/physical_plan/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/sorts/stream.rs
rename to datafusion/physical-plan/src/sorts/stream.rs
diff --git a/datafusion/core/src/physical_plan/stream.rs b/datafusion/physical-plan/src/stream.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/stream.rs
rename to datafusion/physical-plan/src/stream.rs
diff --git a/datafusion/core/src/physical_plan/streaming.rs b/datafusion/physical-plan/src/streaming.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/streaming.rs
rename to datafusion/physical-plan/src/streaming.rs
diff --git a/datafusion/core/src/physical_plan/tree_node.rs b/datafusion/physical-plan/src/tree_node.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/tree_node.rs
rename to datafusion/physical-plan/src/tree_node.rs
diff --git a/datafusion/core/src/physical_plan/udaf.rs b/datafusion/physical-plan/src/udaf.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/udaf.rs
rename to datafusion/physical-plan/src/udaf.rs
diff --git a/datafusion/core/src/physical_plan/union.rs b/datafusion/physical-plan/src/union.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/union.rs
rename to datafusion/physical-plan/src/union.rs
diff --git a/datafusion/core/src/physical_plan/unnest.rs b/datafusion/physical-plan/src/unnest.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/unnest.rs
rename to datafusion/physical-plan/src/unnest.rs
diff --git a/datafusion/core/src/physical_plan/values.rs b/datafusion/physical-plan/src/values.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/values.rs
rename to datafusion/physical-plan/src/values.rs
diff --git a/datafusion/core/src/physical_plan/visitor.rs b/datafusion/physical-plan/src/visitor.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/visitor.rs
rename to datafusion/physical-plan/src/visitor.rs
diff --git a/datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/windows/bounded_window_agg_exec.rs
rename to datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
diff --git a/datafusion/core/src/physical_plan/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/windows/mod.rs
rename to datafusion/physical-plan/src/windows/mod.rs
diff --git a/datafusion/core/src/physical_plan/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs
similarity index 100%
rename from datafusion/core/src/physical_plan/windows/window_agg_exec.rs
rename to datafusion/physical-plan/src/windows/window_agg_exec.rs
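Patch 01 is purely file moves: every rename reports 100% similarity, so no file contents changed, and the new crate cannot yet compile on its own. Because `core/src/physical_plan/mod.rs` became the crate root `physical-plan/src/lib.rs`, each module moved one level up in the tree, and imports written against the old layout no longer resolve; the next patch rewrites them. The pattern, shown on a hypothetical import rather than a line taken from the diff:

    // Old location: module `physical_plan` inside the `datafusion` (core) crate.
    use crate::physical_plan::metrics::ExecutionPlanMetricsSet;

    // New location: `metrics` sits at the root of `datafusion-physical-plan`.
    use crate::metrics::ExecutionPlanMetricsSet;

    // From any other crate, the same item is reached through the new package.
    use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;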
From 70f3fb5487671efddaa1b4abca4479a786fe8438 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 28 Aug 2023 07:13:00 -0400
Subject: [PATCH 02/15] Update paths

---
 datafusion-examples/examples/csv_opener.rs     |  2 +-
 datafusion/physical-plan/Cargo.toml            | 21 +++++---
 .../src/aggregates/group_values/primitive.rs   |  2 +-
 .../src/aggregates/group_values/row.rs         |  2 +-
 .../physical-plan/src/aggregates/mod.rs        |  6 +--
 .../src/aggregates/no_grouping.rs              |  8 +--
 .../src/aggregates/order/full.rs               |  2 +-
 .../src/aggregates/order/partial.rs            |  2 +-
 .../physical-plan/src/aggregates/row_hash.rs   | 10 ++--
 datafusion/physical-plan/src/analyze.rs        |  2 +-
 .../physical-plan/src/coalesce_batches.rs      |  4 +-
 .../physical-plan/src/coalesce_partitions.rs   |  6 +--
 datafusion/physical-plan/src/common.rs         | 10 ++--
 datafusion/physical-plan/src/display.rs        |  8 +--
 datafusion/physical-plan/src/empty.rs          |  6 +--
 datafusion/physical-plan/src/explain.rs        |  4 +-
 datafusion/physical-plan/src/filter.rs         |  8 +--
 datafusion/physical-plan/src/insert.rs         |  2 +-
 .../physical-plan/src/joins/cross_join.rs      |  8 +--
 .../physical-plan/src/joins/hash_join.rs       |  8 +--
 .../src/joins/hash_join_utils.rs               |  6 +--
 .../src/joins/nested_loop_join.rs              | 10 ++--
 .../src/joins/sort_merge_join.rs               | 20 ++++----
 .../src/joins/symmetric_hash_join.rs           | 18 +++----
 datafusion/physical-plan/src/joins/utils.rs    |  6 +--
 datafusion/physical-plan/src/lib.rs            |  6 +--
 datafusion/physical-plan/src/limit.rs          |  6 +--
 datafusion/physical-plan/src/memory.rs         |  6 +--
 datafusion/physical-plan/src/projection.rs     |  6 +--
 .../physical-plan/src/repartition/mod.rs       | 50 +++++++------------
 datafusion/physical-plan/src/sorts/cursor.rs   |  2 +-
 datafusion/physical-plan/src/sorts/merge.rs    | 14 ++----
 datafusion/physical-plan/src/sorts/sort.rs     | 20 ++++----
 .../src/sorts/sort_preserving_merge.rs         | 26 +++++-----
 datafusion/physical-plan/src/sorts/stream.rs   |  6 +--
 datafusion/physical-plan/src/stream.rs         |  2 +-
 datafusion/physical-plan/src/streaming.rs      |  6 +--
 datafusion/physical-plan/src/tree_node.rs      |  2 +-
 datafusion/physical-plan/src/udaf.rs           |  2 +-
 datafusion/physical-plan/src/union.rs          |  6 +--
 datafusion/physical-plan/src/unnest.rs         |  8 +--
 datafusion/physical-plan/src/values.rs         |  2 +-
 .../src/windows/bounded_window_agg_exec.rs     | 10 ++--
 datafusion/physical-plan/src/windows/mod.rs    |  8 +--
 .../src/windows/window_agg_exec.rs             | 12 ++---
 45 files changed, 176 insertions(+), 205 deletions(-)

diff --git a/datafusion-examples/examples/csv_opener.rs b/datafusion-examples/examples/csv_opener.rs
index 0587b515b2d8d..6366f16b7aa3f 100644
--- a/datafusion-examples/examples/csv_opener.rs
+++ b/datafusion-examples/examples/csv_opener.rs
@@ -17,6 +17,7 @@
 use std::{sync::Arc, vec};
 
+use crate::metrics::ExecutionPlanMetricsSet;
 use datafusion::{
     assert_batches_eq,
     datasource::{
@@ -25,7 +26,6 @@ use datafusion::{
         physical_plan::{CsvConfig, CsvOpener, FileScanConfig, FileStream},
     },
     error::Result,
-    physical_plan::metrics::ExecutionPlanMetricsSet,
     test_util::aggr_test_schema,
 };
 use datafusion_common::FileCompressionType;
diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
index ebf4781d223f6..161569513a783 100644
--- a/datafusion/physical-plan/Cargo.toml
+++ b/datafusion/physical-plan/Cargo.toml
@@ -35,31 +35,38 @@ path = "src/lib.rs"
 [features]
 
 [dependencies]
-#ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
+ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
 arrow = { workspace = true }
 arrow-array = { workspace = true }
 arrow-buffer = { workspace = true }
 arrow-schema = { workspace = true }
+async-trait = "0.1.41"
 #base64 = { version = "0.21", optional = true }
 #blake2 = { version = "^0.10.2", optional = true }
 #blake3 = { version = "1.0", optional = true }
-#chrono = { version = "0.4.23", default-features = false }
+chrono = { version = "0.4.23", default-features = false }
 datafusion-common = { path = "../common", version = "30.0.0" }
 datafusion-expr = { path = "../expr", version = "30.0.0" }
+datafusion-execution = { path = "../execution", version = "30.0.0" }
 datafusion-physical-expr = { path = "../physical-expr", version = "30.0.0" }
-#half = { version = "2.1", default-features = false }
-#hashbrown = { version = "0.14", features = ["raw"] }
+futures = "0.3"
+half = { version = "2.1", default-features = false }
+hashbrown = { version = "0.14", features = ["raw"] }
 #hex = { version = "0.4", optional = true }
-#indexmap = "2.0.0"
-#itertools = { version = "0.11", features = ["use_std"] }
+indexmap = "2.0.0"
+itertools = { version = "0.11", features = ["use_std"] }
 #libc = "0.2.140"
-#log = "^0.4"
+log = "^0.4"
 #md-5 = { version = "^0.10.0", optional = true }
+parking_lot = "0.12"
+pin-project-lite = "^0.2.7"
 #paste = "^1.0"
 #petgraph = "0.6.2"
 #rand = "0.8"
 #regex = { version = "1.8", optional = true }
 #sha2 = { version = "^0.10.1", optional = true }
+tempfile = "3"
+tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] }
 #unicode-segmentation = { version = "^1.7.1", optional = true }
 #uuid = { version = "^1.2", features = ["v4"] }
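Two things are worth noting before the per-file hunks. First, the csv_opener.rs change above is a byproduct of the same mechanical rewrite: `crate::metrics` cannot resolve inside an example binary, which the remaining patches in the series presumably revisit. Second, un-commenting `futures`, `tokio`, and friends reflects that the moved operators now drive async record-batch streams directly rather than through core. A minimal sketch of that pattern, assuming `RecordBatchStreamAdapter` keeps the constructor it had inside core:

    use std::sync::Arc;

    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion_common::Result;
    use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
    use datafusion_physical_plan::SendableRecordBatchStream;

    // Wrap an ordinary futures stream of Result<RecordBatch> so it can be
    // returned from ExecutionPlan::execute.
    fn empty_stream() -> SendableRecordBatchStream {
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
        let batches = futures::stream::empty::<Result<RecordBatch>>();
        Box::pin(RecordBatchStreamAdapter::new(schema, batches))
    }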
"rt-multi-thread", "sync", "fs", "parking_lot"] } #unicode-segmentation = { version = "^1.7.1", optional = true } #uuid = { version = "^1.2", features = ["v4"] } diff --git a/datafusion/physical-plan/src/aggregates/group_values/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/primitive.rs index 7b8691c67fdd0..b2cb5a8bcaf38 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/primitive.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::physical_plan::aggregates::group_values::GroupValues; +use crate::aggregates::group_values::GroupValues; use ahash::RandomState; use arrow::array::BooleanBufferBuilder; use arrow::buffer::NullBuffer; diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs index 4eb660d52590f..f215f29dc42ba 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/row.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::physical_plan::aggregates::group_values::GroupValues; +use crate::aggregates::group_values::GroupValues; use ahash::RandomState; use arrow::row::{RowConverter, Rows, SortField}; use arrow_array::ArrayRef; diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 78ef5e37b239d..cb88d132d0d8a 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -17,11 +17,11 @@ //! Aggregates functionalities -use crate::physical_plan::aggregates::{ +use crate::aggregates::{ no_grouping::AggregateStream, row_hash::GroupedHashAggregateStream, }; -use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use crate::physical_plan::{ +use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use crate::{ DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; diff --git a/datafusion/physical-plan/src/aggregates/no_grouping.rs b/datafusion/physical-plan/src/aggregates/no_grouping.rs index 610c24faab1c7..32c0bbc78a5de 100644 --- a/datafusion/physical-plan/src/aggregates/no_grouping.rs +++ b/datafusion/physical-plan/src/aggregates/no_grouping.rs @@ -17,12 +17,12 @@ //! 
Aggregate without grouping columns -use crate::physical_plan::aggregates::{ +use crate::aggregates::{ aggregate_expressions, create_accumulators, finalize_aggregation, AccumulatorItem, AggregateMode, }; -use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; -use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use crate::metrics::{BaselineMetrics, RecordOutput}; +use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::Result; @@ -33,7 +33,7 @@ use std::borrow::Cow; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::physical_plan::filter::batch_filter; +use crate::filter::batch_filter; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use futures::stream::{Stream, StreamExt}; diff --git a/datafusion/physical-plan/src/aggregates/order/full.rs b/datafusion/physical-plan/src/aggregates/order/full.rs index 69b308da7c8ca..f46ee687faf16 100644 --- a/datafusion/physical-plan/src/aggregates/order/full.rs +++ b/datafusion/physical-plan/src/aggregates/order/full.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::physical_expr::EmitTo; +use datafusion_physical_expr::EmitTo; /// Tracks grouping state when the data is ordered entirely by its /// group keys diff --git a/datafusion/physical-plan/src/aggregates/order/partial.rs b/datafusion/physical-plan/src/aggregates/order/partial.rs index 019e61ef26885..8c72d834e729b 100644 --- a/datafusion/physical-plan/src/aggregates/order/partial.rs +++ b/datafusion/physical-plan/src/aggregates/order/partial.rs @@ -15,12 +15,12 @@ // specific language governing permissions and limitations // under the License. 
-use crate::physical_expr::EmitTo; use arrow::row::{OwnedRow, RowConverter, Rows, SortField}; use arrow_array::ArrayRef; use arrow_schema::Schema; use datafusion_common::Result; use datafusion_execution::memory_pool::proxy::VecAllocExt; +use datafusion_physical_expr::EmitTo; use datafusion_physical_expr::PhysicalSortExpr; /// Tracks grouping state when the data is ordered by some subset of diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 4613a2e46443e..e80497ac59a5e 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -28,14 +28,14 @@ use std::vec; use futures::ready; use futures::stream::{Stream, StreamExt}; -use crate::physical_plan::aggregates::group_values::{new_group_values, GroupValues}; -use crate::physical_plan::aggregates::{ +use crate::aggregates::group_values::{new_group_values, GroupValues}; +use crate::aggregates::{ evaluate_group_by, evaluate_many, evaluate_optional, group_schema, AggregateMode, PhysicalGroupBy, }; -use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; -use crate::physical_plan::{aggregates, PhysicalExpr}; -use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use crate::metrics::{BaselineMetrics, RecordOutput}; +use crate::{aggregates, PhysicalExpr}; +use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::*; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::Result; diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index 98fce19a1dd7d..e055400c18696 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use std::{any::Any, time::Instant}; -use crate::physical_plan::{ +use crate::{ display::DisplayableExecutionPlan, DisplayFormatType, ExecutionPlan, Partitioning, Statistics, }; diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index cc32d4163b19b..7e6e129934168 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -23,7 +23,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::physical_plan::{ +use crate::{ DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, }; @@ -309,8 +309,8 @@ pub fn concat_batches( #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::{memory::MemoryExec, repartition::RepartitionExec}; use crate::test::create_vec_batches; + use crate::{memory::MemoryExec, repartition::RepartitionExec}; use arrow::datatypes::{DataType, Field, Schema}; #[tokio::test(flavor = "multi_thread")] diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 78cb7b201f263..296743ba63145 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -26,9 +26,7 @@ use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::stream::{ObservedStream, RecordBatchReceiverStream}; use super::{DisplayAs, SendableRecordBatchStream, Statistics}; -use crate::physical_plan::{ - DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning, -}; +use crate::{DisplayFormatType, EquivalenceProperties, ExecutionPlan, 
Partitioning}; use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, DataFusionError, Result}; @@ -174,11 +172,11 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::physical_plan::{collect, common}; use crate::test::exec::{ assert_strong_count_converges_to_zero, BlockingExec, PanicExec, }; use crate::test::{self, assert_is_pending}; + use crate::{collect, common}; #[tokio::test] async fn merge() -> Result<()> { diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index 787f3eed2673e..c6cfbbfbbac74 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -18,8 +18,8 @@ //! Defines common code used in execution plans use super::SendableRecordBatchStream; -use crate::physical_plan::stream::RecordBatchReceiverStream; -use crate::physical_plan::{ColumnStatistics, ExecutionPlan, Statistics}; +use crate::stream::RecordBatchReceiverStream; +use crate::{ColumnStatistics, ExecutionPlan, Statistics}; use arrow::datatypes::Schema; use arrow::ipc::writer::{FileWriter, IpcWriteOptions}; use arrow::record_batch::RecordBatch; @@ -375,9 +375,9 @@ mod tests { use std::ops::Not; use super::*; - use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::sorts::sort::SortExec; - use crate::physical_plan::union::UnionExec; + use crate::memory::MemoryExec; + use crate::sorts::sort::SortExec; + use crate::union::UnionExec; use arrow::compute::SortOptions; use arrow::{ array::{Float32Array, Float64Array}, diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 3b345bdf9e3ad..0b4379fe96e58 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -26,7 +26,7 @@ use datafusion_common::display::StringifiedPlan; use datafusion_physical_expr::PhysicalSortExpr; use super::{accept, ExecutionPlan, ExecutionPlanVisitor}; -use datafusion_common::display::GraphvizBuilder; +use datafusion_common::display::{GraphvizBuilder, PlanType}; /// Options for controlling how each [`ExecutionPlan`] should format itself #[derive(Debug, Clone, Copy)] @@ -204,11 +204,7 @@ impl<'a> DisplayableExecutionPlan<'a> { } /// format as a `StringifiedPlan` - pub fn to_stringified( - &self, - verbose: bool, - plan_type: crate::logical_expr::PlanType, - ) -> StringifiedPlan { + pub fn to_stringified(&self, verbose: bool, plan_type: PlanType) -> StringifiedPlan { StringifiedPlan::new(plan_type, self.indent(verbose).to_string()) } } diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index b38486991e82f..2f03e0750e7b0 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -20,9 +20,7 @@ use std::any::Any; use std::sync::Arc; -use crate::physical_plan::{ - memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning, -}; +use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::array::{ArrayRef, NullArray}; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; @@ -174,7 +172,7 @@ impl ExecutionPlan for EmptyExec { #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::with_new_children_if_necessary; + use crate::with_new_children_if_necessary; use crate::{physical_plan::common, test_util}; #[tokio::test] diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index 7ee6f268d8151..8d6bf4105f6a1 100644 --- 
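The display.rs hunks above remove the last dependency on core's `logical_expr` re-export: `to_stringified` now takes the `PlanType` that the diff confirms lives in `datafusion-common` alongside `StringifiedPlan` and `GraphvizBuilder`. A sketch of calling the slimmed-down signature; the variant name is an assumption for illustration, not taken from this diff:

    use datafusion_common::display::{PlanType, StringifiedPlan};

    // Render any displayable plan into a StringifiedPlan, tagging it with
    // the phase it belongs to (variant name assumed).
    fn stringify(plan: &dyn std::fmt::Display) -> StringifiedPlan {
        StringifiedPlan::new(PlanType::FinalPhysicalPlan, plan.to_string())
    }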
diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs
index 7ee6f268d8151..8d6bf4105f6a1 100644
--- a/datafusion/physical-plan/src/explain.rs
+++ b/datafusion/physical-plan/src/explain.rs
@@ -24,13 +24,13 @@
 use datafusion_common::display::StringifiedPlan;
 use datafusion_common::{internal_err, DataFusionError, Result};
 
-use crate::physical_plan::{DisplayFormatType, ExecutionPlan, Partitioning, Statistics};
+use crate::{DisplayFormatType, ExecutionPlan, Partitioning, Statistics};
 use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch};
 use log::trace;
 
 use super::DisplayAs;
 use super::{expressions::PhysicalSortExpr, SendableRecordBatchStream};
-use crate::physical_plan::stream::RecordBatchStreamAdapter;
+use crate::stream::RecordBatchStreamAdapter;
 use datafusion_execution::TaskContext;
 
 /// Explain execution plan operator. This operator contains the string
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 0b878814e305a..1030eeba42508 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -28,7 +28,7 @@ use super::{
     ColumnStatistics, DisplayAs, RecordBatchStream, SendableRecordBatchStream,
     Statistics,
 };
-use crate::physical_plan::{
+use crate::{
     metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
     Column, DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning,
 };
@@ -378,12 +378,12 @@ pub type EqualAndNonEqual<'a> =
 mod tests {
     use super::*;
-    use crate::physical_plan::expressions::*;
-    use crate::physical_plan::ExecutionPlan;
-    use crate::physical_plan::{collect, with_new_children_if_necessary};
+    use crate::expressions::*;
     use crate::test;
     use crate::test::exec::StatisticsExec;
     use crate::test_util;
+    use crate::ExecutionPlan;
+    use crate::{collect, with_new_children_if_necessary};
     use arrow::datatypes::{DataType, Field, Schema};
     use datafusion_common::utils::DataPtr;
     use datafusion_common::ColumnStatistics;
diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs
index 8c03fb543f5b1..e60afcbcb0412 100644
--- a/datafusion/physical-plan/src/insert.rs
+++ b/datafusion/physical-plan/src/insert.rs
@@ -35,7 +35,7 @@ use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
-use crate::physical_plan::stream::RecordBatchStreamAdapter;
+use crate::stream::RecordBatchStreamAdapter;
 use datafusion_common::{exec_err, internal_err, DataFusionError};
 use datafusion_execution::TaskContext;
 
diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs
index 256942754350e..e5302420c27a7 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -25,9 +25,9 @@ use std::{any::Any, sync::Arc, task::Poll};
 use arrow::datatypes::{Fields, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 
-use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
-use crate::physical_plan::DisplayAs;
-use crate::physical_plan::{
+use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::DisplayAs;
+use crate::{
     coalesce_batches::concat_batches, coalesce_partitions::CoalescePartitionsExec,
     ColumnStatistics, DisplayFormatType, Distribution, EquivalenceProperties,
     ExecutionPlan, Partitioning, PhysicalSortExpr, RecordBatchStream,
@@ -458,8 +458,8 @@ impl CrossJoinStream {
 mod tests {
     use super::*;
     use crate::assert_batches_sorted_eq;
+    use crate::common;
     use crate::common::assert_contains;
-    use crate::physical_plan::common;
     use crate::test::{build_table_scan_i32, columns};
     use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv};
diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs
index e0e522b2150af..75807863198ce 100644
--- a/datafusion/physical-plan/src/joins/hash_join.rs
+++ b/datafusion/physical-plan/src/joins/hash_join.rs
@@ -24,13 +24,13 @@ use std::sync::Arc;
 use std::task::Poll;
 use std::{any::Any, usize, vec};
 
-use crate::physical_plan::joins::utils::{
+use crate::joins::utils::{
     adjust_indices_by_join_type, apply_join_filter_to_indices, build_batch_from_indices,
     calculate_join_output_ordering, combine_join_ordering_equivalence_properties,
     get_final_indices_from_bit_map, need_produce_result_in_final, JoinSide,
 };
-use crate::physical_plan::DisplayAs;
-use crate::physical_plan::{
+use crate::DisplayAs;
+use crate::{
     coalesce_batches::concat_batches,
     coalesce_partitions::CoalescePartitionsExec,
     expressions::Column,
@@ -2765,7 +2765,7 @@ mod tests {
         let stream = join.execute(0, task_ctx).unwrap();
 
         // Expect that an error is returned
-        let result_string = crate::physical_plan::common::collect(stream)
+        let result_string = crate::common::collect(stream)
             .await
             .unwrap_err()
             .to_string();
diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
index ac0b183818147..bb7976345800d 100644
--- a/datafusion/physical-plan/src/joins/hash_join_utils.rs
+++ b/datafusion/physical-plan/src/joins/hash_join_utils.rs
@@ -24,8 +24,8 @@ use std::ops::IndexMut;
 use std::sync::Arc;
 use std::{fmt, usize};
 
-use crate::physical_plan::joins::utils::{JoinFilter, JoinSide};
-use crate::physical_plan::ExecutionPlan;
+use crate::joins::utils::{JoinFilter, JoinSide};
+use crate::ExecutionPlan;
 
 use arrow::compute::concat_batches;
 use arrow::datatypes::{ArrowNativeType, SchemaRef};
@@ -830,7 +830,7 @@ pub fn record_visited_indices(
 #[cfg(test)]
 pub mod tests {
     use super::*;
-    use crate::physical_plan::{
+    use crate::{
         expressions::Column,
         expressions::PhysicalSortExpr,
         joins::utils::{ColumnIndex, JoinFilter, JoinSide},
diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs
index 618dd66e69a09..b66454c732262 100644
--- a/datafusion/physical-plan/src/joins/nested_loop_join.rs
+++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs
@@ -19,7 +19,7 @@
 //! The nested loop join can execute in parallel by partitions and it is
 //! determined by the [`JoinType`].
 
-use crate::physical_plan::joins::utils::{
+use crate::joins::utils::{
     append_right_indices, apply_join_filter_to_indices, build_batch_from_indices,
     build_join_schema, check_join_is_valid, combine_join_equivalence_properties,
     estimate_join_statistics, get_anti_indices, get_anti_u64_indices,
@@ -27,8 +27,8 @@ use crate::physical_plan::joins::utils::{
     partitioned_join_output_partitioning, BuildProbeJoinMetrics, ColumnIndex,
     JoinFilter, JoinSide, OnceAsync, OnceFut,
 };
-use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
-use crate::physical_plan::{
+use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::{
     DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
     RecordBatchStream, SendableRecordBatchStream,
 };
@@ -48,7 +48,7 @@ use std::fmt::Formatter;
 use std::sync::Arc;
 use std::task::Poll;
 
-use crate::physical_plan::coalesce_batches::concat_batches;
+use crate::coalesce_batches::concat_batches;
 use datafusion_common::Result;
 use datafusion_execution::memory_pool::MemoryConsumer;
 use datafusion_execution::TaskContext;
@@ -753,7 +753,7 @@ mod tests {
     use arrow::datatypes::{DataType, Field};
     use datafusion_expr::Operator;
 
-    use crate::physical_plan::joins::utils::JoinSide;
+    use crate::joins::utils::JoinSide;
     use datafusion_common::ScalarValue;
     use datafusion_physical_expr::expressions::Literal;
     use datafusion_physical_expr::PhysicalExpr;
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs
index 3de98f5452c57..e85f4bcb2ecc3 100644
--- a/datafusion/physical-plan/src/joins/sort_merge_join.rs
+++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs
@@ -30,15 +30,15 @@ use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use crate::physical_plan::expressions::Column;
-use crate::physical_plan::expressions::PhysicalSortExpr;
-use crate::physical_plan::joins::utils::{
+use crate::expressions::Column;
+use crate::expressions::PhysicalSortExpr;
+use crate::joins::utils::{
     build_join_schema, calculate_join_output_ordering, check_join_is_valid,
     combine_join_equivalence_properties, combine_join_ordering_equivalence_properties,
     estimate_join_statistics, partitioned_join_output_partitioning, JoinOn, JoinSide,
 };
-use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
-use crate::physical_plan::{
+use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
+use crate::{
     metrics, DisplayAs, DisplayFormatType, Distribution, EquivalenceProperties,
     ExecutionPlan, Partitioning, PhysicalExpr, RecordBatchStream,
     SendableRecordBatchStream, Statistics,
@@ -1393,13 +1393,13 @@ mod tests {
     use datafusion_execution::TaskContext;
 
     use crate::common::assert_contains;
-    use crate::physical_plan::expressions::Column;
-    use crate::physical_plan::joins::utils::JoinOn;
-    use crate::physical_plan::joins::SortMergeJoinExec;
-    use crate::physical_plan::memory::MemoryExec;
-    use crate::physical_plan::{common, ExecutionPlan};
+    use crate::expressions::Column;
+    use crate::joins::utils::JoinOn;
+    use crate::joins::SortMergeJoinExec;
+    use crate::memory::MemoryExec;
     use crate::test::{build_table_i32, columns};
     use crate::{assert_batches_eq, assert_batches_sorted_eq};
+    use crate::{common, ExecutionPlan};
     use datafusion_common::JoinType;
     use datafusion_common::Result;
     use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv};
diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
index 1c664adfbb715..69f7aaf840dfe 100644
--- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
+++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
@@ -32,19 +32,17 @@ use std::task::Poll;
 use std::vec;
 use std::{any::Any, usize};
 
-use crate::physical_plan::common::SharedMemoryReservation;
-use crate::physical_plan::joins::hash_join::{
-    build_equal_condition_join_indices, update_hash,
-};
-use crate::physical_plan::joins::hash_join_utils::{
+use crate::common::SharedMemoryReservation;
+use crate::joins::hash_join::{build_equal_condition_join_indices, update_hash};
+use crate::joins::hash_join_utils::{
     build_filter_expression_graph, calculate_filter_expr_intervals, combine_two_batches,
     convert_sort_expr_with_filter_schema, get_pruning_anti_indices,
     get_pruning_semi_indices, record_visited_indices, IntervalCalculatorInnerState,
     PruningJoinHashMap,
 };
-use crate::physical_plan::joins::StreamJoinPartitionMode;
-use crate::physical_plan::DisplayAs;
-use crate::physical_plan::{
+use crate::joins::StreamJoinPartitionMode;
+use crate::DisplayAs;
+use crate::{
     expressions::Column,
     expressions::PhysicalSortExpr,
     joins::{
@@ -1220,9 +1218,9 @@ mod tests {
     use datafusion_physical_expr::expressions::{binary, col, Column};
     use datafusion_physical_expr::intervals::test_utils::gen_conjunctive_numerical_expr;
 
-    use crate::physical_plan::joins::hash_join_utils::tests::complicated_filter;
+    use crate::joins::hash_join_utils::tests::complicated_filter;
 
-    use crate::physical_plan::joins::test_utils::{
+    use crate::joins::test_utils::{
         build_sides_record_batches, compare_batches, create_memory_table,
         join_expr_tests_fixture_f64, join_expr_tests_fixture_i32,
         join_expr_tests_fixture_temporal, partitioned_hash_join_with_filter,
diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs
index bd3de1acbf0fd..e33de001df304 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -25,9 +25,9 @@ use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::usize;
 
-use crate::physical_plan::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder};
-use crate::physical_plan::SchemaRef;
-use crate::physical_plan::{
+use crate::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder};
+use crate::SchemaRef;
+use crate::{
     ColumnStatistics, EquivalenceProperties, ExecutionPlan, Partitioning, Statistics,
 };
 
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 89e3852a3e972..a34e8e651b3bd 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -23,9 +23,9 @@ use self::metrics::MetricsSet;
 use self::{
     coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan,
 };
-use crate::physical_plan::expressions::PhysicalSortExpr;
 use datafusion_common::Result;
 pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
+use datafusion_physical_expr::PhysicalSortExpr;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
 
 use arrow::datatypes::SchemaRef;
@@ -397,8 +397,8 @@ pub mod unnest;
 pub mod values;
 pub mod windows;
 
-use crate::physical_plan::repartition::RepartitionExec;
-use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use crate::repartition::RepartitionExec;
+use crate::sorts::sort_preserving_merge::SortPreservingMergeExec;
 pub use datafusion_common::utils::project_schema;
 use datafusion_execution::TaskContext;
 pub use datafusion_physical_expr::{
diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs
index 87a07f8d46fec..360b0d6179ed0 100644
--- a/datafusion/physical-plan/src/limit.rs
+++ b/datafusion/physical-plan/src/limit.rs
@@ -22,7 +22,7 @@ use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use crate::physical_plan::{
+use crate::{
     DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning,
 };
 
@@ -525,8 +525,8 @@ mod tests {
     use tempfile::TempDir;
 
     use super::*;
-    use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec;
-    use crate::physical_plan::common;
+    use crate::coalesce_partitions::CoalescePartitionsExec;
+    use crate::common;
     use crate::test;
 
     #[tokio::test]
diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs
index 877410c97ca54..d36d93d29edd0 100644
--- a/datafusion/physical-plan/src/memory.rs
+++ b/datafusion/physical-plan/src/memory.rs
@@ -30,7 +30,7 @@ use std::any::Any;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use crate::physical_plan::ordering_equivalence_properties_helper;
+use crate::ordering_equivalence_properties_helper;
 use datafusion_common::DataFusionError;
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::{LexOrdering, OrderingEquivalenceProperties};
@@ -260,8 +260,8 @@ impl RecordBatchStream for MemoryStream {
 #[cfg(test)]
 mod tests {
-    use crate::physical_plan::memory::MemoryExec;
-    use crate::physical_plan::ExecutionPlan;
+    use crate::memory::MemoryExec;
+    use crate::ExecutionPlan;
     use arrow_schema::{DataType, Field, Schema, SortOptions};
     use datafusion_physical_expr::expressions::col;
     use datafusion_physical_expr::PhysicalSortExpr;
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index 12c89eee19312..1de23d4b89267 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -29,7 +29,7 @@ use std::task::{Context, Poll};
 use super::expressions::{Column, PhysicalSortExpr};
 use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use super::{DisplayAs, RecordBatchStream, SendableRecordBatchStream, Statistics};
-use crate::physical_plan::{
+use crate::{
     ColumnStatistics, DisplayFormatType, EquivalenceProperties, ExecutionPlan,
     Partitioning, PhysicalExpr,
 };
@@ -508,8 +508,8 @@ impl RecordBatchStream for ProjectionStream {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::physical_plan::common::collect;
-    use crate::physical_plan::expressions::{self, col};
+    use crate::common::collect;
+    use crate::expressions::{self, col};
     use crate::test::{self};
     use crate::test_util;
     use arrow_schema::DataType;
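The repartition hunks that follow repeatedly shorten `crate::physical_plan::common::collect` to `crate::common::collect` in tests; the helper itself is unchanged. A sketch of the same call shape from outside the crate; the one-column MemoryExec setup is illustrative, not taken from the diff:

    use std::sync::Arc;

    use arrow::array::Int32Array;
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion_common::Result;
    use datafusion_execution::TaskContext;
    use datafusion_physical_plan::{common, memory::MemoryExec, ExecutionPlan};

    async fn roundtrip() -> Result<()> {
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
        let batch = RecordBatch::try_new(
            schema.clone(),
            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
        )?;
        let exec = MemoryExec::try_new(&[vec![batch]], schema, None)?;
        // Execute partition 0 and drain the stream into a Vec<RecordBatch>.
        let stream = exec.execute(0, Arc::new(TaskContext::default()))?;
        let batches = common::collect(stream).await?;
        assert_eq!(batches.len(), 1);
        Ok(())
    }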
crate::metrics::BaselineMetrics; +use crate::repartition::distributor_channels::{channels, partition_aware_channels}; +use crate::sorts::streaming_merge; +use crate::{ DisplayFormatType, EquivalenceProperties, ExecutionPlan, Partitioning, Statistics, }; @@ -1075,7 +1073,7 @@ mod tests { let output_stream = exec.execute(0, task_ctx).unwrap(); // Expect that an error is returned - let result_string = crate::physical_plan::common::collect(output_stream) + let result_string = crate::common::collect(output_stream) .await .unwrap_err() .to_string(); @@ -1101,7 +1099,7 @@ mod tests { let output_stream = exec.execute(0, task_ctx).unwrap(); // Expect that an error is returned - let result_string = crate::physical_plan::common::collect(output_stream) + let result_string = crate::common::collect(output_stream) .await .unwrap_err() .to_string(); @@ -1134,7 +1132,7 @@ mod tests { let output_stream = exec.execute(0, task_ctx).unwrap(); // Expect that an error is returned - let result_string = crate::physical_plan::common::collect(output_stream) + let result_string = crate::common::collect(output_stream) .await .unwrap_err() .to_string(); @@ -1182,9 +1180,7 @@ mod tests { assert_batches_sorted_eq!(&expected, &expected_batches); let output_stream = exec.execute(0, task_ctx).unwrap(); - let batches = crate::physical_plan::common::collect(output_stream) - .await - .unwrap(); + let batches = crate::common::collect(output_stream).await.unwrap(); assert_batches_sorted_eq!(&expected, &batches); } @@ -1211,9 +1207,7 @@ mod tests { input.wait().await; // output stream 1 should *not* error and have one of the input batches - let batches = crate::physical_plan::common::collect(output_stream1) - .await - .unwrap(); + let batches = crate::common::collect(output_stream1).await.unwrap(); let expected = vec![ "+------------------+", @@ -1236,7 +1230,7 @@ mod tests { async fn hash_repartition_with_dropping_output_stream() { let task_ctx = Arc::new(TaskContext::default()); let partitioning = Partitioning::Hash( - vec![Arc::new(crate::physical_plan::expressions::Column::new( + vec![Arc::new(crate::expressions::Column::new( "my_awesome_field", 0, ))], @@ -1248,9 +1242,7 @@ mod tests { let exec = RepartitionExec::try_new(input.clone(), partitioning.clone()).unwrap(); let output_stream1 = exec.execute(1, task_ctx.clone()).unwrap(); input.wait().await; - let batches_without_drop = crate::physical_plan::common::collect(output_stream1) - .await - .unwrap(); + let batches_without_drop = crate::common::collect(output_stream1).await.unwrap(); // run some checks on the result let items_vec = str_batches_to_vec(&batches_without_drop); @@ -1272,9 +1264,7 @@ mod tests { // *before* any outputs are produced std::mem::drop(output_stream0); input.wait().await; - let batches_with_drop = crate::physical_plan::common::collect(output_stream1) - .await - .unwrap(); + let batches_with_drop = crate::common::collect(output_stream1).await.unwrap(); assert_eq!(batches_without_drop, batches_with_drop); } @@ -1359,22 +1349,16 @@ mod tests { )]) .unwrap(); let partitioning = Partitioning::Hash( - vec![Arc::new(crate::physical_plan::expressions::Column::new( - "a", 0, - ))], + vec![Arc::new(crate::expressions::Column::new("a", 0))], 2, ); let schema = batch.schema(); let input = MockExec::new(vec![Ok(batch)], schema); let exec = RepartitionExec::try_new(Arc::new(input), partitioning).unwrap(); let output_stream0 = exec.execute(0, task_ctx.clone()).unwrap(); - let batch0 = crate::physical_plan::common::collect(output_stream0) - .await - .unwrap(); + let 
batch0 = crate::common::collect(output_stream0).await.unwrap(); let output_stream1 = exec.execute(1, task_ctx.clone()).unwrap(); - let batch1 = crate::physical_plan::common::collect(output_stream1) - .await - .unwrap(); + let batch1 = crate::common::collect(output_stream1).await.unwrap(); assert!(batch0.is_empty() || batch1.is_empty()); Ok(()) } diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index c0c791288644b..baa417649fb08 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::physical_plan::sorts::sort::SortOptions; +use crate::sorts::sort::SortOptions; use arrow::buffer::ScalarBuffer; use arrow::datatypes::ArrowNativeTypeOp; use arrow::row::{Row, Rows}; diff --git a/datafusion/physical-plan/src/sorts/merge.rs b/datafusion/physical-plan/src/sorts/merge.rs index f8a1457dd62a1..67685509abe5b 100644 --- a/datafusion/physical-plan/src/sorts/merge.rs +++ b/datafusion/physical-plan/src/sorts/merge.rs @@ -18,15 +18,11 @@ //! Merge that deals with an arbitrary size of streaming inputs. //! This is an order-preserving merge. -use crate::physical_plan::metrics::BaselineMetrics; -use crate::physical_plan::sorts::builder::BatchBuilder; -use crate::physical_plan::sorts::cursor::Cursor; -use crate::physical_plan::sorts::stream::{ - FieldCursorStream, PartitionedStream, RowCursorStream, -}; -use crate::physical_plan::{ - PhysicalSortExpr, RecordBatchStream, SendableRecordBatchStream, -}; +use crate::metrics::BaselineMetrics; +use crate::sorts::builder::BatchBuilder; +use crate::sorts::cursor::Cursor; +use crate::sorts::stream::{FieldCursorStream, PartitionedStream, RowCursorStream}; +use crate::{PhysicalSortExpr, RecordBatchStream, SendableRecordBatchStream}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::*; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 82badb7d879c9..695272767696d 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -19,14 +19,14 @@ //! It will do in-memory sorting if it has enough memory budget //! but spills to disk if needed. 
-use crate::physical_plan::common::{spawn_buffered, IPCWriter}; -use crate::physical_plan::expressions::PhysicalSortExpr; -use crate::physical_plan::metrics::{ +use crate::common::{spawn_buffered, IPCWriter}; +use crate::expressions::PhysicalSortExpr; +use crate::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, }; -use crate::physical_plan::sorts::merge::streaming_merge; -use crate::physical_plan::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; -use crate::physical_plan::{ +use crate::sorts::merge::streaming_merge; +use crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; +use crate::{ DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; @@ -890,10 +890,10 @@ impl ExecutionPlan for SortExec { #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; - use crate::physical_plan::collect; - use crate::physical_plan::expressions::col; - use crate::physical_plan::memory::MemoryExec; + use crate::coalesce_partitions::CoalescePartitionsExec; + use crate::collect; + use crate::expressions::col; + use crate::memory::MemoryExec; use crate::test; use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index c9f693a8a24e4..507d66c920fb5 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -20,13 +20,11 @@ use std::any::Any; use std::sync::Arc; -use crate::physical_plan::common::spawn_buffered; -use crate::physical_plan::expressions::PhysicalSortExpr; -use crate::physical_plan::metrics::{ - BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, -}; -use crate::physical_plan::sorts::streaming_merge; -use crate::physical_plan::{ +use crate::common::spawn_buffered; +use crate::expressions::PhysicalSortExpr; +use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use crate::sorts::streaming_merge; +use crate::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, }; @@ -280,16 +278,16 @@ mod tests { use futures::{FutureExt, StreamExt}; use tempfile::TempDir; - use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; - use crate::physical_plan::expressions::col; - use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::metrics::MetricValue; - use crate::physical_plan::sorts::sort::SortExec; - use crate::physical_plan::stream::RecordBatchReceiverStream; - use crate::physical_plan::{collect, common}; + use crate::coalesce_partitions::CoalescePartitionsExec; + use crate::expressions::col; + use crate::memory::MemoryExec; + use crate::metrics::MetricValue; + use crate::sorts::sort::SortExec; + use crate::stream::RecordBatchReceiverStream; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, assert_is_pending}; use crate::{assert_batches_eq, test_util}; + use crate::{collect, common}; use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; use super::*; diff --git a/datafusion/physical-plan/src/sorts/stream.rs b/datafusion/physical-plan/src/sorts/stream.rs index 9ef13b7eb25e4..a7f9e7380c473 100644 --- a/datafusion/physical-plan/src/sorts/stream.rs +++ 
b/datafusion/physical-plan/src/sorts/stream.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::physical_plan::sorts::cursor::{FieldArray, FieldCursor, RowCursor}; -use crate::physical_plan::SendableRecordBatchStream; -use crate::physical_plan::{PhysicalExpr, PhysicalSortExpr}; +use crate::sorts::cursor::{FieldArray, FieldCursor, RowCursor}; +use crate::SendableRecordBatchStream; +use crate::{PhysicalExpr, PhysicalSortExpr}; use arrow::array::Array; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs index 55683a5df3d7b..a3fb856c326d0 100644 --- a/datafusion/physical-plan/src/stream.rs +++ b/datafusion/physical-plan/src/stream.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use std::task::Context; use std::task::Poll; -use crate::physical_plan::displayable; +use crate::displayable; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::DataFusionError; use datafusion_common::Result; diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 6c33f88a3991e..00809b71e4431 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -28,9 +28,9 @@ use datafusion_common::{internal_err, plan_err, DataFusionError, Result, Statist use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; use log::debug; -use crate::physical_plan::display::{OutputOrderingDisplay, ProjectSchemaDisplay}; -use crate::physical_plan::stream::RecordBatchStreamAdapter; -use crate::physical_plan::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; +use crate::display::{OutputOrderingDisplay, ProjectSchemaDisplay}; +use crate::stream::RecordBatchStreamAdapter; +use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use datafusion_execution::TaskContext; use super::{DisplayAs, DisplayFormatType}; diff --git a/datafusion/physical-plan/src/tree_node.rs b/datafusion/physical-plan/src/tree_node.rs index fad6508fdabef..bce906a00c4d8 100644 --- a/datafusion/physical-plan/src/tree_node.rs +++ b/datafusion/physical-plan/src/tree_node.rs @@ -17,7 +17,7 @@ //! This module provides common traits for visiting or rewriting tree nodes easily. 
-use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; +use crate::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_common::tree_node::{DynTreeNode, Transformed}; use datafusion_common::Result; use std::sync::Arc; diff --git a/datafusion/physical-plan/src/udaf.rs b/datafusion/physical-plan/src/udaf.rs index 70a43bb397f30..7cc3cc7d59fed 100644 --- a/datafusion/physical-plan/src/udaf.rs +++ b/datafusion/physical-plan/src/udaf.rs @@ -27,9 +27,9 @@ use arrow::{ }; use super::{expressions::format_state_name, Accumulator, AggregateExpr}; -use crate::physical_plan::PhysicalExpr; use datafusion_common::{not_impl_err, DataFusionError, Result}; pub use datafusion_expr::AggregateUDF; +use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::aggregate::utils::down_cast_any_ref; use std::sync::Arc; diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 491d24c2897b8..8e0d871e0e34a 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -41,9 +41,9 @@ use super::{ ColumnStatistics, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, Statistics, }; -use crate::physical_plan::common::get_meet_of_orderings; -use crate::physical_plan::stream::ObservedStream; -use crate::physical_plan::{expressions, metrics::BaselineMetrics}; +use crate::common::get_meet_of_orderings; +use crate::stream::ObservedStream; +use crate::{expressions, metrics::BaselineMetrics}; use datafusion_common::Result; use datafusion_execution::TaskContext; use tokio::macros::support::thread_rng_n; diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 40c4edc953309..410ea97887e0c 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -37,7 +37,7 @@ use log::trace; use std::time::Instant; use std::{any::Any, sync::Arc}; -use crate::physical_plan::{ +use crate::{ expressions::Column, DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, PhysicalExpr, PhysicalSortExpr, RecordBatchStream, SendableRecordBatchStream, Statistics, @@ -309,7 +309,7 @@ fn build_batch_generic_list Date: Mon, 28 Aug 2023 08:14:00 -0400 Subject: [PATCH 03/15] Update tests --- datafusion-cli/Cargo.lock | 121 ++++-- datafusion-examples/examples/csv_opener.rs | 2 +- datafusion/core/Cargo.toml | 1 + .../physical_plan/file_scan_config.rs | 30 +- datafusion/core/src/lib.rs | 6 +- .../enforce_distribution.rs | 21 +- .../src/physical_optimizer/join_selection.rs | 4 +- .../src/physical_optimizer/sort_pushdown.rs | 20 +- datafusion/core/src/physical_planner.rs | 4 +- datafusion/core/src/test/mod.rs | 161 ++++---- datafusion/core/tests/sql/displayable.rs | 57 +++ datafusion/core/tests/sql/mod.rs | 1 + datafusion/physical-plan/Cargo.toml | 31 +- .../physical-plan/src/aggregates/mod.rs | 49 +-- datafusion/physical-plan/src/analyze.rs | 2 +- .../physical-plan/src/coalesce_batches.rs | 21 +- .../physical-plan/src/coalesce_partitions.rs | 8 +- datafusion/physical-plan/src/display.rs | 3 +- datafusion/physical-plan/src/empty.rs | 12 +- datafusion/physical-plan/src/filter.rs | 59 +-- .../physical-plan/src/joins/cross_join.rs | 13 +- .../physical-plan/src/joins/hash_join.rs | 41 ++- .../src/joins/nested_loop_join.rs | 17 +- .../src/joins/sort_merge_join.rs | 32 +- .../physical-plan/src/joins/test_utils.rs | 10 +- datafusion/physical-plan/src/lib.rs | 42 +-- datafusion/physical-plan/src/limit.rs 
| 52 +-- .../physical-plan/src/metrics/baseline.rs | 2 +- .../physical-plan/src/metrics/builder.rs | 2 +- datafusion/physical-plan/src/metrics/mod.rs | 2 +- datafusion/physical-plan/src/projection.rs | 100 +---- .../physical-plan/src/repartition/mod.rs | 23 +- datafusion/physical-plan/src/sorts/sort.rs | 118 ++---- .../src/sorts/sort_preserving_merge.rs | 98 ++--- datafusion/physical-plan/src/test.rs | 343 ++++++++++++++++++ .../{core => physical-plan}/src/test/exec.rs | 17 +- datafusion/physical-plan/src/union.rs | 12 +- datafusion/physical-plan/src/values.rs | 31 +- datafusion/physical-plan/src/windows/mod.rs | 208 +---------- 39 files changed, 909 insertions(+), 867 deletions(-) create mode 100644 datafusion/core/tests/sql/displayable.rs create mode 100644 datafusion/physical-plan/src/test.rs rename datafusion/{core => physical-plan}/src/test/exec.rs (98%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 3e16b756a01ba..0c6c6846a89a8 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -77,9 +77,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" +checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" [[package]] name = "arrayref" @@ -347,7 +347,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -674,9 +674,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.3" +version = "0.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "414dcefbc63d77c526a76b3afcf6fbb9b5e2791c19c3aa2297733208750c6e53" +checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" [[package]] name = "base64-simd" @@ -1031,7 +1031,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f34ba9a9bcb8645379e9de8cb3ecfcf4d1c85ba66d90deb3259206fa5aa193b" dependencies = [ "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -1066,6 +1066,7 @@ dependencies = [ "datafusion-expr", "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-physical-plan", "datafusion-sql", "flate2", "futures", @@ -1111,7 +1112,7 @@ dependencies = [ "parking_lot", "predicates", "regex", - "rstest", + "rstest 0.17.0", "rustyline", "tokio", "url", @@ -1217,6 +1218,35 @@ dependencies = [ "uuid", ] +[[package]] +name = "datafusion-physical-plan" +version = "31.0.0" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "futures", + "half", + "hashbrown 0.14.0", + "indexmap 2.0.0", + "itertools 0.11.0", + "log", + "parking_lot", + "pin-project-lite", + "rand", + "rstest 0.18.2", + "tempfile", + "tokio", +] + [[package]] name = "datafusion-sql" version = "31.0.0" @@ -1502,7 +1532,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -1959,9 +1989,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.5" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" +checksum = 
"1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" [[package]] name = "lock_api" @@ -2388,7 +2418,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -2602,6 +2632,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "relative-path" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c707298afce11da2efef2f600116fa93ffa7a032b5d7b628aa17711ec81383ca" + [[package]] name = "reqwest" version = "0.11.20" @@ -2666,7 +2702,19 @@ checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" dependencies = [ "futures", "futures-timer", - "rstest_macros", + "rstest_macros 0.17.0", + "rustc_version", +] + +[[package]] +name = "rstest" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97eeab2f3c0a199bc4be135c36c924b6590b88c377d416494288c14f2db30199" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros 0.18.2", "rustc_version", ] @@ -2684,6 +2732,23 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rstest_macros" +version = "0.18.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605" +dependencies = [ + "cfg-if", + "glob", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn 2.0.32", + "unicode-ident", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -2701,9 +2766,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.11" +version = "0.38.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0c3dde1fc030af041adc40e79c0e7fbcf431dd24870053d187d7c66e4b87453" +checksum = "d7db8590df6dfcd144d22afd1b83b36c21a18d7cbc1dc4bb5295a8712e9eb662" dependencies = [ "bitflags 2.4.0", "errno", @@ -2888,14 +2953,14 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] name = "serde_json" -version = "1.0.105" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" +checksum = "2cc66a619ed80bf7a0f6b17dd063a84b88f6dea1813737cf469aef1d081142c2" dependencies = [ "itoa", "ryu", @@ -2986,9 +3051,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" +checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" dependencies = [ "libc", "windows-sys", @@ -3077,7 +3142,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -3099,9 +3164,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.31" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -3159,7 +3224,7 @@ checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" dependencies 
= [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -3237,7 +3302,7 @@ dependencies = [ "num_cpus", "parking_lot", "pin-project-lite", - "socket2 0.5.3", + "socket2 0.5.4", "tokio-macros", "windows-sys", ] @@ -3250,7 +3315,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -3348,7 +3413,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", ] [[package]] @@ -3520,7 +3585,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -3554,7 +3619,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.31", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/datafusion-examples/examples/csv_opener.rs b/datafusion-examples/examples/csv_opener.rs index 6366f16b7aa3f..0587b515b2d8d 100644 --- a/datafusion-examples/examples/csv_opener.rs +++ b/datafusion-examples/examples/csv_opener.rs @@ -17,7 +17,6 @@ use std::{sync::Arc, vec}; -use crate::metrics::ExecutionPlanMetricsSet; use datafusion::{ assert_batches_eq, datasource::{ @@ -26,6 +25,7 @@ use datafusion::{ physical_plan::{CsvConfig, CsvOpener, FileScanConfig, FileStream}, }, error::Result, + physical_plan::metrics::ExecutionPlanMetricsSet, test_util::aggr_test_schema, }; use datafusion_common::FileCompressionType; diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index d5738cb1fe13e..ab2821547d4e6 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -67,6 +67,7 @@ datafusion-execution = { path = "../execution", version = "31.0.0" } datafusion-expr = { path = "../expr", version = "31.0.0" } datafusion-optimizer = { path = "../optimizer", version = "31.0.0", default-features = false } datafusion-physical-expr = { path = "../physical-expr", version = "31.0.0", default-features = false } +datafusion-physical-plan = { path = "../physical-plan", version = "31.0.0", default-features = false } datafusion-sql = { path = "../sql", version = "31.0.0" } flate2 = { version = "1.0.24", optional = true } futures = "0.3" diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index e9ce5238c5962..819bfabae2902 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -496,11 +496,10 @@ fn create_output_array( #[cfg(test)] mod tests { + use arrow_array::Int32Array; + use super::*; - use crate::{ - test::{build_table_i32, columns}, - test_util::aggr_test_schema, - }; + use crate::{test::columns, test_util::aggr_test_schema}; #[test] fn physical_plan_config_no_projection() { @@ -776,4 +775,27 @@ mod tests { infinite_source: false, } } + + /// returns record batch with 3 columns of i32 in memory + pub fn build_table_i32( + a: (&str, &Vec), + b: (&str, &Vec), + c: (&str, &Vec), + ) -> RecordBatch { + let schema = Schema::new(vec![ + Field::new(a.0, DataType::Int32, false), + Field::new(b.0, DataType::Int32, false), + Field::new(c.0, DataType::Int32, false), + ]); + + RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(a.1.clone())), + Arc::new(Int32Array::from(b.1.clone())), + 
Arc::new(Int32Array::from(c.1.clone())), + ], + ) + .unwrap() + } } diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index c0ed13c8e063d..4f74888c840b1 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -428,7 +428,6 @@ pub mod datasource; pub mod error; pub mod execution; pub mod physical_optimizer; -pub mod physical_plan; pub mod physical_planner; pub mod prelude; pub mod scalar; @@ -467,6 +466,11 @@ pub mod physical_expr { pub use datafusion_physical_expr::*; } +/// re-export of [`datafusion_physical_plan`] crate +pub mod physical_plan { + pub use datafusion_physical_plan::*; +} + /// re-export of [`datafusion_sql`] crate pub mod sql { pub use datafusion_sql::*; diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 5be53cf81b606..2fbe5a9b39be3 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -378,7 +378,7 @@ fn adjust_input_keys_ordering( )?) } else if let Some(aggregate_exec) = plan_any.downcast_ref::<AggregateExec>() { if !parent_required.is_empty() { - match aggregate_exec.mode { + match aggregate_exec.mode() { AggregateMode::FinalPartitioned => Some(reorder_aggregate_keys( requirements.plan.clone(), &parent_required, @@ -390,9 +390,8 @@ // Keep everything unchanged None } - } else if let Some(ProjectionExec { expr, .. }) = - plan_any.downcast_ref::<ProjectionExec>() - { + } else if let Some(proj) = plan_any.downcast_ref::<ProjectionExec>() { + let expr = proj.expr(); // For Projection, we need to transform the requirements to the columns before the Projection // And then to push down the requirements // Construct a mapping from new name to the original Column @@ -487,7 +486,7 @@ fn reorder_aggregate_keys( agg_exec: &AggregateExec, ) -> Result<PlanWithKeyRequirements> { let out_put_columns = agg_exec - .group_by + .group_by() .expr() .iter() .enumerate() @@ -500,7 +499,7 @@ .collect::<Vec<_>>(); if parent_required.len() != out_put_exprs.len() - || !agg_exec.group_by.null_expr().is_empty() + || !agg_exec.group_by().null_expr().is_empty() || expr_list_eq_strict_order(&out_put_exprs, parent_required) { Ok(PlanWithKeyRequirements::new(agg_plan)) @@ -519,7 +518,7 @@ input_schema, ..
}) = - agg_exec.input.as_any().downcast_ref::() + agg_exec.input().as_any().downcast_ref::() { if matches!(mode, AggregateMode::Partial) { let mut new_group_exprs = vec![]; @@ -564,11 +563,11 @@ fn reorder_aggregate_keys( let new_final_agg = Arc::new(AggregateExec::try_new( AggregateMode::FinalPartitioned, new_group_by, - agg_exec.aggr_expr.to_vec(), - agg_exec.filter_expr.to_vec(), - agg_exec.order_by_expr.to_vec(), + agg_exec.aggr_expr().to_vec(), + agg_exec.filter_expr().to_vec(), + agg_exec.order_by_expr().to_vec(), partial_agg, - agg_exec.input_schema.clone(), + agg_exec.input_schema().clone(), )?); // Need to create a new projection to change the expr ordering back diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index 628cc1da3b1eb..4cff4a8f6c555 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -579,14 +579,14 @@ fn apply_subrules( #[cfg(test)] mod tests_statistical { + use super::*; use crate::{ physical_plan::{ displayable, joins::PartitionMode, ColumnStatistics, Statistics, }, - test::exec::StatisticsExec, + test::StatisticsExec, }; - use super::*; use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 81c2e76b74c61..629011cb0faa5 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -140,7 +140,7 @@ pub(crate) fn pushdown_sorts( let parent_required_expr = PhysicalSortRequirement::to_sort_exprs( parent_required.ok_or_else(err)?.iter().cloned(), ); - new_plan = sort_exec.input.clone(); + new_plan = sort_exec.input().clone(); add_sort_above(&mut new_plan, parent_required_expr, sort_exec.fetch())?; }; let required_ordering = new_plan @@ -221,12 +221,12 @@ fn pushdown_requirement_to_children( ])) } else if let Some(smj) = plan.as_any().downcast_ref::() { // If the current plan is SortMergeJoinExec - let left_columns_len = smj.left.schema().fields().len(); + let left_columns_len = smj.left().schema().fields().len(); let parent_required_expr = PhysicalSortRequirement::to_sort_exprs( parent_required.ok_or_else(err)?.iter().cloned(), ); let expr_source_side = - expr_source_sides(&parent_required_expr, smj.join_type, left_columns_len); + expr_source_sides(&parent_required_expr, smj.join_type(), left_columns_len); match expr_source_side { Some(JoinSide::Left) => try_pushdown_requirements_to_join( smj, @@ -236,7 +236,7 @@ fn pushdown_requirement_to_children( ), Some(JoinSide::Right) => { let right_offset = - smj.schema().fields.len() - smj.right.schema().fields.len(); + smj.schema().fields.len() - smj.right().schema().fields.len(); let new_right_required = shift_right_required(parent_required.ok_or_else(err)?, right_offset)?; let new_right_required_expr = PhysicalSortRequirement::to_sort_exprs( @@ -331,8 +331,8 @@ fn try_pushdown_requirements_to_join( sort_expr: Vec, push_side: JoinSide, ) -> Result>>>> { - let left_ordering = smj.left.output_ordering().unwrap_or(&[]); - let right_ordering = smj.right.output_ordering().unwrap_or(&[]); + let left_ordering = smj.left().output_ordering().unwrap_or(&[]); + let right_ordering = smj.right().output_ordering().unwrap_or(&[]); let (new_left_ordering, new_right_ordering) = match push_side { JoinSide::Left => (sort_expr.as_slice(), right_ordering), JoinSide::Right 
=> (left_ordering, sort_expr.as_slice()), @@ -340,11 +340,11 @@ fn try_pushdown_requirements_to_join( let new_output_ordering = calculate_join_output_ordering( new_left_ordering, new_right_ordering, - smj.join_type, - &smj.on, - smj.left.schema().fields.len(), + smj.join_type(), + smj.on(), + smj.left().schema().fields.len(), &smj.maintains_input_order(), - Some(SortMergeJoinExec::probe_side(&smj.join_type)), + Some(SortMergeJoinExec::probe_side(&smj.join_type())), )?; Ok(ordering_satisfy_requirement( new_output_ordering.as_deref(), diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index def4d59873df5..485ec777e4a5c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -814,8 +814,8 @@ impl DefaultPhysicalPlanner { // into a LAST_VALUE with the reverse ordering requirement. // To reflect such changes to subsequent stages, use the updated // `AggregateExpr`/`PhysicalSortExpr` objects. - let updated_aggregates = initial_aggr.aggr_expr.clone(); - let updated_order_bys = initial_aggr.order_by_expr.clone(); + let updated_aggregates = initial_aggr.aggr_expr().to_vec(); + let updated_order_bys = initial_aggr.order_by_expr().to_vec(); let (initial_aggr, next_partition_mode): ( Arc, diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 011d2e64281ad..a26be4857d4c2 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -17,38 +17,37 @@ //! Common unit test utility methods -use crate::arrow::array::UInt32Array; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; use crate::datasource::physical_plan::{CsvExec, FileScanConfig}; use crate::datasource::{MemTable, TableProvider}; use crate::error::Result; use crate::logical_expr::LogicalPlan; -use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::ExecutionPlan; use crate::test::object_store::local_unpartitioned_file; use crate::test_util::{aggr_test_schema, arrow_test_data}; use array::ArrayRef; use arrow::array::{self, Array, Decimal128Builder, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::{RecordBatch, RecordBatchOptions}; +use arrow::record_batch::RecordBatch; #[cfg(feature = "compression")] use bzip2::write::BzEncoder; #[cfg(feature = "compression")] use bzip2::Compression as BzCompression; use datafusion_common::{DataFusionError, Statistics}; use datafusion_common::{FileCompressionType, FileType}; -use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_execution::{SendableRecordBatchStream, TaskContext}; +use datafusion_physical_expr::{Partitioning, PhysicalSortExpr}; +use datafusion_physical_plan::{DisplayAs, DisplayFormatType}; #[cfg(feature = "compression")] use flate2::write::GzEncoder; #[cfg(feature = "compression")] use flate2::Compression as GzCompression; -use futures::{Future, FutureExt}; +use std::any::Any; use std::fs::File; use std::io::prelude::*; use std::io::{BufReader, BufWriter}; use std::path::Path; -use std::pin::Pin; use std::sync::Arc; #[cfg(feature = "compression")] use xz2::write::XzEncoder; @@ -214,40 +213,6 @@ pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { assert_eq!(actual, expected); } -/// returns record batch with 3 columns of i32 in memory -pub fn build_table_i32( - a: (&str, &Vec), - b: (&str, &Vec), - c: (&str, &Vec), -) -> RecordBatch { - let schema = Schema::new(vec![ - Field::new(a.0, DataType::Int32, false), - 
Field::new(b.0, DataType::Int32, false), - Field::new(c.0, DataType::Int32, false), - ]); - - RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(Int32Array::from(a.1.clone())), - Arc::new(Int32Array::from(b.1.clone())), - Arc::new(Int32Array::from(c.1.clone())), - ], - ) - .unwrap() -} - -/// returns memory table scan wrapped around record batch with 3 columns of i32 -pub fn build_table_scan_i32( - a: (&str, &Vec), - b: (&str, &Vec), - c: (&str, &Vec), -) -> Arc { - let batch = build_table_i32(a, b, c); - let schema = batch.schema(); - Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None).unwrap()) -} - /// Returns the column names on the schema pub fn columns(schema: &Schema) -> Vec { schema.fields().iter().map(|f| f.name().clone()).collect() @@ -280,14 +245,6 @@ pub fn make_partition(sz: i32) -> RecordBatch { RecordBatch::try_new(schema, vec![arr]).unwrap() } -/// Return a RecordBatch with a single array with row_count sz -pub fn make_batch_no_column(sz: usize) -> RecordBatch { - let schema = Arc::new(Schema::empty()); - - let options = RecordBatchOptions::new().with_row_count(Option::from(sz)); - RecordBatch::try_new_with_options(schema, vec![], &options).unwrap() -} - /// Return a new table which provide this decimal column pub fn table_with_decimal() -> Arc { let batch_decimal = make_decimal(); @@ -312,25 +269,6 @@ fn make_decimal() -> RecordBatch { RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap() } -/// Asserts that given future is pending. -pub fn assert_is_pending<'a, T>(fut: &mut Pin + Send + 'a>>) { - let waker = futures::task::noop_waker(); - let mut cx = futures::task::Context::from_waker(&waker); - let poll = fut.poll_unpin(&mut cx); - - assert!(poll.is_pending()); -} - -/// Create vector batches -pub fn create_vec_batches(schema: &Schema, n: usize) -> Vec { - let batch = create_batch(schema); - let mut vec = Vec::with_capacity(n); - for _ in 0..n { - vec.push(batch.clone()); - } - vec -} - /// Created a sorted Csv exec pub fn csv_exec_sorted( schema: &SchemaRef, @@ -359,15 +297,88 @@ pub fn csv_exec_sorted( )) } -/// Create batch -fn create_batch(schema: &Schema) -> RecordBatch { - RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], - ) - .unwrap() +/// A mock execution plan that simply returns the provided statistics +#[derive(Debug, Clone)] +pub struct StatisticsExec { + stats: Statistics, + schema: Arc, +} +impl StatisticsExec { + pub fn new(stats: Statistics, schema: Schema) -> Self { + assert!( + stats + .column_statistics + .as_ref() + .map(|cols| cols.len() == schema.fields().len()) + .unwrap_or(true), + "if defined, the column statistics vector length should be the number of fields" + ); + Self { + stats, + schema: Arc::new(schema), + } + } +} + +impl DisplayAs for StatisticsExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!( + f, + "StatisticsExec: col_count={}, row_count={:?}", + self.schema.fields().len(), + self.stats.num_rows, + ) + } + } + } +} + +impl ExecutionPlan for StatisticsExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(2) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![] + } + + fn 
with_new_children( + self: Arc<Self>, + _: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc<TaskContext>, + ) -> Result<SendableRecordBatchStream> { + unimplemented!("This plan only serves for testing statistics") + } + + fn statistics(&self) -> Statistics { + self.stats.clone() + } } -pub mod exec; pub mod object_store; pub mod variable; diff --git a/datafusion/core/tests/sql/displayable.rs b/datafusion/core/tests/sql/displayable.rs new file mode 100644 index 0000000000000..b736820009cc9 --- /dev/null +++ b/datafusion/core/tests/sql/displayable.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use object_store::path::Path; + +use datafusion::prelude::*; +use datafusion_physical_plan::displayable; + +#[tokio::test] +async fn test_displayable() { + // Hard code target_partitions as it appears in the RepartitionExec output + let config = SessionConfig::new().with_target_partitions(3); + let ctx = SessionContext::with_config(config); + + // register a table + ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()) + .await + .unwrap(); + + // create a plan to run a SQL query + let dataframe = ctx.sql("SELECT a FROM example WHERE a < 5").await.unwrap(); + let physical_plan = dataframe.create_physical_plan().await.unwrap(); + + // Format using display string in verbose mode + let displayable_plan = displayable(physical_plan.as_ref()); + let plan_string = format!("{}", displayable_plan.indent(true)); + + let working_directory = std::env::current_dir().unwrap(); + let normalized = Path::from_filesystem_path(working_directory).unwrap(); + let plan_string = plan_string.replace(normalized.as_ref(), "WORKING_DIR"); + + assert_eq!("CoalesceBatchesExec: target_batch_size=8192\ + \n FilterExec: a@0 < 5\ + \n RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1\ + \n CsvExec: file_groups={1 group: [[WORKING_DIR/tests/data/example.csv]]}, projection=[a], has_header=true", + plan_string.trim()); + + let one_line = format!("{}", displayable_plan.one_line()); + assert_eq!( + "CoalesceBatchesExec: target_batch_size=8192", + one_line.trim() + ); +} diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 35423234db88b..1e29f791c560a 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -82,6 +82,7 @@ pub mod arrow_files; #[cfg(feature = "avro")] pub mod create_drop; pub mod csv_files; +pub mod displayable; pub mod explain_analyze; pub mod expr; pub mod group_by; diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 161569513a783..60422269c9a70 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -32,8 +32,6 @@ rust-version = { workspace = true } [lib] name = 
"datafusion_physical_plan" path = "src/lib.rs" -[features] - [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } arrow = { workspace = true } @@ -41,36 +39,23 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } async-trait = "0.1.41" -#base64 = { version = "0.21", optional = true } -#blake2 = { version = "^0.10.2", optional = true } -#blake3 = { version = "1.0", optional = true } chrono = { version = "0.4.23", default-features = false } -datafusion-common = { path = "../common", version = "30.0.0" } -datafusion-expr = { path = "../expr", version = "30.0.0" } -datafusion-execution = { path = "../execution", version = "30.0.0" } -datafusion-physical-expr = { path = "../physical-expr", version = "30.0.0" } +datafusion-common = { path = "../common", version = "31.0.0" } +datafusion-execution = { path = "../execution", version = "31.0.0" } +datafusion-expr = { path = "../expr", version = "31.0.0" } +datafusion-physical-expr = { path = "../physical-expr", version = "31.0.0" } futures = "0.3" half = { version = "2.1", default-features = false } hashbrown = { version = "0.14", features = ["raw"] } -#hex = { version = "0.4", optional = true } indexmap = "2.0.0" itertools = { version = "0.11", features = ["use_std"] } -#libc = "0.2.140" log = "^0.4" -#md-5 = { version = "^0.10.0", optional = true } parking_lot = "0.12" pin-project-lite = "^0.2.7" -#paste = "^1.0" -#petgraph = "0.6.2" -#rand = "0.8" -#regex = { version = "1.8", optional = true } -#sha2 = { version = "^0.10.1", optional = true } -tempfile = "3" -tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] } -#unicode-segmentation = { version = "^1.7.1", optional = true } -#uuid = { version = "^1.2", features = ["v4"] } #[dev-dependencies] #criterion = "0.5" -#rand = "0.8" -#rstest = "0.18.0" +rand = "0.8" +rstest = "0.18.0" +tempfile = "3" +tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index cb88d132d0d8a..f9256d3f565ce 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -256,23 +256,23 @@ pub(crate) struct AggregationOrdering { #[derive(Debug)] pub struct AggregateExec { /// Aggregation mode (full, partial) - pub(crate) mode: AggregateMode, + pub mode: AggregateMode, /// Group by expressions - pub(crate) group_by: PhysicalGroupBy, + pub group_by: PhysicalGroupBy, /// Aggregate expressions - pub(crate) aggr_expr: Vec>, + pub aggr_expr: Vec>, /// FILTER (WHERE clause) expression for each aggregate expression - pub(crate) filter_expr: Vec>>, + pub filter_expr: Vec>>, /// (ORDER BY clause) expression for each aggregate expression - pub(crate) order_by_expr: Vec>, + pub order_by_expr: Vec>, /// Input plan, could be a partial aggregate or the input to the aggregate - pub(crate) input: Arc, + pub input: Arc, /// Schema after the aggregate is applied schema: SchemaRef, /// Input schema before any aggregation is applied. 
For partial aggregate this will be the /// same as input.schema() but for the final aggregate it will be the same as the input /// to the partial aggregate - pub(crate) input_schema: SchemaRef, + pub input_schema: SchemaRef, /// The columns map used to normalize out expressions like Partitioning and PhysicalSortExpr /// The key is the column from the input schema and the values are the columns from the output schema columns_map: HashMap>, @@ -727,6 +727,10 @@ impl AggregateExec { )) } } + + pub fn group_by(&self) -> &PhysicalGroupBy { + &self.group_by + } } impl DisplayAs for AggregateExec { @@ -1182,24 +1186,22 @@ fn evaluate_group_by( #[cfg(test)] mod tests { use super::*; - use crate::physical_plan::aggregates::GroupByOrderMode::{ - FullyOrdered, PartiallyOrdered, - }; - use crate::physical_plan::aggregates::{ + use crate::aggregates::GroupByOrderMode::{FullyOrdered, PartiallyOrdered}; + use crate::aggregates::{ get_finest_requirement, get_working_mode, AggregateExec, AggregateMode, PhysicalGroupBy, }; - use crate::physical_plan::coalesce_batches::CoalesceBatchesExec; - use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; - use crate::physical_plan::expressions::{col, Avg}; - use crate::physical_plan::memory::MemoryExec; - use crate::physical_plan::{ + use crate::coalesce_batches::CoalesceBatchesExec; + use crate::coalesce_partitions::CoalescePartitionsExec; + use crate::expressions::{col, Avg}; + use crate::memory::MemoryExec; + use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; + use crate::test::{assert_is_pending, mem_exec}; + use crate::{assert_batches_eq, assert_batches_sorted_eq, common}; + use crate::{ DisplayAs, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, Statistics, }; - use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; - use crate::test::{assert_is_pending, csv_exec_sorted}; - use crate::{assert_batches_eq, assert_batches_sorted_eq, physical_plan::common}; use arrow::array::{Float64Array, UInt32Array}; use arrow::compute::{concat_batches, SortOptions}; @@ -1260,7 +1262,8 @@ mod tests { sort_expr("b", &test_schema), sort_expr("c", &test_schema), ]; - let input = csv_exec_sorted(&test_schema, sort_exprs, true); + let input = mem_exec(1).with_sort_information(vec![sort_exprs]); + let input = Arc::new(input) as _; // test cases consists of vector of tuples. Where each tuple represents a single test case. // First field in the tuple is Vec where each element in the vector represents GROUP BY columns @@ -1846,7 +1849,7 @@ mod tests { schema, )?); - let fut = crate::physical_plan::collect(aggregate_exec, task_ctx); + let fut = crate::collect(aggregate_exec, task_ctx); let mut fut = fut.boxed(); assert_is_pending(&mut fut); @@ -1885,7 +1888,7 @@ mod tests { schema, )?); - let fut = crate::physical_plan::collect(aggregate_exec, task_ctx); + let fut = crate::collect(aggregate_exec, task_ctx); let mut fut = fut.boxed(); assert_is_pending(&mut fut); @@ -1995,7 +1998,7 @@ mod tests { schema, )?) 
as Arc<dyn ExecutionPlan>; - let result = crate::physical_plan::collect(aggregate_final, task_ctx).await?; + let result = crate::collect(aggregate_final, task_ctx).await?; if is_first_acc { let expected = [ "+---+----------------+", diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index e055400c18696..b7e95c8b1b802 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -242,7 +242,7 @@ mod tests { use futures::FutureExt; use crate::{ - physical_plan::collect, + collect, test::{ assert_is_pending, exec::{assert_strong_count_converges_to_zero, BlockingExec}, diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 7e6e129934168..f46a228064fe7 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -309,9 +309,9 @@ pub fn concat_batches( #[cfg(test)] mod tests { use super::*; - use crate::test::create_vec_batches; use crate::{memory::MemoryExec, repartition::RepartitionExec}; use arrow::datatypes::{DataType, Field, Schema}; + use arrow_array::UInt32Array; #[tokio::test(flavor = "multi_thread")] async fn test_concat_batches() -> Result<()> { @@ -365,4 +365,23 @@ } Ok(output_partitions) } + + /// Create vector batches + fn create_vec_batches(schema: &Schema, n: usize) -> Vec<RecordBatch> { + let batch = create_batch(schema); + let mut vec = Vec::with_capacity(n); + for _ in 0..n { + vec.push(batch.clone()); + } + vec + } + + /// Create batch + fn create_batch(schema: &Schema) -> RecordBatch { + RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], + ) + .unwrap() + } } diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 296743ba63145..8eddf57ae5515 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -169,7 +169,6 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use futures::FutureExt; - use tempfile::TempDir; use super::*; use crate::test::exec::{ @@ -183,8 +182,7 @@ let task_ctx = Arc::new(TaskContext::default()); let num_partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(num_partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(num_partitions); // input should have 4 partitions assert_eq!(csv.output_partitioning().partition_count(), num_partitions); @@ -199,9 +197,9 @@ let batches = common::collect(iter).await?; assert_eq!(batches.len(), num_partitions); - // there should be a total of 100 rows + // there should be a total of 400 rows (100 per partition) let row_count: usize = batches.iter().map(|batch| batch.num_rows()).sum(); - assert_eq!(row_count, 100); + assert_eq!(row_count, 400); Ok(()) } diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 0b4379fe96e58..e4a4e113eb07e 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -16,8 +16,7 @@ // under the License. //! Implementation of physical plan display. See -//! [`crate::physical_plan::displayable`] for examples of how to -//! format +//!
[`crate::displayable`] for examples of how to format use std::fmt; diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 2f03e0750e7b0..675dac9ad2656 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -173,12 +173,12 @@ impl ExecutionPlan for EmptyExec { mod tests { use super::*; use crate::with_new_children_if_necessary; - use crate::{physical_plan::common, test_util}; + use crate::{common, test}; #[tokio::test] async fn empty() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let empty = EmptyExec::new(false, schema.clone()); assert_eq!(empty.schema(), schema); @@ -193,7 +193,7 @@ mod tests { #[test] fn with_new_children() -> Result<()> { - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let empty = Arc::new(EmptyExec::new(false, schema.clone())); let empty_with_row = Arc::new(EmptyExec::new(true, schema)); @@ -215,7 +215,7 @@ mod tests { #[tokio::test] async fn invalid_execute() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let empty = EmptyExec::new(false, schema); // ask for the wrong partition @@ -227,7 +227,7 @@ mod tests { #[tokio::test] async fn produce_one_row() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let empty = EmptyExec::new(true, schema); let iter = empty.execute(0, task_ctx)?; @@ -242,7 +242,7 @@ mod tests { #[tokio::test] async fn produce_one_row_multiple_partition() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let partitions = 3; let empty = EmptyExec::new(true, schema).with_partitions(partitions); diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 1030eeba42508..15208fd0829e2 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -381,74 +381,17 @@ mod tests { use crate::expressions::*; use crate::test; use crate::test::exec::StatisticsExec; - use crate::test_util; use crate::ExecutionPlan; - use crate::{collect, with_new_children_if_necessary}; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::utils::DataPtr; use datafusion_common::ColumnStatistics; use datafusion_common::ScalarValue; use datafusion_expr::Operator; use std::iter::Iterator; use std::sync::Arc; - use tempfile::TempDir; - - #[tokio::test] - async fn simple_predicate() -> Result<()> { - let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); - - let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - - let predicate: Arc = binary( - binary(col("c2", &schema)?, Operator::Gt, lit(1u32), &schema)?, - Operator::And, - binary(col("c2", &schema)?, Operator::Lt, lit(4u32), &schema)?, - &schema, - )?; - - let filter: Arc = - Arc::new(FilterExec::try_new(predicate, csv)?); - - let results = collect(filter, task_ctx).await?; - - results - .iter() - .for_each(|batch| assert_eq!(13, batch.num_columns())); - let row_count: usize = results.iter().map(|batch| batch.num_rows()).sum(); - assert_eq!(41, row_count); - - Ok(()) - } - - #[tokio::test] - async fn 
with_new_children() -> Result<()> { - let schema = test_util::aggr_test_schema(); - let partitions = 4; - let tmp_dir = TempDir::new()?; - let input = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - - let predicate: Arc = - binary(col("c2", &schema)?, Operator::Gt, lit(1u32), &schema)?; - - let filter: Arc = - Arc::new(FilterExec::try_new(predicate, input.clone())?); - - let new_filter = filter.clone().with_new_children(vec![input.clone()])?; - assert!(!Arc::data_ptr_eq(&filter, &new_filter)); - - let new_filter2 = - with_new_children_if_necessary(filter.clone(), vec![input])?.into(); - assert!(Arc::data_ptr_eq(&filter, &new_filter2)); - - Ok(()) - } #[tokio::test] async fn collect_columns_predicates() -> Result<()> { - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let predicate: Arc = binary( binary( binary(col("c2", &schema)?, Operator::GtEq, lit(1u32), &schema)?, diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index e5302420c27a7..90d84282fd0dd 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -52,9 +52,9 @@ type JoinLeftData = (RecordBatch, MemoryReservation); #[derive(Debug)] pub struct CrossJoinExec { /// left (build) side which gets loaded in memory - pub(crate) left: Arc, + pub left: Arc, /// right (probe) side which are combined with left side - pub(crate) right: Arc, + pub right: Arc, /// The schema once the join is applied schema: SchemaRef, /// Build-side data @@ -458,9 +458,9 @@ impl CrossJoinStream { mod tests { use super::*; use crate::assert_batches_sorted_eq; + use crate::assert_contains; use crate::common; - use crate::common::assert_contains; - use crate::test::{build_table_scan_i32, columns}; + use crate::test::build_table_scan_i32; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; async fn join_collect( @@ -678,4 +678,9 @@ mod tests { Ok(()) } + + /// Returns the column names on the schema + fn columns(schema: &Schema) -> Vec { + schema.fields().iter().map(|f| f.name().clone()).collect() + } } diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 75807863198ce..43ade366f3257 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -85,15 +85,15 @@ type JoinLeftData = (JoinHashMap, RecordBatch, MemoryReservation); #[derive(Debug)] pub struct HashJoinExec { /// left (build) side which gets hashed - pub(crate) left: Arc, + pub left: Arc, /// right (probe) side which are filtered by the hash table - pub(crate) right: Arc, + pub right: Arc, /// Set of common columns used to join on - pub(crate) on: Vec<(Column, Column)>, + pub on: Vec<(Column, Column)>, /// Filters which are applied while finding matching rows - pub(crate) filter: Option, + pub filter: Option, /// How the join is performed - pub(crate) join_type: JoinType, + pub join_type: JoinType, /// The schema once the join is applied schema: SchemaRef, /// Build-side data @@ -103,13 +103,13 @@ pub struct HashJoinExec { /// Output order output_order: Option>, /// Partitioning mode to use - pub(crate) mode: PartitionMode, + pub mode: PartitionMode, /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Information of index and left / right placement of columns column_indices: Vec, /// If null_equals_null is true, null == null else null != null - pub(crate) null_equals_null: bool, + pub 
null_equals_null: bool, } impl HashJoinExec { @@ -1067,23 +1067,19 @@ mod tests { use datafusion_physical_expr::expressions::Literal; use hashbrown::raw::RawTable; - use crate::execution::context::SessionConfig; - use crate::physical_expr::expressions::BinaryExpr; use crate::{ - assert_batches_sorted_eq, - common::assert_contains, - physical_plan::{ - common, - expressions::Column, - hash_utils::create_hashes, - joins::{hash_join::build_equal_condition_join_indices, utils::JoinSide}, - memory::MemoryExec, - repartition::RepartitionExec, - }, + assert_batches_sorted_eq, assert_contains, common, + expressions::Column, + hash_utils::create_hashes, + joins::{hash_join::build_equal_condition_join_indices, utils::JoinSide}, + memory::MemoryExec, + repartition::RepartitionExec, + test::build_table_i32, test::exec::MockExec, - test::{build_table_i32, columns}, }; + use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + use datafusion_physical_expr::expressions::BinaryExpr; use super::*; @@ -2906,4 +2902,9 @@ mod tests { Ok(()) } + + /// Returns the column names on the schema + fn columns(schema: &Schema) -> Vec { + schema.fields().iter().map(|f| f.name().clone()).collect() + } } diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index b66454c732262..a0222f84a70e9 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -740,18 +740,14 @@ impl RecordBatchStream for NestedLoopJoinStream { #[cfg(test)] mod tests { use super::*; - use crate::physical_expr::expressions::BinaryExpr; use crate::{ - assert_batches_sorted_eq, - common::assert_contains, - execution::runtime_env::{RuntimeConfig, RuntimeEnv}, - physical_plan::{ - common, expressions::Column, memory::MemoryExec, repartition::RepartitionExec, - }, - test::{build_table_i32, columns}, + assert_batches_sorted_eq, assert_contains, common, expressions::Column, + memory::MemoryExec, repartition::RepartitionExec, test::build_table_i32, }; use arrow::datatypes::{DataType, Field}; + use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_expr::Operator; + use datafusion_physical_expr::expressions::BinaryExpr; use crate::joins::utils::JoinSide; use datafusion_common::ScalarValue; @@ -1169,4 +1165,9 @@ mod tests { Ok(()) } + + /// Returns the column names on the schema + fn columns(schema: &Schema) -> Vec { + schema.fields().iter().map(|f| f.name().clone()).collect() + } } diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index e85f4bcb2ecc3..bf2f977d820d6 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -63,13 +63,13 @@ use futures::{Stream, StreamExt}; #[derive(Debug)] pub struct SortMergeJoinExec { /// Left sorted joining execution plan - pub(crate) left: Arc, + pub left: Arc, /// Right sorting joining execution plan - pub(crate) right: Arc, + pub right: Arc, /// Set of common columns used to join on - pub(crate) on: JoinOn, + pub on: JoinOn, /// How the join is performed - pub(crate) join_type: JoinType, + pub join_type: JoinType, /// The schema once the join is applied schema: SchemaRef, /// Execution metrics @@ -81,9 +81,9 @@ pub struct SortMergeJoinExec { /// The output ordering output_ordering: Option>, /// Sort options of join columns used in sorting left and 
right execution plans - pub(crate) sort_options: Vec<SortOptions>, + pub sort_options: Vec<SortOptions>, /// If null_equals_null is true, null == null else null != null - pub(crate) null_equals_null: bool, + pub null_equals_null: bool, } impl SortMergeJoinExec { @@ -194,6 +194,18 @@ impl SortMergeJoinExec { pub fn on(&self) -> &[(Column, Column)] { &self.on } + + pub fn right(&self) -> &dyn ExecutionPlan { + self.right.as_ref() + } + + pub fn join_type(&self) -> JoinType { + self.join_type + } + + pub fn left(&self) -> &dyn ExecutionPlan { + self.left.as_ref() + } } impl DisplayAs for SortMergeJoinExec { @@ -1392,12 +1404,12 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_execution::TaskContext; - use crate::common::assert_contains; + use crate::assert_contains; use crate::expressions::Column; use crate::joins::utils::JoinOn; use crate::joins::SortMergeJoinExec; use crate::memory::MemoryExec; - use crate::test::{build_table_i32, columns}; + use crate::test::build_table_i32; use crate::{assert_batches_eq, assert_batches_sorted_eq}; use crate::{common, ExecutionPlan}; use datafusion_common::JoinType; @@ -2422,4 +2434,8 @@ mod tests { Ok(()) } + /// Returns the column names on the schema + fn columns(schema: &Schema) -> Vec<String> { + schema.fields().iter().map(|f| f.name().clone()).collect() + } } diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs index 44610ab09a38b..af3dc6f9a1944 100644 --- a/datafusion/physical-plan/src/joins/test_utils.rs +++ b/datafusion/physical-plan/src/joins/test_utils.rs @@ -17,13 +17,13 @@ //! This file has test utils for hash joins -use crate::physical_plan::joins::utils::{JoinFilter, JoinOn}; -use crate::physical_plan::joins::{ +use crate::joins::utils::{JoinFilter, JoinOn}; +use crate::joins::{ HashJoinExec, PartitionMode, StreamJoinPartitionMode, SymmetricHashJoinExec, }; -use crate::physical_plan::memory::MemoryExec; -use crate::physical_plan::repartition::RepartitionExec; -use crate::physical_plan::{common, ExecutionPlan, Partitioning}; +use crate::memory::MemoryExec; +use crate::repartition::RepartitionExec; +use crate::{common, ExecutionPlan, Partitioning}; use arrow::util::pretty::pretty_format_batches; use arrow_array::{ ArrayRef, Float64Array, Int32Array, IntervalDayTimeArray, RecordBatch, diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index a34e8e651b3bd..9c116e73ead7e 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -242,45 +242,6 @@ pub fn with_new_children_if_necessary( /// Return a [wrapper](DisplayableExecutionPlan) around an /// [`ExecutionPlan`] which can be displayed in various easier to understand ways.
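+///
+/// A minimal, crate-local sketch in place of the doctest removed below (which
+/// needed the full `datafusion` crate); it assumes only [`EmptyExec`] and the
+/// `one_line()` formatter, both defined in this crate:
+///
+/// ```
+/// use std::sync::Arc;
+/// use arrow_schema::Schema;
+/// use datafusion_physical_plan::{displayable, empty::EmptyExec};
+///
+/// // build a trivial one-node plan and render it on a single line
+/// let plan = EmptyExec::new(false, Arc::new(Schema::empty()));
+/// let one_line = format!("{}", displayable(&plan).one_line());
+/// assert!(one_line.trim().starts_with("EmptyExec"));
+/// ```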
-/// -/// ``` -/// use datafusion::prelude::*; -/// use datafusion::physical_plan::displayable; -/// use object_store::path::Path; -/// -/// #[tokio::main] -/// async fn main() { -/// // Hard code target_partitions as it appears in the RepartitionExec output -/// let config = SessionConfig::new() -/// .with_target_partitions(3); -/// let mut ctx = SessionContext::with_config(config); -/// -/// // register the a table -/// ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await.unwrap(); -/// -/// // create a plan to run a SQL query -/// let dataframe = ctx.sql("SELECT a FROM example WHERE a < 5").await.unwrap(); -/// let physical_plan = dataframe.create_physical_plan().await.unwrap(); -/// -/// // Format using display string in verbose mode -/// let displayable_plan = displayable(physical_plan.as_ref()); -/// let plan_string = format!("{}", displayable_plan.indent(true)); -/// -/// let working_directory = std::env::current_dir().unwrap(); -/// let normalized = Path::from_filesystem_path(working_directory).unwrap(); -/// let plan_string = plan_string.replace(normalized.as_ref(), "WORKING_DIR"); -/// -/// assert_eq!("CoalesceBatchesExec: target_batch_size=8192\ -/// \n FilterExec: a@0 < 5\ -/// \n RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1\ -/// \n CsvExec: file_groups={1 group: [[WORKING_DIR/tests/data/example.csv]]}, projection=[a], has_header=true", -/// plan_string.trim()); -/// -/// let one_line = format!("{}", displayable_plan.one_line()); -/// assert_eq!("CoalesceBatchesExec: target_batch_size=8192", one_line.trim()); -/// } -/// ``` -/// pub fn displayable(plan: &dyn ExecutionPlan) -> DisplayableExecutionPlan<'_> { DisplayableExecutionPlan::new(plan) } @@ -404,3 +365,6 @@ use datafusion_execution::TaskContext; pub use datafusion_physical_expr::{ expressions, functions, hash_utils, ordering_equivalence_properties_helper, udf, }; + +#[cfg(test)] +pub mod test; diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 360b0d6179ed0..c6d51b7d9c5d7 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -521,8 +521,8 @@ impl RecordBatchStream for LimitStream { #[cfg(test)] mod tests { + use arrow_schema::Schema; use common::collect; - use tempfile::TempDir; use super::*; use crate::coalesce_partitions::CoalescePartitionsExec; @@ -534,8 +534,7 @@ mod tests { let task_ctx = Arc::new(TaskContext::default()); let num_partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(num_partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(num_partitions); // input should have 4 partitions assert_eq!(csv.output_partitioning().partition_count(), num_partitions); @@ -619,9 +618,9 @@ mod tests { #[tokio::test] async fn limit_no_column() -> Result<()> { let batches = vec![ - test::make_batch_no_column(6), - test::make_batch_no_column(6), - test::make_batch_no_column(6), + make_batch_no_column(6), + make_batch_no_column(6), + make_batch_no_column(6), ]; let input = test::exec::TestStream::new(batches); @@ -650,9 +649,9 @@ mod tests { async fn skip_and_fetch(skip: usize, fetch: Option) -> Result { let task_ctx = Arc::new(TaskContext::default()); + // 4 partitions @ 100 rows apiece let num_partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(num_partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(num_partitions); assert_eq!(csv.output_partitioning().partition_count(), num_partitions); @@ 
-668,7 +667,7 @@ mod tests { #[tokio::test] async fn skip_none_fetch_none() -> Result<()> { let row_count = skip_and_fetch(0, None).await?; - assert_eq!(row_count, 100); + assert_eq!(row_count, 400); Ok(()) } @@ -681,9 +680,9 @@ mod tests { #[tokio::test] async fn skip_3_fetch_none() -> Result<()> { - // there are total of 100 rows, we skipped 3 rows (offset = 3) + // there are a total of 400 rows; we skip 3 rows (offset = 3) let row_count = skip_and_fetch(3, None).await?; - assert_eq!(row_count, 97); + assert_eq!(row_count, 397); Ok(()) } @@ -696,23 +695,24 @@ mod tests { } #[tokio::test] - async fn skip_100_fetch_none() -> Result<()> { - let row_count = skip_and_fetch(100, None).await?; + async fn skip_400_fetch_none() -> Result<()> { + let row_count = skip_and_fetch(400, None).await?; assert_eq!(row_count, 0); Ok(()) } #[tokio::test] - async fn skip_100_fetch_1() -> Result<()> { - let row_count = skip_and_fetch(100, Some(1)).await?; + async fn skip_400_fetch_1() -> Result<()> { + // there are a total of 400 rows + let row_count = skip_and_fetch(400, Some(1)).await?; assert_eq!(row_count, 0); Ok(()) } #[tokio::test] - async fn skip_101_fetch_none() -> Result<()> { - // there are total of 100 rows, we skipped 101 rows (offset = 3) - let row_count = skip_and_fetch(101, None).await?; + async fn skip_401_fetch_none() -> Result<()> { + // there are a total of 400 rows; we skip 401 rows (offset = 401) + let row_count = skip_and_fetch(401, None).await?; assert_eq!(row_count, 0); Ok(()) } @@ -731,7 +731,7 @@ mod tests { #[tokio::test] async fn test_row_number_statistics_for_local_limit() -> Result<()> { let row_count = row_number_statistics_for_local_limit(4, 10).await?; - assert_eq!(row_count, Some(40)); + assert_eq!(row_count, Some(10)); Ok(()) } @@ -741,8 +741,7 @@ mod tests { fetch: Option<usize>, ) -> Result<Option<usize>> { let num_partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(num_partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(num_partitions); assert_eq!(csv.output_partitioning().partition_count(), num_partitions); @@ -756,8 +755,7 @@ mod tests { num_partitions: usize, fetch: usize, ) -> Result<Option<usize>> { - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(num_partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(num_partitions); assert_eq!(csv.output_partitioning().partition_count(), num_partitions); @@ -765,4 +763,12 @@ mod tests { Ok(offset.statistics().num_rows) } + + /// Return a RecordBatch with no columns and a row count of `sz` + fn make_batch_no_column(sz: usize) -> RecordBatch { + let schema = Arc::new(Schema::empty()); + + let options = RecordBatchOptions::new().with_row_count(Option::from(sz)); + RecordBatch::try_new_with_options(schema, vec![], &options).unwrap() + } } diff --git a/datafusion/physical-plan/src/metrics/baseline.rs b/datafusion/physical-plan/src/metrics/baseline.rs index 7d72a6a9fae17..dc345cd8cdcd6 100644 --- a/datafusion/physical-plan/src/metrics/baseline.rs +++ b/datafusion/physical-plan/src/metrics/baseline.rs @@ -29,7 +29,7 @@ use datafusion_common::Result; /// /// Example: /// ``` -/// use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet}; +/// use datafusion_physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet}; /// let metrics = ExecutionPlanMetricsSet::new(); /// /// let partition = 2; diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs index 30e9764c64460..beecc13e0029b 100644 ---
a/datafusion/physical-plan/src/metrics/builder.rs +++ b/datafusion/physical-plan/src/metrics/builder.rs @@ -29,7 +29,7 @@ use super::{ /// case of constant strings /// /// ```rust -/// use datafusion::physical_plan::metrics::*; +/// use datafusion_physical_plan::metrics::*; /// /// let metrics = ExecutionPlanMetricsSet::new(); /// let partition = 1; diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs index 652c0af5c2e44..b2e0086f69e9a 100644 --- a/datafusion/physical-plan/src/metrics/mod.rs +++ b/datafusion/physical-plan/src/metrics/mod.rs @@ -43,7 +43,7 @@ pub use value::{Count, Gauge, MetricValue, ScopedTimerGuard, Time, Timestamp}; /// [`ExecutionPlanMetricsSet`]. /// /// ``` -/// use datafusion::physical_plan::metrics::*; +/// use datafusion_physical_plan::metrics::*; /// /// let metrics = ExecutionPlanMetricsSet::new(); /// assert!(metrics.clone_inner().output_rows().is_none()); diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 1de23d4b89267..f1ec0a68a6e76 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -509,109 +509,19 @@ impl RecordBatchStream for ProjectionStream { mod tests { use super::*; use crate::common::collect; - use crate::expressions::{self, col}; - use crate::test::{self}; - use crate::test_util; + use crate::expressions; + use crate::test; use arrow_schema::DataType; use datafusion_common::ScalarValue; - use datafusion_expr::Operator; - use datafusion_physical_expr::expressions::binary; - use futures::future; - use tempfile::TempDir; - - // Create a binary expression without coercion. Used here when we do not want to coerce the expressions - // to valid types. Usage can result in an execution (after plan) error. 
- fn binary_simple( - l: Arc, - op: Operator, - r: Arc, - input_schema: &Schema, - ) -> Arc { - binary(l, op, r, input_schema).unwrap() - } - - #[tokio::test] - async fn project_first_column() -> Result<()> { - let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); - - let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - - // pick column c1 and name it column c1 in the output schema - let projection = - ProjectionExec::try_new(vec![(col("c1", &schema)?, "c1".to_string())], csv)?; - - let col_field = projection.schema.field(0); - let col_metadata = col_field.metadata(); - let data: &str = &col_metadata["testing"]; - assert_eq!(data, "test"); - - let mut partition_count = 0; - let mut row_count = 0; - for partition in 0..projection.output_partitioning().partition_count() { - partition_count += 1; - let stream = projection.execute(partition, task_ctx.clone())?; - - row_count += stream - .map(|batch| { - let batch = batch.unwrap(); - assert_eq!(1, batch.num_columns()); - batch.num_rows() - }) - .fold(0, |acc, x| future::ready(acc + x)) - .await; - } - assert_eq!(partitions, partition_count); - assert_eq!(100, row_count); - - Ok(()) - } - - #[tokio::test] - async fn project_input_not_partitioning() -> Result<()> { - let schema = test_util::aggr_test_schema(); - - let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - - // pick column c1 and name it column c1 in the output schema - let projection = - ProjectionExec::try_new(vec![(col("c1", &schema)?, "c1".to_string())], csv)?; - assert!(!projection.benefits_from_input_partitioning()[0]); - Ok(()) - } - - #[tokio::test] - async fn project_input_partitioning() -> Result<()> { - let schema = test_util::aggr_test_schema(); - - let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - - let c1 = col("c2", &schema).unwrap(); - let c2 = col("c9", &schema).unwrap(); - let c1_plus_c2 = binary_simple(c1, Operator::Plus, c2, &schema); - - let projection = - ProjectionExec::try_new(vec![(c1_plus_c2, "c2 + c9".to_string())], csv)?; - - assert!(projection.benefits_from_input_partitioning()[0]); - Ok(()) - } #[tokio::test] async fn project_no_column() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(1, tmp_dir.path())?; - let expected = collect(csv.execute(0, task_ctx.clone())?).await.unwrap(); + let exec = test::scan_partitioned(1); + let expected = collect(exec.execute(0, task_ctx.clone())?).await.unwrap(); - let projection = ProjectionExec::try_new(vec![], csv)?; + let projection = ProjectionExec::try_new(vec![], exec)?; let stream = projection.execute(0, task_ctx.clone())?; let output = collect(stream).await.unwrap(); assert_eq!(output.len(), expected.len()); diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 44a7739a343c5..c10bfc78b117a 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -894,10 +894,8 @@ impl RecordBatchStream for PerPartitionStream { #[cfg(test)] mod tests { use super::*; - use crate::test::create_vec_batches; use crate::{ assert_batches_sorted_eq, - physical_plan::{collect, expressions::col, memory::MemoryExec}, test::{ assert_is_pending, exec::{ @@ -905,10 +903,12 @@ mod tests 
{ ErrorExec, MockExec, }, }, + {collect, expressions::col, memory::MemoryExec}, }; use arrow::array::{ArrayRef, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; + use arrow_array::UInt32Array; use datafusion_common::cast::as_string_array; use datafusion_common::exec_err; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; @@ -1398,4 +1398,23 @@ mod tests { Ok(()) } + + /// Create vector batches + fn create_vec_batches(schema: &Schema, n: usize) -> Vec { + let batch = create_batch(schema); + let mut vec = Vec::with_capacity(n); + for _ in 0..n { + vec.push(batch.clone()); + } + vec + } + + /// Create batch + fn create_batch(schema: &Schema) -> RecordBatch { + RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], + ) + .unwrap() + } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 695272767696d..7b4d239eebd00 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -900,59 +900,31 @@ mod tests { use arrow::array::*; use arrow::compute::SortOptions; use arrow::datatypes::*; - use datafusion_common::cast::{as_primitive_array, as_string_array}; + use datafusion_common::cast::as_primitive_array; use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::RuntimeConfig; use futures::FutureExt; use std::collections::HashMap; - use tempfile::TempDir; #[tokio::test] async fn test_in_mem_sort() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(partitions); let schema = csv.schema(); let sort_exec = Arc::new(SortExec::new( - vec![ - // c1 string column - PhysicalSortExpr { - expr: col("c1", &schema)?, - options: SortOptions::default(), - }, - // c2 uin32 column - PhysicalSortExpr { - expr: col("c2", &schema)?, - options: SortOptions::default(), - }, - // c7 uin8 column - PhysicalSortExpr { - expr: col("c7", &schema)?, - options: SortOptions::default(), - }, - ], + vec![PhysicalSortExpr { + expr: col("i", &schema)?, + options: SortOptions::default(), + }], Arc::new(CoalescePartitionsExec::new(csv)), )); let result = collect(sort_exec, task_ctx.clone()).await?; assert_eq!(result.len(), 1); - - let columns = result[0].columns(); - - let c1 = as_string_array(&columns[0])?; - assert_eq!(c1.value(0), "a"); - assert_eq!(c1.value(c1.len() - 1), "e"); - - let c2 = as_primitive_array::(&columns[1])?; - assert_eq!(c2.value(0), 1); - assert_eq!(c2.value(c2.len() - 1), 5,); - - let c7 = as_primitive_array::(&columns[6])?; - assert_eq!(c7.value(0), 15); - assert_eq!(c7.value(c7.len() - 1), 254,); + assert_eq!(result[0].num_rows(), 400); assert_eq!( task_ctx.runtime_env().memory_pool.reserved(), @@ -965,7 +937,7 @@ mod tests { #[tokio::test] async fn test_sort_spill() -> Result<()> { - // trigger spill there will be 4 batches with 5.5KB for each + // trigger spill w/ 100 batches let session_config = SessionConfig::new(); let sort_spill_reservation_bytes = session_config .options() @@ -980,57 +952,35 @@ mod tests { .with_runtime(runtime), ); - let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; - let schema = csv.schema(); + let partitions = 100; + let input = test::scan_partitioned(partitions); + let schema = 
input.schema(); let sort_exec = Arc::new(SortExec::new( - vec![ - // c1 string column - PhysicalSortExpr { - expr: col("c1", &schema)?, - options: SortOptions::default(), - }, - // c2 uin32 column - PhysicalSortExpr { - expr: col("c2", &schema)?, - options: SortOptions::default(), - }, - // c7 uin8 column - PhysicalSortExpr { - expr: col("c7", &schema)?, - options: SortOptions::default(), - }, - ], - Arc::new(CoalescePartitionsExec::new(csv)), + vec![PhysicalSortExpr { + expr: col("i", &schema)?, + options: SortOptions::default(), + }], + Arc::new(CoalescePartitionsExec::new(input)), )); let result = collect(sort_exec.clone(), task_ctx.clone()).await?; - assert_eq!(result.len(), 1); + assert_eq!(result.len(), 2); // Now, validate metrics let metrics = sort_exec.metrics().unwrap(); - assert_eq!(metrics.output_rows().unwrap(), 100); + assert_eq!(metrics.output_rows().unwrap(), 10000); assert!(metrics.elapsed_compute().unwrap() > 0); assert!(metrics.spill_count().unwrap() > 0); assert!(metrics.spilled_bytes().unwrap() > 0); let columns = result[0].columns(); - let c1 = as_string_array(&columns[0])?; - assert_eq!(c1.value(0), "a"); - assert_eq!(c1.value(c1.len() - 1), "e"); - - let c2 = as_primitive_array::(&columns[1])?; - assert_eq!(c2.value(0), 1); - assert_eq!(c2.value(c2.len() - 1), 5,); - - let c7 = as_primitive_array::(&columns[6])?; - assert_eq!(c7.value(0), 15); - assert_eq!(c7.value(c7.len() - 1), 254,); + let i = as_primitive_array::(&columns[0])?; + assert_eq!(i.value(0), 0); + assert_eq!(i.value(i.len() - 1), 81); assert_eq!( task_ctx.runtime_env().memory_pool.reserved(), @@ -1044,7 +994,7 @@ mod tests { #[tokio::test] async fn test_sort_fetch_memory_calculation() -> Result<()> { // This test mirrors down the size from the example above. - let avg_batch_size = 4000; + let avg_batch_size = 400; let partitions = 4; // A tuple of (fetch, expect_spillage) @@ -1075,29 +1025,15 @@ mod tests { .with_session_config(session_config), ); - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path())?; + let csv = test::scan_partitioned(partitions); let schema = csv.schema(); let sort_exec = Arc::new( SortExec::new( - vec![ - // c1 string column - PhysicalSortExpr { - expr: col("c1", &schema)?, - options: SortOptions::default(), - }, - // c2 uin32 column - PhysicalSortExpr { - expr: col("c2", &schema)?, - options: SortOptions::default(), - }, - // c7 uin8 column - PhysicalSortExpr { - expr: col("c7", &schema)?, - options: SortOptions::default(), - }, - ], + vec![PhysicalSortExpr { + expr: col("i", &schema)?, + options: SortOptions::default(), + }], Arc::new(CoalescePartitionsExec::new(csv)), ) .with_fetch(fetch), diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 507d66c920fb5..b9e2c9662b14c 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -276,8 +276,8 @@ mod tests { use arrow::record_batch::RecordBatch; use datafusion_execution::config::SessionConfig; use futures::{FutureExt, StreamExt}; - use tempfile::TempDir; + use crate::assert_batches_eq; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::expressions::col; use crate::memory::MemoryExec; @@ -285,8 +285,7 @@ mod tests { use crate::sorts::sort::SortExec; use crate::stream::RecordBatchReceiverStream; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; - use crate::test::{self, 
assert_is_pending}; - use crate::{assert_batches_eq, test_util}; + use crate::test::{self, assert_is_pending, make_partition}; use crate::{collect, common}; use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; @@ -561,31 +560,16 @@ mod tests { async fn test_partition_sort() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path()).unwrap(); + let csv = test::scan_partitioned(partitions); let schema = csv.schema(); - let sort = vec![ - PhysicalSortExpr { - expr: col("c1", &schema).unwrap(), - options: SortOptions { - descending: true, - nulls_first: true, - }, - }, - PhysicalSortExpr { - expr: col("c2", &schema).unwrap(), - options: Default::default(), - }, - PhysicalSortExpr { - expr: col("c7", &schema).unwrap(), - options: SortOptions::default(), - }, - PhysicalSortExpr { - expr: col("c12", &schema).unwrap(), - options: SortOptions::default(), + let sort = vec![PhysicalSortExpr { + expr: col("i", &schema).unwrap(), + options: SortOptions { + descending: true, + nulls_first: true, }, - ]; + }]; let basic = basic_sort(csv.clone(), sort.clone(), task_ctx.clone()).await; let partition = partition_sort(csv, sort, task_ctx.clone()).await; @@ -634,8 +618,7 @@ mod tests { context: Arc, ) -> Result> { let partitions = 4; - let tmp_dir = TempDir::new()?; - let csv = test::scan_partitioned_csv(partitions, tmp_dir.path()).unwrap(); + let csv = test::scan_partitioned(partitions); let sorted = basic_sort(csv, sort, context).await; let split: Vec<_> = sizes.iter().map(|x| split_batch(&sorted, *x)).collect(); @@ -648,29 +631,11 @@ mod tests { #[tokio::test] async fn test_partition_sort_streaming_input() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); - let sort = vec![ - // uint8 - PhysicalSortExpr { - expr: col("c7", &schema).unwrap(), - options: Default::default(), - }, - // int16 - PhysicalSortExpr { - expr: col("c4", &schema).unwrap(), - options: Default::default(), - }, - // utf-8 - PhysicalSortExpr { - expr: col("c1", &schema).unwrap(), - options: SortOptions::default(), - }, - // utf-8 - PhysicalSortExpr { - expr: col("c13", &schema).unwrap(), - options: SortOptions::default(), - }, - ]; + let schema = make_partition(11).schema(); + let sort = vec![PhysicalSortExpr { + expr: col("i", &schema).unwrap(), + options: Default::default(), + }]; let input = sorted_partitioned_input(sort.clone(), &[10, 3, 11], task_ctx.clone()) @@ -678,8 +643,8 @@ mod tests { let basic = basic_sort(input.clone(), sort.clone(), task_ctx.clone()).await; let partition = sorted_merge(input, sort, task_ctx.clone()).await; - assert_eq!(basic.num_rows(), 300); - assert_eq!(partition.num_rows(), 300); + assert_eq!(basic.num_rows(), 1200); + assert_eq!(partition.num_rows(), 1200); let basic = arrow::util::pretty::pretty_format_batches(&[basic]) .unwrap() @@ -695,20 +660,11 @@ mod tests { #[tokio::test] async fn test_partition_sort_streaming_input_output() -> Result<()> { - let schema = test_util::aggr_test_schema(); - - let sort = vec![ - // float64 - PhysicalSortExpr { - expr: col("c12", &schema).unwrap(), - options: Default::default(), - }, - // utf-8 - PhysicalSortExpr { - expr: col("c13", &schema).unwrap(), - options: Default::default(), - }, - ]; + let schema = make_partition(11).schema(); + let sort = vec![PhysicalSortExpr { + expr: col("i", &schema).unwrap(), + options: Default::default(), + }]; // Test streaming with 
default batch size let task_ctx = Arc::new(TaskContext::default()); @@ -725,10 +681,10 @@ mod tests { let merge = Arc::new(SortPreservingMergeExec::new(sort, input)); let merged = collect(merge, task_ctx).await.unwrap(); - assert_eq!(merged.len(), 14); + assert_eq!(merged.len(), 53); - assert_eq!(basic.num_rows(), 300); - assert_eq!(merged.iter().map(|x| x.num_rows()).sum::<usize>(), 300); + assert_eq!(basic.num_rows(), 1200); + assert_eq!(merged.iter().map(|x| x.num_rows()).sum::<usize>(), 1200); let basic = arrow::util::pretty::pretty_format_batches(&[basic]) .unwrap() @@ -826,9 +782,9 @@ mod tests { #[tokio::test] async fn test_async() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let schema = test_util::aggr_test_schema(); + let schema = make_partition(11).schema(); let sort = vec![PhysicalSortExpr { - expr: col("c12", &schema).unwrap(), + expr: col("i", &schema).unwrap(), options: SortOptions::default(), }]; diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs new file mode 100644 index 0000000000000..e0c612387470c --- /dev/null +++ b/datafusion/physical-plan/src/test.rs @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Utilities for testing datafusion-physical-plan + +use std::error::Error; +use std::pin::Pin; +use std::sync::Arc; +use std::{collections::HashMap, path::PathBuf}; + +use arrow_array::{ArrayRef, Int32Array, RecordBatch}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use futures::{Future, FutureExt}; + +use crate::memory::MemoryExec; +use crate::ExecutionPlan; + +pub mod exec; + +/// A macro to assert that one string is contained within another, with +/// a nice error message if it is not. +/// +/// Usage: `assert_contains!(actual, expected)` +/// +/// This is a macro so test error +/// messages appear on the same line as the failure. +/// +/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>) +#[macro_export] +macro_rules! assert_contains { + ($ACTUAL: expr, $EXPECTED: expr) => { + let actual_value: String = $ACTUAL.into(); + let expected_value: String = $EXPECTED.into(); + assert!( + actual_value.contains(&expected_value), + "Can not find expected in actual.\n\nExpected:\n{}\n\nActual:\n{}", + expected_value, + actual_value + ); + }; +} + +/// A macro to assert that one string is NOT contained within another, with +/// a nice error message if it is. +/// +/// Usage: `assert_not_contains!(actual, unexpected)` +/// +/// This is a macro so test error +/// messages appear on the same line as the failure. +/// +/// Both arguments must be convertible into Strings ([`Into`]<[`String`]>) +#[macro_export] +macro_rules!
assert_not_contains { + ($ACTUAL: expr, $UNEXPECTED: expr) => { + let actual_value: String = $ACTUAL.into(); + let unexpected_value: String = $UNEXPECTED.into(); + assert!( + !actual_value.contains(&unexpected_value), + "Found unexpected in actual.\n\nUnexpected:\n{}\n\nActual:\n{}", + unexpected_value, + actual_value + ); + }; +} + +/// Compares the pretty-formatted output of record batches with an +/// expected vector of strings. This is a macro so errors appear on +/// the correct line. +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called like this: +/// +/// `assert_batches_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! assert_batches_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let expected_lines: Vec<String> = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + let formatted = arrow::util::pretty::pretty_format_batches_with_options( + $CHUNKS, + &datafusion_common::format::DEFAULT_FORMAT_OPTIONS, + ) + .unwrap() + .to_string(); + + let actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Compares the pretty-formatted output of record batches with an +/// expected vector of strings in a way that order does not matter. +/// This is a macro so errors appear on the correct line. +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called like this: +/// +/// `assert_batches_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! assert_batches_sorted_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let mut expected_lines: Vec<String> = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + // sort except for header + footer + let num_lines = expected_lines.len(); + if num_lines > 3 { + expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + let formatted = arrow::util::pretty::pretty_format_batches_with_options( + $CHUNKS, + &datafusion_common::format::DEFAULT_FORMAT_OPTIONS, + ) + .unwrap() + .to_string(); + // fix for windows: `lines()` strips the trailing `\r` from `\r\n` line endings + + let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Returns the arrow test data directory, which is by default stored +/// in a git submodule rooted at `testing/data`. +/// +/// The default can be overridden by the optional environment +/// variable `ARROW_TEST_DATA` +/// +/// Panics when the directory cannot be found.
+/// +/// Example: +/// ``` +/// let testdata = datafusion_common::test_util::arrow_test_data(); +/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); +/// assert!(std::path::PathBuf::from(csvdata).exists()); +/// ``` +pub fn arrow_test_data() -> String { + match get_data_dir("ARROW_TEST_DATA", "../../testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get arrow data dir: {err}"), + } +} + +/// Returns the parquet test data directory, which is by default +/// stored in a git submodule rooted at +/// `parquet-testing/data`. +/// +/// The default can be overridden by the optional environment variable +/// `PARQUET_TEST_DATA` +/// +/// Panics when the directory cannot be found. +/// +/// Example: +/// ``` +/// let testdata = datafusion_common::test_util::parquet_test_data(); +/// let filename = format!("{}/binary.parquet", testdata); +/// assert!(std::path::PathBuf::from(filename).exists()); +/// ``` +pub fn parquet_test_data() -> String { + match get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get parquet data dir: {err}"), + } +} + +/// Returns a directory path for finding test data. +/// +/// udf_env: name of an environment variable +/// +/// submodule_data: fallback path (relative to CARGO_MANIFEST_DIR) +/// +/// Returns either: +/// The path referred to in `udf_env` if that variable is set and refers to a directory +/// The submodule_data directory relative to CARGO_MANIFEST_DIR +pub fn get_data_dir( + udf_env: &str, + submodule_data: &str, +) -> Result<PathBuf, Box<dyn Error>> { + // Try user defined env. + if let Ok(dir) = std::env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. + + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package", + // set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display(), + ).into()) + } +} + +/// Asserts that the given future is still pending.
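+///
+/// A usage sketch, assuming `FutureExt::boxed` (already imported above),
+/// which yields the pinned, boxed future this helper expects:
+///
+/// ```ignore
+/// use futures::FutureExt;
+///
+/// // a future that never resolves is still pending after being polled once
+/// let mut fut = futures::future::pending::<()>().boxed();
+/// assert_is_pending(&mut fut);
+/// ```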
+pub fn assert_is_pending<'a, T>(fut: &mut Pin<Box<dyn Future<Output = T> + Send + 'a>>) { + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + let poll = fut.poll_unpin(&mut cx); + + assert!(poll.is_pending()); +} + +/// Get the schema for the aggregate_test_* csv files +pub fn aggr_test_schema() -> SchemaRef { + let mut f1 = Field::new("c1", DataType::Utf8, false); + f1.set_metadata(HashMap::from_iter(vec![("testing".into(), "test".into())])); + let schema = Schema::new(vec![ + f1, + Field::new("c2", DataType::UInt32, false), + Field::new("c3", DataType::Int8, false), + Field::new("c4", DataType::Int16, false), + Field::new("c5", DataType::Int32, false), + Field::new("c6", DataType::Int64, false), + Field::new("c7", DataType::UInt8, false), + Field::new("c8", DataType::UInt16, false), + Field::new("c9", DataType::UInt32, false), + Field::new("c10", DataType::UInt64, false), + Field::new("c11", DataType::Float32, false), + Field::new("c12", DataType::Float64, false), + Field::new("c13", DataType::Utf8, false), + ]); + + Arc::new(schema) +} + +/// Returns a record batch with 3 columns of i32 in memory +pub fn build_table_i32( + a: (&str, &Vec<i32>), + b: (&str, &Vec<i32>), + c: (&str, &Vec<i32>), +) -> RecordBatch { + let schema = Schema::new(vec![ + Field::new(a.0, DataType::Int32, false), + Field::new(b.0, DataType::Int32, false), + Field::new(c.0, DataType::Int32, false), + ]); + + RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(a.1.clone())), + Arc::new(Int32Array::from(b.1.clone())), + Arc::new(Int32Array::from(c.1.clone())), + ], + ) + .unwrap() +} + +/// Returns a memory table scan wrapped around a record batch with 3 columns of i32 +pub fn build_table_scan_i32( + a: (&str, &Vec<i32>), + b: (&str, &Vec<i32>), + c: (&str, &Vec<i32>), +) -> Arc<dyn ExecutionPlan> { + let batch = build_table_i32(a, b, c); + let schema = batch.schema(); + Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None).unwrap()) +} + +/// Return a RecordBatch with a single Int32 array with values (0..sz) in a field named "i" +pub fn make_partition(sz: i32) -> RecordBatch { + let seq_start = 0; + let seq_end = sz; + let values = (seq_start..seq_end).collect::<Vec<_>>(); + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let arr = Arc::new(Int32Array::from(values)); + let arr = arr as ArrayRef; + + RecordBatch::try_new(schema, vec![arr]).unwrap() +} + +/// Returns a plan over `partitions` partitions, each containing a single batch of 100 rows +pub fn scan_partitioned(partitions: usize) -> Arc<dyn ExecutionPlan> { + Arc::new(mem_exec(partitions)) +} + +/// Returns a `MemoryExec` over `partitions` partitions, each containing a single batch of 100 rows (see `make_partition`) +pub fn mem_exec(partitions: usize) -> MemoryExec { + let data: Vec<Vec<RecordBatch>> = (0..partitions).map(|_| vec![make_partition(100)]).collect(); + + let schema = data[0][0].schema(); + let projection = None; + MemoryExec::try_new(&data, schema, projection).unwrap() +} diff --git a/datafusion/core/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs similarity index 98% rename from datafusion/core/src/test/exec.rs rename to datafusion/physical-plan/src/test/exec.rs index 44ce5cf3282b1..a1f40c7ba909c 100644 --- a/datafusion/core/src/test/exec.rs +++ b/datafusion/physical-plan/src/test/exec.rs @@ -31,20 +31,15 @@ use arrow::{ }; use futures::Stream; -use crate::physical_plan::{ - common, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, - SendableRecordBatchStream, Statistics, -}; -use crate::physical_plan::{expressions::PhysicalSortExpr, DisplayAs}; use crate::{ - error::{DataFusionError,
Result}, - physical_plan::stream::RecordBatchReceiverStream, -}; -use crate::{ - execution::context::TaskContext, physical_plan::stream::RecordBatchStreamAdapter, + common, stream::RecordBatchReceiverStream, stream::RecordBatchStreamAdapter, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, Statistics, }; +use datafusion_physical_expr::PhysicalSortExpr; -use datafusion_common::internal_err; +use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_execution::TaskContext; /// Index into the data that has been returned so far #[derive(Debug, Default, Clone)] diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 8e0d871e0e34a..af765e257db2f 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -604,19 +604,17 @@ mod tests { use super::*; use crate::test; - use crate::{physical_plan::collect, scalar::ScalarValue}; + use crate::collect; use arrow::record_batch::RecordBatch; - use tempfile::TempDir; + use datafusion_common::ScalarValue; #[tokio::test] async fn test_union_partitions() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); - let tmp_dir = TempDir::new()?; - - // Create csv's with different partitioning - let csv = test::scan_partitioned_csv(4, tmp_dir.path())?; - let csv2 = test::scan_partitioned_csv(5, tmp_dir.path())?; + // Create inputs with different partitioning + let csv = test::scan_partitioned(4); + let csv2 = test::scan_partitioned(5); let union_exec = Arc::new(UnionExec::new(vec![csv, csv2])); diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index d6ca35b5d9bef..2cf341d1fe600 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -195,14 +195,12 @@ impl ExecutionPlan for ValuesExec { #[cfg(test)] mod tests { use super::*; - - use crate::test::create_vec_batches; - use crate::test_util; + use crate::test::{self, make_partition}; use arrow_schema::{DataType, Field, Schema}; #[tokio::test] async fn values_empty_case() -> Result<()> { - let schema = test_util::aggr_test_schema(); + let schema = test::aggr_test_schema(); let empty = ValuesExec::try_new(schema, vec![]); assert!(empty.is_err()); Ok(()) @@ -210,33 +208,24 @@ mod tests { #[test] fn new_exec_with_batches() { - let schema = Arc::new(Schema::new(vec![Field::new( - "col0", - DataType::UInt32, - false, - )])); - let batches = create_vec_batches(&schema, 10); + let batch = make_partition(7); + let schema = batch.schema(); + let batches = vec![batch.clone(), batch]; + let _exec = ValuesExec::try_new_from_batches(schema, batches).unwrap(); } #[test] fn new_exec_with_batches_empty() { - let schema = Arc::new(Schema::new(vec![Field::new( - "col0", - DataType::UInt32, - false, - )])); + let batch = make_partition(7); + let schema = batch.schema(); let _ = ValuesExec::try_new_from_batches(schema, Vec::new()).unwrap_err(); } #[test] fn new_exec_with_batches_invalid_schema() { - let schema = Arc::new(Schema::new(vec![Field::new( - "col0", - DataType::UInt32, - false, - )])); - let batches = create_vec_batches(&schema, 10); + let batch = make_partition(7); + let batches = vec![batch.clone(), batch]; let invalid_schema = Arc::new(Schema::new(vec![ Field::new("col0", DataType::UInt32, false), diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 3f25275996389..2a2f8d6d211bf 100644 --- 
a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -359,30 +359,14 @@ pub(crate) fn window_ordering_equivalence( mod tests { use super::*; use crate::aggregates::AggregateFunction; - use crate::datasource::physical_plan::CsvExec; + use crate::collect; use crate::expressions::col; + use crate::test::assert_is_pending; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; - use crate::test::{self, assert_is_pending, csv_exec_sorted}; - use crate::{collect, ExecutionPlan}; - use arrow::array::*; use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, SchemaRef}; - use arrow::record_batch::RecordBatch; - use datafusion_common::cast::as_primitive_array; use datafusion_execution::TaskContext; - use datafusion_expr::{create_udaf, Accumulator, Volatility}; use futures::FutureExt; - use std::path::Path; - use tempfile::TempDir; - - fn create_test_schema( - partitions: usize, - work_dir: &Path, - ) -> Result<(Arc, SchemaRef)> { - let csv = test::scan_partitioned_csv(partitions, work_dir)?; - let schema = csv.schema(); - Ok((csv, schema)) - } fn create_test_schema2() -> Result { let a = Field::new("a", DataType::Int32, true); @@ -394,57 +378,6 @@ mod tests { Ok(schema) } - /// make PhysicalSortExpr with default options - fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr { - sort_expr_options(name, schema, SortOptions::default()) - } - - /// PhysicalSortExpr with specified options - fn sort_expr_options( - name: &str, - schema: &Schema, - options: SortOptions, - ) -> PhysicalSortExpr { - PhysicalSortExpr { - expr: col(name, schema).unwrap(), - options, - } - } - - #[tokio::test] - async fn test_get_partition_by_ordering() -> Result<()> { - let test_schema = create_test_schema2()?; - // Columns a,c are nullable whereas b,d are not nullable. - // Source is sorted by a ASC NULLS FIRST, b ASC NULLS FIRST, c ASC NULLS FIRST, d ASC NULLS FIRST - // Column e is not ordered. 
- let sort_exprs = vec![ - sort_expr("a", &test_schema), - sort_expr("b", &test_schema), - sort_expr("c", &test_schema), - sort_expr("d", &test_schema), - ]; - // Input is ordered by a,b,c,d - let input = csv_exec_sorted(&test_schema, sort_exprs, true); - let test_data = vec![ - (vec!["a", "b"], vec![0, 1]), - (vec!["b", "a"], vec![1, 0]), - (vec!["b", "a", "c"], vec![1, 0, 2]), - (vec!["d", "b", "a"], vec![2, 1]), - (vec!["d", "e", "a"], vec![2]), - ]; - for (pb_names, expected) in test_data { - let pb_exprs = pb_names - .iter() - .map(|name| col(name, &test_schema)) - .collect::>>()?; - assert_eq!( - get_ordered_partition_by_indices(&pb_exprs, &input), - expected - ); - } - Ok(()) - } - #[tokio::test] async fn test_calc_requirements() -> Result<()> { let schema = create_test_schema2()?; @@ -509,143 +442,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn window_function_with_udaf() -> Result<()> { - #[derive(Debug)] - struct MyCount(i64); - - impl Accumulator for MyCount { - fn state(&self) -> Result> { - Ok(vec![ScalarValue::Int64(Some(self.0))]) - } - - fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - let array = &values[0]; - self.0 += (array.len() - array.null_count()) as i64; - Ok(()) - } - - fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - let counts: &Int64Array = arrow::array::as_primitive_array(&states[0]); - if let Some(c) = &arrow::compute::sum(counts) { - self.0 += *c; - } - Ok(()) - } - - fn evaluate(&self) -> Result { - Ok(ScalarValue::Int64(Some(self.0))) - } - - fn size(&self) -> usize { - std::mem::size_of_val(self) - } - } - - let my_count = create_udaf( - "my_count", - vec![DataType::Int64], - Arc::new(DataType::Int64), - Volatility::Immutable, - Arc::new(|_| Ok(Box::new(MyCount(0)))), - Arc::new(vec![DataType::Int64]), - ); - - let task_ctx = Arc::new(TaskContext::default()); - let tmp_dir = TempDir::new()?; - let (input, schema) = create_test_schema(1, tmp_dir.path())?; - - let window_exec = Arc::new(WindowAggExec::try_new( - vec![create_window_expr( - &WindowFunction::AggregateUDF(Arc::new(my_count)), - "my_count".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Arc::new(WindowFrame::new(false)), - schema.as_ref(), - )?], - input, - schema.clone(), - vec![], - )?); - - let result: Vec = collect(window_exec, task_ctx).await?; - assert_eq!(result.len(), 1); - - let n_schema_fields = schema.fields().len(); - let columns = result[0].columns(); - - let count: &Int64Array = as_primitive_array(&columns[n_schema_fields])?; - assert_eq!(count.value(0), 100); - assert_eq!(count.value(99), 100); - Ok(()) - } - - #[tokio::test] - async fn window_function() -> Result<()> { - let task_ctx = Arc::new(TaskContext::default()); - let tmp_dir = TempDir::new()?; - let (input, schema) = create_test_schema(1, tmp_dir.path())?; - - let window_exec = Arc::new(WindowAggExec::try_new( - vec![ - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Count), - "count".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Arc::new(WindowFrame::new(false)), - schema.as_ref(), - )?, - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Max), - "max".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Arc::new(WindowFrame::new(false)), - schema.as_ref(), - )?, - create_window_expr( - &WindowFunction::AggregateFunction(AggregateFunction::Min), - "min".to_owned(), - &[col("c3", &schema)?], - &[], - &[], - Arc::new(WindowFrame::new(false)), - schema.as_ref(), - )?, - ], - input, - schema.clone(), - vec![], - )?); - 
- let result: Vec = collect(window_exec, task_ctx).await?; - assert_eq!(result.len(), 1); - - let n_schema_fields = schema.fields().len(); - let columns = result[0].columns(); - - // c3 is small int - - let count: &Int64Array = as_primitive_array(&columns[n_schema_fields])?; - assert_eq!(count.value(0), 100); - assert_eq!(count.value(99), 100); - - let max: &Int8Array = as_primitive_array(&columns[n_schema_fields + 1])?; - assert_eq!(max.value(0), 125); - assert_eq!(max.value(99), 125); - - let min: &Int8Array = as_primitive_array(&columns[n_schema_fields + 2])?; - assert_eq!(min.value(0), -117); - assert_eq!(min.value(99), -117); - - Ok(()) - } - #[tokio::test] async fn test_drop_cancel() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); From b762f3db8c18d25e891fe4276ed90abb4b963e62 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 Sep 2023 16:12:17 -0400 Subject: [PATCH 04/15] Update cargo.lock --- datafusion-cli/Cargo.lock | 265 ++++++++++++++++++++++++++++++-------- 1 file changed, 214 insertions(+), 51 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0c6c6846a89a8..5c92e933f5a00 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -75,6 +75,15 @@ dependencies = [ "libc", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anstyle" version = "1.0.3" @@ -87,6 +96,12 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + [[package]] name = "arrayvec" version = "0.7.4" @@ -172,7 +187,7 @@ dependencies = [ "chrono", "comfy-table", "half", - "lexical-core", + "lexical-core 0.8.5", "num", ] @@ -190,8 +205,8 @@ dependencies = [ "chrono", "csv", "csv-core", - "lazy_static", - "lexical-core", + "lazy_static 1.4.0", + "lexical-core 0.8.5", "regex", ] @@ -235,9 +250,9 @@ dependencies = [ "chrono", "half", "indexmap 2.0.0", - "lexical-core", + "lexical-core 0.8.5", "num", - "serde", + "serde 1.0.188", "serde_json", ] @@ -323,9 +338,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d495b6dc0184693324491a5ac05f559acc97bf937ab31d7a1c33dd0016be6d2b" +checksum = "bb42b2197bf15ccb092b62c74515dbd8b86d0effd934795f6687c93b6e679a2c" dependencies = [ "bzip2", "flate2", @@ -347,7 +362,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -438,7 +453,7 @@ dependencies = [ "bytes", "http", "http-body", - "lazy_static", + "lazy_static 1.4.0", "percent-encoding", "pin-project-lite", "tracing", @@ -556,7 +571,7 @@ dependencies = [ "http-body", "hyper", "hyper-rustls 0.23.2", - "lazy_static", + "lazy_static 1.4.0", "pin-project-lite", "rustls 0.20.9", "tokio", @@ -716,7 +731,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" dependencies = [ "arrayref", - "arrayvec", + "arrayvec 0.7.4", "cc", 
"cfg-if", "constant_time_eq", @@ -761,7 +776,7 @@ checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a" dependencies = [ "memchr", "regex-automata", - "serde", + "serde 1.0.188", ] [[package]] @@ -837,8 +852,8 @@ checksum = "defd4e7873dbddba6c7c91e199c7fcb946abc4a6a4ac3195400bcfb01b5de877" dependencies = [ "android-tzdata", "iana-time-zone", - "num-traits", - "serde", + "num-traits 0.2.16", + "serde 1.0.188", "windows-targets", ] @@ -925,6 +940,22 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "config" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1b9d958c2b1368a663f05538fc1b5975adce1e19f435acceae987aceeeb369" +dependencies = [ + "lazy_static 1.4.0", + "nom", + "rust-ini", + "serde 1.0.188", + "serde-hjson", + "serde_json", + "toml", + "yaml-rust", +] + [[package]] name = "const-random" version = "0.1.15" @@ -1012,7 +1043,7 @@ dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde 1.0.188", ] [[package]] @@ -1031,7 +1062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f34ba9a9bcb8645379e9de8cb3ecfcf4d1c85ba66d90deb3259206fa5aa193b" dependencies = [ "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -1241,6 +1272,7 @@ dependencies = [ "log", "parking_lot", "pin-project-lite", + "ptree", "rand", "rstest 0.18.2", "tempfile", @@ -1282,6 +1314,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "directories" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f51c5d4ddabd36886dd3e1438cb358cdcb0d7c499cb99cb4ac2e38e18b5cb210" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs" version = "4.0.0" @@ -1458,7 +1499,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" dependencies = [ - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -1532,7 +1573,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -1631,7 +1672,7 @@ checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" dependencies = [ "cfg-if", "crunchy", - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -1895,12 +1936,31 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76f033c7ad61445c5b347c7382dd1237847eb1bce590fe50365dcb33d546be73" + [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lexical-core" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" +dependencies = [ + "arrayvec 0.5.2", + "bitflags 1.3.2", + "cfg-if", + "ryu", + "static_assertions", +] + [[package]] name = "lexical-core" version = "0.8.5" @@ -1967,9 +2027,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.147" +version = "0.2.148" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" [[package]] name = "libm" @@ -1987,6 +2047,12 @@ dependencies 
= [ "libc", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.4.7" @@ -2110,6 +2176,17 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "5.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08959a387a676302eebf4ddbcbc611da04285579f76f88ee0506c63b1a61dd4b" +dependencies = [ + "lexical-core 0.7.6", + "memchr", + "version_check", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -2127,7 +2204,7 @@ dependencies = [ "num-integer", "num-iter", "num-rational", - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2138,7 +2215,7 @@ checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ "autocfg", "num-integer", - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2147,7 +2224,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" dependencies = [ - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2157,7 +2234,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2168,7 +2245,7 @@ checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" dependencies = [ "autocfg", "num-integer", - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2180,7 +2257,16 @@ dependencies = [ "autocfg", "num-bigint", "num-integer", - "num-traits", + "num-traits 0.2.16", +] + +[[package]] +name = "num-traits" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e5113e9fd4cc14ded8e499429f396a20f98c772a47cc8622a736e1ec843c31" +dependencies = [ + "num-traits 0.2.16", ] [[package]] @@ -2233,7 +2319,7 @@ dependencies = [ "reqwest", "ring", "rustls-pemfile", - "serde", + "serde 1.0.188", "serde_json", "snafu", "tokio", @@ -2260,7 +2346,7 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" dependencies = [ - "num-traits", + "num-traits 0.2.16", ] [[package]] @@ -2418,7 +2504,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -2508,13 +2594,29 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] +[[package]] +name = "ptree" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0de80796b316aec75344095a6d2ef68ec9b8f573b9e7adc821149ba3598e270" +dependencies = [ + "ansi_term", + "atty", + "config", + "directories", + "petgraph", + "serde 1.0.188", + "serde-value", + "tint", +] + [[package]] name = "quick-xml" version = "0.28.2" @@ -2522,7 +2624,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" dependencies = [ "memchr", - "serde", + "serde 1.0.188", ] [[package]] @@ -2663,7 +2765,7 @@ dependencies = [ "pin-project-lite", "rustls 0.21.7", "rustls-pemfile", - "serde", + "serde 1.0.188", "serde_json", "serde_urlencoded", "tokio", @@ -2745,10 +2847,16 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.33", "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e52c148ef37f8c375d49d5a73aa70713125b7f19095948a923f80afdeb22ec2" + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -2824,9 +2932,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.101.4" +version = "0.101.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d93931baf2d282fff8d3a532bbfd7653f734643161b87e3e01e59a04439bf0d" +checksum = "45a27e3b59326c16e23d30aeb7a36a24cc0d29e71d68ff611cdfb4a01d013bed" dependencies = [ "ring", "untrusted", @@ -2936,6 +3044,12 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" +[[package]] +name = "serde" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dad3f759919b92c3068c696c15c3d17238234498bbdcc80f2c469606f948ac8" + [[package]] name = "serde" version = "1.0.188" @@ -2945,6 +3059,28 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-hjson" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a4e0ea8a88553209f6cc6cfe8724ecad22e1acf372793c27d995290fe74f8" +dependencies = [ + "lazy_static 1.4.0", + "num-traits 0.1.43", + "regex", + "serde 0.8.23", +] + +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float", + "serde 1.0.188", +] + [[package]] name = "serde_derive" version = "1.0.188" @@ -2953,7 +3089,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -2964,7 +3100,7 @@ checksum = "2cc66a619ed80bf7a0f6b17dd063a84b88f6dea1813737cf469aef1d081142c2" dependencies = [ "itoa", "ryu", - "serde", + "serde 1.0.188", ] [[package]] @@ -2976,7 +3112,7 @@ dependencies = [ "form_urlencoded", "itoa", "ryu", - "serde", + "serde 1.0.188", ] [[package]] @@ -3142,7 +3278,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -3164,9 +3300,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "9caece70c63bfba29ec2fed841a09851b14a235c60010fa4de58089b6c025668" dependencies = [ "proc-macro2", "quote", @@ -3224,7 +3360,7 @@ checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -3245,7 +3381,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" dependencies = [ "deranged", - "serde", + "serde 1.0.188", 
"time-core", "time-macros", ] @@ -3265,6 +3401,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tint" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af24570664a3074673dbbf69a65bdae0ae0b72f2949b1adfbacb736ee4d6896" +dependencies = [ + "lazy_static 0.2.11", +] + [[package]] name = "tiny-keccak" version = "2.0.2" @@ -3315,7 +3460,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -3364,6 +3509,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde 1.0.188", +] + [[package]] name = "tower" version = "0.4.13" @@ -3413,7 +3567,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", ] [[package]] @@ -3455,9 +3609,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" @@ -3585,7 +3739,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", "wasm-bindgen-shared", ] @@ -3619,7 +3773,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.33", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3800,6 +3954,15 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + [[package]] name = "zeroize" version = "1.6.0" From a3f01e1b6ec2411f55a1c8f5fc07e285bcff80dd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 16:09:32 -0400 Subject: [PATCH 05/15] fix merge issue --- datafusion/core/Cargo.toml | 1 - datafusion/physical-plan/Cargo.toml | 2 +- datafusion/physical-plan/src/aggregates/mod.rs | 2 +- datafusion/physical-plan/src/repartition/mod.rs | 6 +----- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index b80a7fda5bdad..d68032e9e45cc 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -111,7 +111,6 @@ rand_distr = "0.4.3" regex = "1.5.4" rstest = "0.18.0" rust_decimal = { version = "1.27.0", features = ["tokio-pg"] } -termtree = "0.4.1" test-utils = { path = "../../test-utils" } thiserror = "1.0.37" tokio-postgres = "0.7.7" diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 98fd53b66150e..0e25020d826ff 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -54,7 +54,7 @@ parking_lot = "0.12" pin-project-lite = "^0.2.7" #[dev-dependencies] -ptree = "0.4.0" +termtree = "0.4.1" rand = "0.8" rstest = "0.18.0" tempfile = "3" diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 
9299d017513eb..53d735882702a 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -262,7 +262,7 @@ pub(crate) struct AggregationOrdering { #[derive(Debug)] pub struct AggregateExec { /// Aggregation mode (full, partial) - pub mode: AggregateMode, + mode: AggregateMode, /// Group by expressions pub group_by: PhysicalGroupBy, /// Aggregate expressions diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index c10bfc78b117a..fd69b347c80bf 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -1402,11 +1402,7 @@ mod tests { /// Create vector batches fn create_vec_batches(schema: &Schema, n: usize) -> Vec<RecordBatch> { let batch = create_batch(schema); - let mut vec = Vec::with_capacity(n); - for _ in 0..n { - vec.push(batch.clone()); - } - vec + (0..n).map(|_| batch.clone()).collect() } /// Create batch From 4fc9f3388a02b19b80211205ff2397cdde464d4d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 16:24:56 -0400 Subject: [PATCH 06/15] Make mode private --- .../combine_partial_final_agg.rs | 105 ++++++++---------- .../enforce_distribution.rs | 21 ++-- .../physical_optimizer/topk_aggregation.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 6 + 4 files changed, 64 insertions(+), 70 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs index de47f3fbee730..1f8d708a47707 100644 --- a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs @@ -50,74 +50,59 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate { _config: &ConfigOptions, ) -> Result<Arc<dyn ExecutionPlan>> { plan.transform_down(&|plan| { - let transformed = plan.as_any().downcast_ref::<AggregateExec>().and_then( - |AggregateExec { mode: final_mode, input: final_input, group_by: final_group_by, aggr_expr: final_aggr_expr, filter_expr: final_filter_expr, .. }| { if matches!( final_mode, AggregateMode::Final | AggregateMode::FinalPartitioned ) { final_input .as_any() .downcast_ref::<AggregateExec>() .and_then( |AggregateExec { mode: input_mode, input: partial_input, group_by: input_group_by, aggr_expr: input_aggr_expr, filter_expr: input_filter_expr, order_by_expr: input_order_by_expr, input_schema, ..
- }| { - if matches!(input_mode, AggregateMode::Partial) - && can_combine( - ( - final_group_by, - final_aggr_expr, - final_filter_expr, - ), - ( - input_group_by, - input_aggr_expr, - input_filter_expr, - ), - ) - { - let mode = if *final_mode == AggregateMode::Final - { - AggregateMode::Single - } else { - AggregateMode::SinglePartitioned - }; + let transformed = + plan.as_any() + .downcast_ref::<AggregateExec>() + .and_then(|agg_exec| { + if matches!( + agg_exec.mode(), + AggregateMode::Final | AggregateMode::FinalPartitioned + ) { + agg_exec + .input() + .as_any() + .downcast_ref::<AggregateExec>() + .and_then(|input_agg_exec| { + if matches!( + input_agg_exec.mode(), + AggregateMode::Partial + ) && can_combine( + ( + agg_exec.group_by(), + agg_exec.aggr_expr(), + agg_exec.filter_expr(), + ), + ( + input_agg_exec.group_by(), + input_agg_exec.aggr_expr(), + input_agg_exec.filter_expr(), + ), + ) { + let mode = + if agg_exec.mode() == &AggregateMode::Final { + AggregateMode::Single + } else { + AggregateMode::SinglePartitioned + }; AggregateExec::try_new( mode, - input_group_by.clone(), - input_aggr_expr.to_vec(), - input_filter_expr.to_vec(), - input_order_by_expr.to_vec(), - partial_input.clone(), - input_schema.clone(), + input_agg_exec.group_by().clone(), + input_agg_exec.aggr_expr().to_vec(), + input_agg_exec.filter_expr.to_vec(), + input_agg_exec.order_by_expr.to_vec(), + input_agg_exec.input().clone(), + input_agg_exec.input_schema().clone(), ) .ok() .map(Arc::new) } else { None } - }, - ) - } else { - None - } - }, - ); + }) + } else { + None + } + }); Ok(if let Some(transformed) = transformed { Transformed::Yes(transformed) diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 6752d1a10f566..77d6e7d7123de 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -509,7 +509,9 @@ fn reorder_aggregate_keys( match new_positions { None => Ok(PlanWithKeyRequirements::new(agg_plan)), Some(positions) => { - let new_partial_agg = if let Some(AggregateExec { + let new_partial_agg = if let Some(agg_exec) = + agg_exec.input().as_any().downcast_ref::<AggregateExec>() + /*AggregateExec { mode, group_by, aggr_expr, filter_expr, order_by_expr, input, input_schema, ..
}) = - agg_exec.input().as_any().downcast_ref::<AggregateExec>() + */ { - if matches!(mode, AggregateMode::Partial) { + if matches!(agg_exec.mode(), &AggregateMode::Partial) { let mut new_group_exprs = vec![]; for idx in positions.iter() { - new_group_exprs.push(group_by.expr()[*idx].clone()); + new_group_exprs + .push(agg_exec.group_by().expr()[*idx].clone()); } let new_partial_group_by = PhysicalGroupBy::new_single(new_group_exprs); @@ -532,11 +535,11 @@ Some(Arc::new(AggregateExec::try_new( AggregateMode::Partial, new_partial_group_by, - aggr_expr.clone(), - filter_expr.clone(), - order_by_expr.clone(), - input.clone(), - input_schema.clone(), + agg_exec.aggr_expr().to_vec(), + agg_exec.filter_expr().to_vec(), + agg_exec.order_by_expr().to_vec(), + agg_exec.input().clone(), + agg_exec.input_schema.clone(), )?)) } else { None diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index f862675bf2051..2563a6123963c 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -68,7 +68,7 @@ impl TopKAggregation { // We found what we want: clone, copy the limit down, and return modified node let mut new_aggr = AggregateExec::try_new( - aggr.mode, + aggr.mode().clone(), aggr.group_by.clone(), aggr.aggr_expr.clone(), aggr.filter_expr.clone(), diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 53d735882702a..2ba517aab64c2 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -713,6 +713,12 @@ impl AggregateExec { &self.mode } + /// Set the mode of this AggregateExec to `mode` + pub fn with_mode(mut self, mode: AggregateMode) -> Self { + self.mode = mode; + self + } + /// Grouping expressions pub fn group_expr(&self) -> &PhysicalGroupBy { &self.group_by From 202e3b1e250674460fda8902f08c5db46a4f3bb3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 16:31:11 -0400 Subject: [PATCH 07/15] Make the other fields private --- .../combine_partial_final_agg.rs | 4 ++-- .../physical_optimizer/topk_aggregation.rs | 20 +++++++++---------- .../physical-plan/src/aggregates/mod.rs | 17 ++++++++-------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs index 1f8d708a47707..40b2bcc3e140e 100644 --- a/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs +++ b/datafusion/core/src/physical_optimizer/combine_partial_final_agg.rs @@ -88,8 +88,8 @@ impl PhysicalOptimizerRule for CombinePartialFinalAggregate { mode, input_agg_exec.group_by().clone(), input_agg_exec.aggr_expr().to_vec(), - input_agg_exec.filter_expr.to_vec(), - input_agg_exec.order_by_expr.to_vec(), + input_agg_exec.filter_expr().to_vec(), + input_agg_exec.order_by_expr().to_vec(), input_agg_exec.input().clone(), input_agg_exec.input_schema().clone(), ) diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index 2563a6123963c..7360e7d5c3a1a 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -56,7 +56,7 @@ impl TopKAggregation { if !kt.is_primitive() && kt != DataType::Utf8 { return None; } - if
aggr.filter_expr.iter().any(|e| e.is_some()) { + if aggr.filter_expr().iter().any(|e| e.is_some()) { return None; } @@ -67,17 +67,17 @@ impl TopKAggregation { } // We found what we want: clone, copy the limit down, and return modified node - let mut new_aggr = AggregateExec::try_new( + let new_aggr = AggregateExec::try_new( aggr.mode().clone(), - aggr.group_by.clone(), - aggr.aggr_expr.clone(), - aggr.filter_expr.clone(), - aggr.order_by_expr.clone(), - aggr.input.clone(), - aggr.input_schema.clone(), + aggr.group_by().clone(), + aggr.aggr_expr().to_vec(), + aggr.filter_expr().to_vec(), + aggr.order_by_expr().to_vec(), + aggr.input().clone(), + aggr.input_schema().clone(), ) - .expect("Unable to copy Aggregate!"); - new_aggr.limit = Some(limit); + .expect("Unable to copy Aggregate!") + .with_limit(Some(limit)); Some(Arc::new(new_aggr)) } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 2ba517aab64c2..43ba99c68d151 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -264,15 +264,15 @@ pub struct AggregateExec { /// Aggregation mode (full, partial) mode: AggregateMode, /// Group by expressions - pub group_by: PhysicalGroupBy, + group_by: PhysicalGroupBy, /// Aggregate expressions - pub aggr_expr: Vec<Arc<dyn AggregateExpr>>, + aggr_expr: Vec<Arc<dyn AggregateExpr>>, /// FILTER (WHERE clause) expression for each aggregate expression - pub filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>, + filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>, /// (ORDER BY clause) expression for each aggregate expression - pub order_by_expr: Vec<Option<LexOrdering>>, + order_by_expr: Vec<Option<LexOrdering>>, /// Set if the output of this aggregation is truncated by a upstream sort/limit clause - pub limit: Option<usize>, + limit: Option<usize>, /// Input plan, could be a partial aggregate or the input to the aggregate pub input: Arc<dyn ExecutionPlan>, /// Schema after the aggregate is applied @@ -713,12 +713,11 @@ impl AggregateExec { &self.mode } - /// Set the mode of this AggregateExec to `mode` - pub fn with_mode(mut self, mode: AggregateMode) -> Self { - self.mode = mode; + /// Set the `limit` of this AggExec + pub fn with_limit(mut self, limit: Option<usize>) -> Self { + self.limit = limit; self } - /// Grouping expressions pub fn group_expr(&self) -> &PhysicalGroupBy { &self.group_by From ab180ad7a77b83fd170bb0977ff3c2fcff027685 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 16:40:24 -0400 Subject: [PATCH 08/15] cleanup --- .../physical-plan/src/repartition/mod.rs | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index fd69b347c80bf..579b1b3d57b32 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -920,7 +920,7 @@ mod tests { async fn one_to_many_round_robin() -> Result<()> { // define input partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let partitions = vec![partition]; // repartition from 1 input to 4 output @@ -940,7 +940,7 @@ async fn many_to_one_round_robin() -> Result<()> { // define input partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let partitions = vec![partition.clone(), partition.clone(), partition.clone()]; // repartition from 3 input to 1 output @@ -957,7 +957,7 @@ async fn many_to_many_round_robin() -> Result<()> { // define input
partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let partitions = vec![partition.clone(), partition.clone(), partition.clone()]; // repartition from 3 input to 5 output @@ -978,7 +978,7 @@ async fn many_to_many_hash_partition() -> Result<()> { // define input partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let partitions = vec![partition.clone(), partition.clone(), partition.clone()]; let output_partitions = repartition( @@ -1033,7 +1033,7 @@ tokio::spawn(async move { // define input partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let partitions = vec![partition.clone(), partition.clone(), partition.clone()]; @@ -1367,7 +1367,7 @@ async fn oom() -> Result<()> { // define input partitions let schema = test_schema(); - let partition = create_vec_batches(&schema, 50); + let partition = create_vec_batches(50); let input_partitions = vec![partition]; let partitioning = Partitioning::RoundRobinBatch(4); @@ -1400,15 +1400,16 @@ } /// Create vector batches - fn create_vec_batches(schema: &Schema, n: usize) -> Vec<RecordBatch> { - let batch = create_batch(schema); + fn create_vec_batches(n: usize) -> Vec<RecordBatch> { + let batch = create_batch(); (0..n).map(|_| batch.clone()).collect() } /// Create batch - fn create_batch(schema: &Schema) -> RecordBatch { + fn create_batch() -> RecordBatch { + let schema = test_schema(); RecordBatch::try_new( - Arc::new(schema.clone()), + schema, vec![Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], ) .unwrap() } From db33d987436fb61c6b404714979ff74da528658f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 17:02:44 -0400 Subject: [PATCH 09/15] avoid assert_batches duplication --- datafusion/common/src/lib.rs | 3 + datafusion/common/src/test_util.rs | 79 +++++++ .../physical-plan/src/aggregates/mod.rs | 7 +- .../physical-plan/src/joins/cross_join.rs | 3 +- .../physical-plan/src/joins/hash_join.rs | 4 +- .../src/joins/nested_loop_join.rs | 6 +- .../src/joins/sort_merge_join.rs | 6 +- .../physical-plan/src/repartition/mod.rs | 3 +- .../src/sorts/sort_preserving_merge.rs | 2 +- datafusion/physical-plan/src/test.rs | 222 +----------------- 10 files changed, 99 insertions(+), 236 deletions(-) diff --git a/datafusion/common/src/lib.rs index 5f02d92e50c9b..420bcd963c305 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -63,6 +63,9 @@ pub use table_reference::{OwnedTableReference, ResolvedTableReference, TableRefe pub use unnest::UnnestOptions; pub use utils::project_schema; +/// Reexport arrow crate +pub use arrow; + /// Downcast an Arrow Array to a concrete type, return an `DataFusionError::Internal` if the cast is /// not possible. In normal usage of DataFusion the downcast should always succeed. /// diff --git a/datafusion/common/src/test_util.rs index 048b0d58d8c4d..60f1df7fd11ac 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -19,6 +19,85 @@ use std::{error::Error, path::PathBuf}; +/// Compares formatted output of a record batch with an expected +/// vector of strings, with the result of pretty formatting record +/// batches.
This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// +/// `assert_batches_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! assert_batches_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let expected_lines: Vec<String> = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options( + $CHUNKS, + &$crate::format::DEFAULT_FORMAT_OPTIONS, + ) + .unwrap() + .to_string(); + + let actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Compares formatted output of a record batch with an expected +/// vector of strings in a way that order does not matter. +/// This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// +/// `assert_batches_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! assert_batches_sorted_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let mut expected_lines: Vec<String> = + $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + + // sort except for header + footer + let num_lines = expected_lines.len(); + if num_lines > 3 { + expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options( + $CHUNKS, + &$crate::format::DEFAULT_FORMAT_OPTIONS, + ) + .unwrap() + .to_string(); + // fix for windows: \r\n --> + + let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + /// A macro to assert that one string is contained within another with /// a nice error message if they are not.
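For context on how these relocated macros are used, here is a minimal sketch of a typical caller (the batch construction and expected table below are illustrative, not taken from this patch series):

    use std::sync::Arc;
    use arrow_array::{ArrayRef, Int32Array, RecordBatch};
    use datafusion_common::assert_batches_eq;

    // Build a one-column batch, then compare it against its expected
    // pretty-printed form; on mismatch the macro prints both tables so
    // the correct output can be copy/pasted back into the test.
    let batch = RecordBatch::try_from_iter(vec![(
        "a",
        Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
    )])
    .unwrap();
    assert_batches_eq!(
        &["+---+", "| a |", "+---+", "| 1 |", "| 2 |", "+---+"],
        &[batch]
    );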
/// diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index ac93621e36d6f..d7813f3186a51 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -1275,11 +1275,11 @@ mod tests { }; use crate::coalesce_batches::CoalesceBatchesExec; use crate::coalesce_partitions::CoalescePartitionsExec; + use crate::common; use crate::expressions::{col, Avg}; use crate::memory::MemoryExec; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{assert_is_pending, mem_exec}; - use crate::{assert_batches_eq, assert_batches_sorted_eq, common}; use crate::{ DisplayAs, ExecutionPlan, Partitioning, RecordBatchStream, SendableRecordBatchStream, Statistics, @@ -1289,7 +1289,10 @@ mod tests { use arrow::compute::{concat_batches, SortOptions}; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; - use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; + use datafusion_common::{ + assert_batches_eq, assert_batches_sorted_eq, internal_err, DataFusionError, + Result, ScalarValue, + }; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_physical_expr::expressions::{ lit, ApproxDistinct, Column, Count, FirstValue, LastValue, Median, diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 90d84282fd0dd..4ba29524b3e2f 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -457,10 +457,9 @@ impl CrossJoinStream { #[cfg(test)] mod tests { use super::*; - use crate::assert_batches_sorted_eq; - use crate::assert_contains; use crate::common; use crate::test::build_table_scan_i32; + use datafusion_common::{assert_batches_sorted_eq, assert_contains}; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; async fn join_collect( diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 43ade366f3257..8e204634f3d94 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -1062,13 +1062,13 @@ mod tests { use arrow::array::{ArrayRef, Date32Array, Int32Array, UInt32Builder, UInt64Builder}; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::ScalarValue; + use datafusion_common::{assert_batches_sorted_eq, assert_contains, ScalarValue}; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::Literal; use hashbrown::raw::RawTable; use crate::{ - assert_batches_sorted_eq, assert_contains, common, + common, expressions::Column, hash_utils::create_hashes, joins::{hash_join::build_equal_condition_join_indices, utils::JoinSide}, diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index a0222f84a70e9..c49c16dba3130 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -741,8 +741,8 @@ impl RecordBatchStream for NestedLoopJoinStream { mod tests { use super::*; use crate::{ - assert_batches_sorted_eq, assert_contains, common, expressions::Column, - memory::MemoryExec, repartition::RepartitionExec, test::build_table_i32, + common, expressions::Column, memory::MemoryExec, repartition::RepartitionExec, + test::build_table_i32, }; use 
arrow::datatypes::{DataType, Field}; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; @@ -750,7 +750,7 @@ mod tests { use datafusion_physical_expr::expressions::BinaryExpr; use crate::joins::utils::JoinSide; - use datafusion_common::ScalarValue; + use datafusion_common::{assert_batches_sorted_eq, assert_contains, ScalarValue}; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr::PhysicalExpr; use std::sync::Arc; diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index bf2f977d820d6..4de723ab73ea5 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -1404,16 +1404,16 @@ mod tests { use datafusion_execution::config::SessionConfig; use datafusion_execution::TaskContext; - use crate::assert_contains; use crate::expressions::Column; use crate::joins::utils::JoinOn; use crate::joins::SortMergeJoinExec; use crate::memory::MemoryExec; use crate::test::build_table_i32; - use crate::{assert_batches_eq, assert_batches_sorted_eq}; use crate::{common, ExecutionPlan}; - use datafusion_common::JoinType; use datafusion_common::Result; + use datafusion_common::{ + assert_batches_eq, assert_batches_sorted_eq, assert_contains, JoinType, + }; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; fn build_table( diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 579b1b3d57b32..14b54dc0614d8 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -895,7 +895,6 @@ impl RecordBatchStream for PerPartitionStream { mod tests { use super::*; use crate::{ - assert_batches_sorted_eq, test::{ assert_is_pending, exec::{ @@ -910,7 +909,7 @@ mod tests { use arrow::record_batch::RecordBatch; use arrow_array::UInt32Array; use datafusion_common::cast::as_string_array; - use datafusion_common::exec_err; + use datafusion_common::{assert_batches_sorted_eq, exec_err}; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use futures::FutureExt; use std::collections::HashSet; diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index b9e2c9662b14c..6e81f43c3d3f0 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -277,7 +277,6 @@ mod tests { use datafusion_execution::config::SessionConfig; use futures::{FutureExt, StreamExt}; - use crate::assert_batches_eq; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::expressions::col; use crate::memory::MemoryExec; @@ -288,6 +287,7 @@ mod tests { use crate::test::{self, assert_is_pending, make_partition}; use crate::{collect, common}; use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; + use datafusion_common::assert_batches_eq; use super::*; diff --git a/datafusion/physical-plan/src/test.rs b/datafusion/physical-plan/src/test.rs index e0c612387470c..9e6312284c08f 100644 --- a/datafusion/physical-plan/src/test.rs +++ b/datafusion/physical-plan/src/test.rs @@ -17,10 +17,9 @@ //! 
Utilities for testing datafusion-physical-plan -use std::error::Error; +use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; -use std::{collections::HashMap, path::PathBuf}; use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema, SchemaRef}; @@ -31,225 +30,6 @@ use crate::ExecutionPlan; pub mod exec; -/// A macro to assert that one string is contained within another with -/// a nice error message if they are not. -/// -/// Usage: `assert_contains!(actual, expected)` -/// -/// Is a macro so test error -/// messages are on the same line as the failure; -/// -/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>) -#[macro_export] -macro_rules! assert_contains { - ($ACTUAL: expr, $EXPECTED: expr) => { - let actual_value: String = $ACTUAL.into(); - let expected_value: String = $EXPECTED.into(); - assert!( - actual_value.contains(&expected_value), - "Can not find expected in actual.\n\nExpected:\n{}\n\nActual:\n{}", - expected_value, - actual_value - ); - }; -} - -/// A macro to assert that one string is NOT contained within another with -/// a nice error message if they are are. -/// -/// Usage: `assert_not_contains!(actual, unexpected)` -/// -/// Is a macro so test error -/// messages are on the same line as the failure; -/// -/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>) -#[macro_export] -macro_rules! assert_not_contains { - ($ACTUAL: expr, $UNEXPECTED: expr) => { - let actual_value: String = $ACTUAL.into(); - let unexpected_value: String = $UNEXPECTED.into(); - assert!( - !actual_value.contains(&unexpected_value), - "Found unexpected in actual.\n\nUnexpected:\n{}\n\nActual:\n{}", - unexpected_value, - actual_value - ); - }; -} - -/// Compares formatted output of a record batch with an expected -/// vector of strings, with the result of pretty formatting record -/// batches. This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules! assert_batches_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let expected_lines: Vec<String> = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - let formatted = arrow::util::pretty::pretty_format_batches_with_options( - $CHUNKS, - &datafusion_common::format::DEFAULT_FORMAT_OPTIONS, - ) - .unwrap() - .to_string(); - - let actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} - -/// Compares formatted output of a record batch with an expected -/// vector of strings in a way that order does not matter. -/// This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules!
assert_batches_sorted_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let mut expected_lines: Vec<String> = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - // sort except for header + footer - let num_lines = expected_lines.len(); - if num_lines > 3 { - expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - let formatted = arrow::util::pretty::pretty_format_batches_with_options( - $CHUNKS, - &datafusion_common::format::DEFAULT_FORMAT_OPTIONS, - ) - .unwrap() - .to_string(); - // fix for windows: \r\n --> - - let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - // sort except for header + footer - let num_lines = actual_lines.len(); - if num_lines > 3 { - actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} - -/// Returns the arrow test data directory, which is by default stored -/// in a git submodule rooted at `testing/data`. -/// -/// The default can be overridden by the optional environment -/// variable `ARROW_TEST_DATA` -/// -/// panics when the directory can not be found. -/// -/// Example: -/// ``` -/// let testdata = datafusion_common::test_util::arrow_test_data(); -/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); -/// assert!(std::path::PathBuf::from(csvdata).exists()); -/// ``` -pub fn arrow_test_data() -> String { - match get_data_dir("ARROW_TEST_DATA", "../../testing/data") { - Ok(pb) => pb.display().to_string(), - Err(err) => panic!("failed to get arrow data dir: {err}"), - } -} - -/// Returns the parquet test data directory, which is by default -/// stored in a git submodule rooted at -/// `parquet-testing/data`. -/// -/// The default can be overridden by the optional environment variable -/// `PARQUET_TEST_DATA` -/// -/// panics when the directory can not be found. -/// -/// Example: -/// ``` -/// let testdata = datafusion_common::test_util::parquet_test_data(); -/// let filename = format!("{}/binary.parquet", testdata); -/// assert!(std::path::PathBuf::from(filename).exists()); -/// ``` -pub fn parquet_test_data() -> String { - match get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data") { - Ok(pb) => pb.display().to_string(), - Err(err) => panic!("failed to get parquet data dir: {err}"), - } -} - -/// Returns a directory path for finding test data. -/// -/// udf_env: name of an environment variable -/// -/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR) -/// -/// Returns either: -/// The path referred to in `udf_env` if that variable is set and refers to a directory -/// The submodule_data directory relative to CARGO_MANIFEST_PATH -pub fn get_data_dir( - udf_env: &str, - submodule_data: &str, -) -> Result<PathBuf, Box<dyn Error>> { - // Try user defined env. - if let Ok(dir) = std::env::var(udf_env) { - let trimmed = dir.trim().to_string(); - if !trimmed.is_empty() { - let pb = PathBuf::from(trimmed); - if pb.is_dir() { - return Ok(pb); - } else { - return Err(format!( - "the data dir `{}` defined by env {} not found", - pb.display(), - udf_env - ) - .into()); - } - } - } - - // The env is undefined or its value is trimmed to empty, let's try default dir.
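These deleted helpers are not lost: they already live in datafusion-common, so callers only need an import change. A hypothetical downstream test, assuming the testing/ and parquet-testing/ git submodules are checked out:

    use datafusion_common::test_util::{arrow_test_data, parquet_test_data};

    #[test]
    fn test_data_dirs_resolve() {
        // get_data_dir resolution order: the ARROW_TEST_DATA /
        // PARQUET_TEST_DATA env vars when set, otherwise the submodule
        // paths relative to CARGO_MANIFEST_DIR.
        let csv = format!("{}/csv/aggregate_test_100.csv", arrow_test_data());
        assert!(std::path::Path::new(&csv).exists());
        let parquet = format!("{}/binary.parquet", parquet_test_data());
        assert!(std::path::Path::new(&parquet).exists());
    }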
- - // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package", - // set by `cargo run` or `cargo test`, see: - // https://doc.rust-lang.org/cargo/reference/environment-variables.html - let dir = env!("CARGO_MANIFEST_DIR"); - - let pb = PathBuf::from(dir).join(submodule_data); - if pb.is_dir() { - Ok(pb) - } else { - Err(format!( - "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ - HINT: try running `git submodule update --init`", - udf_env, - pb.display(), - ).into()) - } -} - /// Asserts that given future is pending. pub fn assert_is_pending<'a, T>(fut: &mut Pin<Box<dyn Future<Output = T> + Send + 'a>>) { let waker = futures::task::noop_waker(); From df292a715e034a0fe8407a8d5e78bf22936951c7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Sep 2023 17:06:47 -0400 Subject: [PATCH 10/15] Remove duplicated macro definition --- datafusion/core/src/lib.rs | 4 ++ datafusion/core/src/test_util/mod.rs | 80 +--------------------------- 2 files changed, 6 insertions(+), 78 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 4f74888c840b1..576f66a5ed7c1 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -471,6 +471,10 @@ pub mod physical_plan { pub use datafusion_physical_plan::*; } +// Reexport testing macros for compatibility +pub use datafusion_common::assert_batches_eq; +pub use datafusion_common::assert_batches_sorted_eq; + /// re-export of [`datafusion_sql`] crate pub mod sql { pub use datafusion_sql::*; diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index ab29cecbb8eaa..bd52c3eedaa4f 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -50,84 +50,8 @@ pub use datafusion_common::test_util::{ arrow_test_data, get_data_dir, parquet_test_data, }; -/// Compares formatted output of a record batch with an expected -/// vector of strings, with the result of pretty formatting record -/// batches. This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules! assert_batches_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let expected_lines: Vec<String> = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options( - $CHUNKS, - &$crate::common::format::DEFAULT_FORMAT_OPTIONS, - ) - .unwrap() - .to_string(); - - let actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} - -/// Compares formatted output of a record batch with an expected -/// vector of strings in a way that order does not matter. -/// This is a macro so errors appear on the correct line -/// -/// Designed so that failure output can be directly copy/pasted -/// into the test code as expected results. -/// -/// Expects to be called about like this: -/// -/// `assert_batch_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` -#[macro_export] -macro_rules!
assert_batches_sorted_eq { - ($EXPECTED_LINES: expr, $CHUNKS: expr) => { - let mut expected_lines: Vec<String> = - $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); - - // sort except for header + footer - let num_lines = expected_lines.len(); - if num_lines > 3 { - expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - let formatted = $crate::arrow::util::pretty::pretty_format_batches_with_options( - $CHUNKS, - &$crate::common::format::DEFAULT_FORMAT_OPTIONS, - ) - .unwrap() - .to_string(); - // fix for windows: \r\n --> - - let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - // sort except for header + footer - let num_lines = actual_lines.len(); - if num_lines > 3 { - actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - assert_eq!( - expected_lines, actual_lines, - "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", - expected_lines, actual_lines - ); - }; -} +pub use datafusion_common::assert_batches_eq; +pub use datafusion_common::assert_batches_sorted_eq; /// Scan an empty data source, mainly used in tests pub fn scan_empty( From 01e5c85a9c8719cbe0574587a16fd395893c5ca5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Sep 2023 07:36:47 -0400 Subject: [PATCH 11/15] move dep --- datafusion-cli/Cargo.lock | 2 +- datafusion/core/Cargo.toml | 1 - datafusion/physical-plan/Cargo.toml | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 783d7c26a2375..0b5af8252c85b 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1078,7 +1078,6 @@ dependencies = [ "log", "num_cpus", "object_store", - "once_cell", "parking_lot", "parquet", "percent-encoding", @@ -1240,6 +1239,7 @@ dependencies = [ "indexmap 2.0.0", "itertools 0.11.0", "log", + "once_cell", "parking_lot", "pin-project-lite", "rand", diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 38f5e969ead2b..f414d875bf6c7 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -81,7 +81,6 @@ log = "^0.4" num-traits = { version = "0.2", optional = true } num_cpus = "1.13.0" object_store = "0.7.0" -once_cell = "1.18.0" parking_lot = "0.12" parquet = { workspace = true } percent-encoding = "2.2.0" diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 0e25020d826ff..1a1b80437d67f 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -50,6 +50,7 @@ hashbrown = { version = "0.14", features = ["raw"] } indexmap = "2.0.0" itertools = { version = "0.11", features = ["use_std"] } log = "^0.4" +once_cell = "1.18.0" parking_lot = "0.12" pin-project-lite = "^0.2.7" From f50e7e16510bae0a4c4a131ed1e1f3a3f3328539 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Sep 2023 09:17:42 -0400 Subject: [PATCH 12/15] clippy --- datafusion/core/src/physical_optimizer/topk_aggregation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index 7360e7d5c3a1a..4789226d7aa9c 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -68,7 +68,7 @@ impl TopKAggregation { // We found what we want: clone, copy the limit down, and return modified node let new_aggr = AggregateExec::try_new( - aggr.mode().clone(), + *aggr.mode(), aggr.group_by().clone(), aggr.aggr_expr().to_vec(),
aggr.filter_expr().to_vec(), From 4775782649a26d6b6db16e87381f5783ba7880ed Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Sep 2023 10:46:50 -0400 Subject: [PATCH 13/15] toml lint --- datafusion/physical-plan/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 1a1b80437d67f..02f154762760a 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -53,10 +53,10 @@ log = "^0.4" once_cell = "1.18.0" parking_lot = "0.12" pin-project-lite = "^0.2.7" - -#[dev-dependencies] -termtree = "0.4.1" rand = "0.8" rstest = "0.18.0" tempfile = "3" + +#[dev-dependencies] +termtree = "0.4.1" tokio = { version = "1.28", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] } From 4fdbeaac0b4d98b0b2f12630065c60ebcfd12617 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Sep 2023 11:39:59 -0400 Subject: [PATCH 14/15] Use timestamp_nanos_opt instead of deprecated timestamp_nanos --- datafusion-cli/Cargo.lock | 8 ++++---- datafusion/common/src/scalar.rs | 3 ++- datafusion/core/tests/parquet/mod.rs | 3 ++- datafusion/core/tests/sql/explain_analyze.rs | 9 ++++++--- .../src/simplify_expressions/simplify_exprs.rs | 4 ++-- datafusion/physical-expr/src/datetime_expressions.rs | 9 +++++---- datafusion/physical-plan/src/metrics/value.rs | 6 ++++-- .../physical-plan/src/sorts/sort_preserving_merge.rs | 12 ++++++++---- datafusion/proto/src/physical_plan/to_proto.rs | 8 +++++++- 9 files changed, 40 insertions(+), 22 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0b5af8252c85b..f2cd44a95706a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -831,9 +831,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.30" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defd4e7873dbddba6c7c91e199c7fcb946abc4a6a4ac3195400bcfb01b5de877" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" dependencies = [ "android-tzdata", "iana-time-zone", @@ -3445,9 +3445,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index fa2175c223388..32343b98fa247 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -4965,7 +4965,8 @@ mod tests { .unwrap() .and_hms_nano_opt(hour, minute, second, nanosec) .unwrap() - .timestamp_nanos(), + .timestamp_nanos_opt() + .unwrap(), ), None, )) diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 6f289e0c064bd..db7349851ba28 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -290,7 +290,8 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { offset_nanos + t.parse::<chrono::NaiveDateTime>() .unwrap() - .timestamp_nanos() + .timestamp_nanos_opt() + .unwrap() }) }) .collect::<Vec<_>>(); diff --git a/datafusion/core/tests/sql/explain_analyze.rs index f32ffc1642cda..06120c01ce864 100644 ---
a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -18,6 +18,7 @@ use super::*; use datafusion::config::ConfigOptions; use datafusion::physical_plan::display::DisplayableExecutionPlan; +use datafusion::physical_plan::metrics::Timestamp; #[tokio::test] async fn explain_analyze_baseline_metrics() { @@ -142,11 +143,11 @@ async fn explain_analyze_baseline_metrics() { metrics.iter().for_each(|m| match m.value() { MetricValue::StartTimestamp(ts) => { saw_start = true; - assert!(ts.value().unwrap().timestamp_nanos() > 0); + assert!(nanos_from_timestamp(ts) > 0); } MetricValue::EndTimestamp(ts) => { saw_end = true; - assert!(ts.value().unwrap().timestamp_nanos() > 0); + assert!(nanos_from_timestamp(ts) > 0); } _ => {} }); @@ -161,7 +162,9 @@ async fn explain_analyze_baseline_metrics() { datafusion::physical_plan::accept(physical_plan.as_ref(), &mut TimeValidator {}) .unwrap(); } - +fn nanos_from_timestamp(ts: &Timestamp) -> i64 { + ts.value().unwrap().timestamp_nanos_opt().unwrap() +} #[tokio::test] async fn csv_explain_plans() { // This test verify the look of each plan in its full cycle plan creation diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index e6d66720ee1b4..59c1c5a1266d3 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -473,8 +473,8 @@ mod tests { let expected = format!( "Projection: TimestampNanosecond({}, Some(\"+00:00\")) AS now(), TimestampNanosecond({}, Some(\"+00:00\")) AS t2\ \n TableScan: test", - time.timestamp_nanos(), - time.timestamp_nanos() + time.timestamp_nanos_opt().unwrap(), + time.timestamp_nanos_opt().unwrap() ); assert_eq!(expected, actual); diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index bf90a7da9c0bd..63f7645fea5db 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -168,7 +168,7 @@ pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result<ColumnarValue> { pub fn make_now( now_ts: DateTime<Utc>, ) -> impl Fn(&[ColumnarValue]) -> Result<ColumnarValue> { - let now_ts = Some(now_ts.timestamp_nanos()); + let now_ts = now_ts.timestamp_nanos_opt(); move |_arg| { Ok(ColumnarValue::Scalar(ScalarValue::TimestampNanosecond( now_ts, @@ -204,7 +204,7 @@ pub fn make_current_date( pub fn make_current_time( now_ts: DateTime<Utc>, ) -> impl Fn(&[ColumnarValue]) -> Result<ColumnarValue> { - let nano = Some(now_ts.timestamp_nanos() % 86400000000000); + let nano = now_ts.timestamp_nanos_opt().map(|ts| ts % 86400000000000); move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(nano))) } @@ -271,8 +271,9 @@ fn date_trunc_coarse(granularity: &str, value: i64) -> Result<i64> { return exec_err!("Unsupported date_trunc granularity: {unsupported}"); } }; + let value = value.and_then(|value| value.timestamp_nanos_opt()); // `with_x(0)` are infallible because `0` are always a valid - Ok(value.unwrap().timestamp_nanos()) + Ok(value.unwrap()) } // truncates a single value with the given timeunit to the specified granularity @@ -459,7 +460,7 @@ fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 }; } - bin_time.timestamp_nanos() + bin_time.timestamp_nanos_opt().unwrap() } fn to_utc_date_time(nanos: i64) -> DateTime<Utc> { diff --git a/datafusion/physical-plan/src/metrics/value.rs
b/datafusion/physical-plan/src/metrics/value.rs index 59b012f25a27d..899ceb60b49f7 100644 --- a/datafusion/physical-plan/src/metrics/value.rs +++ b/datafusion/physical-plan/src/metrics/value.rs @@ -430,11 +430,13 @@ impl MetricValue { Self::Time { time, .. } => time.value(), Self::StartTimestamp(timestamp) => timestamp .value() - .map(|ts| ts.timestamp_nanos() as usize) + .and_then(|ts| ts.timestamp_nanos_opt()) + .map(|nanos| nanos as usize) .unwrap_or(0), Self::EndTimestamp(timestamp) => timestamp .value() - .map(|ts| ts.timestamp_nanos() as usize) + .and_then(|ts| ts.timestamp_nanos_opt()) + .map(|nanos| nanos as usize) .unwrap_or(0), } } diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 6e81f43c3d3f0..6754a16331568 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -280,12 +280,12 @@ mod tests { use crate::coalesce_partitions::CoalescePartitionsExec; use crate::expressions::col; use crate::memory::MemoryExec; - use crate::metrics::MetricValue; + use crate::metrics::{MetricValue, Timestamp}; use crate::sorts::sort::SortExec; use crate::stream::RecordBatchReceiverStream; + use crate::{collect, common}; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, assert_is_pending, make_partition}; - use crate::{collect, common}; use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; use datafusion_common::assert_batches_eq; @@ -893,11 +893,11 @@ metrics.iter().for_each(|m| match m.value() { MetricValue::StartTimestamp(ts) => { saw_start = true; - assert!(ts.value().unwrap().timestamp_nanos() > 0); + assert!(nanos_from_timestamp(ts) > 0); } MetricValue::EndTimestamp(ts) => { saw_end = true; - assert!(ts.value().unwrap().timestamp_nanos() > 0); + assert!(nanos_from_timestamp(ts) > 0); } _ => {} }); @@ -906,6 +906,10 @@ assert!(saw_end); } + fn nanos_from_timestamp(ts: &Timestamp) -> i64 { + ts.value().unwrap().timestamp_nanos_opt().unwrap() + } + #[tokio::test] async fn test_drop_cancel() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 62221f9d754c2..a5b1300360fe7 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -422,10 +422,16 @@ impl TryFrom<&PartitionedFile> for protobuf::PartitionedFile { type Error = DataFusionError; fn try_from(pf: &PartitionedFile) -> Result<Self, Self::Error> { + let last_modified = pf.object_meta.last_modified; + let last_modified_ns = last_modified.timestamp_nanos_opt().ok_or_else(|| { + DataFusionError::Plan(format!( + "Invalid timestamp on PartitionedFile::ObjectMeta: {last_modified}" + )) + })?
as u64; Ok(protobuf::PartitionedFile { path: pf.object_meta.location.as_ref().to_owned(), size: pf.object_meta.size as u64, - last_modified_ns: pf.object_meta.last_modified.timestamp_nanos() as u64, + last_modified_ns, partition_values: pf .partition_values .iter() From 5121e47379c1da21e4c1af23b25dc984fa060b06 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Sep 2023 12:01:27 -0400 Subject: [PATCH 15/15] fmt --- datafusion/physical-plan/src/sorts/sort_preserving_merge.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 6754a16331568..5b485e0b68e41 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -283,9 +283,9 @@ mod tests { use crate::metrics::{MetricValue, Timestamp}; use crate::sorts::sort::SortExec; use crate::stream::RecordBatchReceiverStream; - use crate::{collect, common}; use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; use crate::test::{self, assert_is_pending, make_partition}; + use crate::{collect, common}; use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; use datafusion_common::assert_batches_eq;
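The recurring edit in the last two patches is mechanical: chrono 0.4.31 deprecates the panicking `timestamp_nanos` in favor of `timestamp_nanos_opt`, which returns `None` when the instant does not fit in an `i64` of nanoseconds (roughly outside the years 1677-2262). A minimal sketch of the before/after shape, illustrative rather than lifted from the diffs above:

    use chrono::{DateTime, Utc};

    fn nanos(ts: DateTime<Utc>) -> Option<i64> {
        // Before: `ts.timestamp_nanos()` panicked on out-of-range dates.
        // After: the overflow case surfaces as an Option, and each call
        // site decides whether to unwrap or to propagate an error, as
        // to_proto.rs does with ok_or_else above.
        ts.timestamp_nanos_opt()
    }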