diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b28bdf3513b33..36a620aaa06c0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -136,9 +136,9 @@ jobs: - name: Verify Working Directory Clean run: git diff --exit-code - # Run doc tests + # Run `cargo test doc` (test documentation examples) linux-test-doc: - name: cargo doctest (amd64) + name: cargo test doc (amd64) needs: [ linux-build-lib ] runs-on: ubuntu-latest container: @@ -157,6 +157,24 @@ jobs: - name: Verify Working Directory Clean run: git diff --exit-code + # Run `cargo doc` to ensure the rustdoc is clean + linux-rustdoc: + name: cargo doc + needs: [ linux-build-lib ] + runs-on: ubuntu-latest + container: + image: amd64/rust + steps: + - uses: actions/checkout@v3 + - name: Setup Rust toolchain + uses: ./.github/actions/setup-builder + with: + rust-version: stable + - name: Run cargo doc + run: | + export RUSTDOCFLAGS="-D warnings -A rustdoc::private-intra-doc-links" + cargo doc --document-private-items --no-deps --workspace + # verify that the benchmark queries return the correct results verify-benchmark-results: name: verify benchmark results (amd64) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 5973bf262ee74..b808f340d7d99 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -306,13 +306,19 @@ config_namespace! { /// Should DataFusion execute sorts in a per-partition fashion and merge /// afterwards instead of coalescing first and sorting globally. /// When this flag is enabled, plans in the form below + /// + /// ```text /// "SortExec: [a@0 ASC]", /// " CoalescePartitionsExec", /// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + /// ``` /// would turn into the plan below which performs better in multithreaded environments + /// + /// ```text /// "SortPreservingMergeExec: [a@0 ASC]", /// " SortExec: [a@0 ASC]", /// " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", + /// ``` pub repartition_sorts: bool, default = true /// When set to true, the logical plan optimizer will produce warning diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 34d09fdc1fdd5..2919d9a39c9c8 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -15,18 +15,29 @@ // specific language governing permissions and limitations // under the License. -//! This module provides common traits for visiting or rewriting tree nodes easily. +//! This module provides common traits for visiting or rewriting tree +//! data structures easily. use std::sync::Arc; use crate::Result; -/// Trait for tree node. It can be [`ExecutionPlan`], [`PhysicalExpr`], [`LogicalPlan`], [`Expr`], etc. +/// Defines a visitable and rewriteable tree node. This trait is +/// implemented for plans ([`ExecutionPlan`] and [`LogicalPlan`]) as +/// well as expression trees ([`PhysicalExpr`], [`Expr`]) in +/// DataFusion. +/// +/// +/// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html +/// [`PhysicalExpr`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.PhysicalExpr.html +/// [`LogicalPlan`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/enum.LogicalPlan.html +/// [`Expr`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/expr/enum.Expr.html pub trait TreeNode: Sized { - /// Use preorder to iterate the node on the tree so that we can stop fast for some cases.
+ /// Use preorder traversal to iterate over the tree so that we + /// can stop early in some cases. /// - /// [`op`] can be used to collect some info from the tree node - /// or do some checking for the tree node. + /// The `op` closure can be used to collect some info from the + /// tree node or to perform some checks on it. fn apply(&self, op: &mut F) -> Result where F: FnMut(&Self) -> Result, @@ -68,7 +79,8 @@ pub trait TreeNode: Sized { /// children of that node will be visited, nor is post_visit /// called on that node. For details, see [`TreeNodeVisitor`] /// - /// If using the default [`post_visit`] with nothing to do, the [`apply`] should be preferred + /// If using the default [`TreeNodeVisitor::post_visit`] that does + /// nothing, [`Self::apply`] should be preferred. fn visit>( &self, visitor: &mut V, ) -> Result { @@ -152,7 +164,8 @@ pub trait TreeNode: Sized { /// children of that node will be visited, nor is mutate /// called on that node /// - /// If using the default [`pre_visit`] with [`true`] returned, the [`transform`] should be preferred + /// If using the default [`TreeNodeRewriter::pre_visit`] which + /// returns `true`, [`Self::transform`] should be preferred. fn rewrite>(self, rewriter: &mut R) -> Result { let need_mutate = match rewriter.pre_visit(&self)? { RewriteRecursion::Mutate => return rewriter.mutate(self), @@ -190,8 +203,8 @@ pub trait TreeNode: Sized { /// tree and makes it easier to add new types of tree node and /// algorithms. /// -/// When passed to[`TreeNode::visit`], [`TreeNode::pre_visit`] -/// and [`TreeNode::post_visit`] are invoked recursively +/// When passed to [`TreeNode::visit`], [`TreeNodeVisitor::pre_visit`] +/// and [`TreeNodeVisitor::post_visit`] are invoked recursively /// on a node tree. /// /// If an [`Err`] result is returned, recursion is stopped @@ -239,7 +252,7 @@ pub trait TreeNodeRewriter: Sized { fn mutate(&mut self, node: Self::N) -> Result; } -/// Controls how the [TreeNode] recursion should proceed for [`rewrite`]. +/// Controls how the [`TreeNode`] recursion should proceed for [`TreeNode::rewrite`]. #[derive(Debug)] pub enum RewriteRecursion { /// Continue rewrite this node tree. @@ -252,7 +265,7 @@ pub enum RewriteRecursion { Skip, } -/// Controls how the [TreeNode] recursion should proceed for [`visit`]. +/// Controls how the [`TreeNode`] recursion should proceed for [`TreeNode::visit`]. #[derive(Debug)] pub enum VisitRecursion { /// Continue the visit to this node tree. diff --git a/datafusion/common/src/utils.rs b/datafusion/common/src/utils.rs index 2f468ff9c35c3..e29fd7e021a96 100644 --- a/datafusion/common/src/utils.rs +++ b/datafusion/common/src/utils.rs @@ -263,7 +263,7 @@ pub(crate) fn parse_identifiers(s: &str) -> Result> { Ok(idents) } -/// Construct a new Vec from the rows of the `arrays` at the `indices`. +/// Construct a new [`Vec`] of [`ArrayRef`] from the rows of the `arrays` at the `indices`. pub fn get_arrayref_at_indices( arrays: &[ArrayRef], indices: &PrimitiveArray, diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 94f7f9e39b1b7..d145729cf76f5 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -150,7 +150,7 @@ //! [`AggregateUDF`]: physical_plan::udaf::AggregateUDF //! [`QueryPlanner`]: execution::context::QueryPlanner //! [`OptimizerRule`]: datafusion_optimizer::optimizer::OptimizerRule -//! [`PhysicalOptimizerRule`]: datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule +//! 
[`PhysicalOptimizerRule`]: crate::physical_optimizer::optimizer::PhysicalOptimizerRule //! //! # Code Organization //! diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs index 1dbc59a63aa10..1421e25be7854 100644 --- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs +++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs @@ -158,6 +158,9 @@ pub fn check_finiteness_requirements( /// data pruning. For this to be possible, it needs to have a filter where /// all involved [`PhysicalExpr`]s, [`Operator`]s and data types support /// interval calculations. +/// +/// [`PhysicalExpr`]: crate::physical_plan::PhysicalExpr +/// [`Operator`]: datafusion_expr::Operator fn is_prunable(join: &SymmetricHashJoinExec) -> bool { join.filter().map_or(false, |filter| { check_support(filter.expression()) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index a30e0d9111f10..5fe3a72c69234 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -736,7 +736,7 @@ fn build_is_null_column_expr( /// expression that will evaluate to FALSE if it can be determined no /// rows between the min/max values could pass the predicates. /// -/// Returns the pruning predicate as an [`Expr`] +/// Returns the pruning predicate as a [`PhysicalExpr`] fn build_predicate_expression( expr: &Arc, schema: &Schema, diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 94b361cb95926..f9ca976299d6a 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -37,6 +37,8 @@ use std::sync::Arc; /// This is a "data class" we use within the [`EnforceSorting`] rule to push /// down [`SortExec`] in the plan. In some cases, we can reduce the total /// computational cost by pushing down `SortExec`s through some executors. +/// +/// [`EnforceSorting`]: crate::physical_optimizer::sort_enforcement::EnforceSorting #[derive(Debug, Clone)] pub(crate) struct SortPushDown { /// Current plan diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index 101b157a36840..7e4d0794b77a9 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -47,7 +47,9 @@ pub(crate) struct AggregateStream { /// /// This is wrapped into yet another struct because we need to interact with the async memory management subsystem /// during poll. To have as little code "weirdness" as possible, we chose to just use [`BoxStream`] together with -/// [`futures::stream::unfold`]. The latter requires a state object, which is [`GroupedHashAggregateStreamV2Inner`]. +/// [`futures::stream::unfold`]. +/// +/// The latter requires a state object, which is [`AggregateStreamInner`]. 
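The `unfold` pattern this comment refers to is easier to see in isolation. The following minimal sketch assumes only the `futures` crate; `CounterState` and `counter_stream` are illustrative names, not DataFusion APIs. It shows how a state object plus `futures::stream::unfold` yields a `BoxStream`:

```rust
use futures::stream::{self, BoxStream, StreamExt};

/// Illustrative stand-in for a stream state object like `AggregateStreamInner`.
struct CounterState {
    current: u64,
    limit: u64,
}

/// Builds a `BoxStream` with `stream::unfold`: the closure receives the state,
/// yields `Some((item, state))` to continue, or `None` to end the stream.
fn counter_stream(limit: u64) -> BoxStream<'static, u64> {
    stream::unfold(CounterState { current: 0, limit }, |mut state| async move {
        if state.current >= state.limit {
            None
        } else {
            let item = state.current;
            state.current += 1;
            Some((item, state))
        }
    })
    .boxed()
}
```

`AggregateStreamInner` plays the same role as `CounterState` here: it is the state value threaded through each invocation of the unfold closure.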
struct AggregateStreamInner { schema: SchemaRef, mode: AggregateMode, diff --git a/datafusion/core/src/physical_plan/file_format/file_stream.rs b/datafusion/core/src/physical_plan/file_format/file_stream.rs index 06e4f222831be..538798ead1e99 100644 --- a/datafusion/core/src/physical_plan/file_format/file_stream.rs +++ b/datafusion/core/src/physical_plan/file_format/file_stream.rs @@ -97,13 +97,13 @@ enum FileStreamState { /// Currently performing asynchronous IO to obtain a stream of RecordBatch /// for a given parquet file Open { - /// A [`FileOpenFuture`] returned by [`FormatReader::open`] + /// A [`FileOpenFuture`] returned by [`FileOpener::open`] future: FileOpenFuture, /// The partition values for this file partition_values: Vec, }, /// Scanning the [`BoxStream`] returned by the completion of a [`FileOpenFuture`] - /// returned by [`FormatReader::open`] + /// returned by [`FileOpener::open`] Scan { /// Partitioning column values for the current batch_iter partition_values: Vec, @@ -149,7 +149,7 @@ impl StartableTime { struct FileStreamMetrics { /// Wall clock time elapsed for file opening. /// - /// Time between when [`FileReader::open`] is called and when the + /// Time between when [`FileOpener::open`] is called and when the /// [`FileStream`] receives a stream for reading. /// /// If there are multiple files being scanned, the stream diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs b/datafusion/core/src/physical_plan/file_format/mod.rs index a50f3bf025435..5fffc628afc31 100644 --- a/datafusion/core/src/physical_plan/file_format/mod.rs +++ b/datafusion/core/src/physical_plan/file_format/mod.rs @@ -77,6 +77,8 @@ use super::{ColumnStatistics, Statistics}; /// different dictionary type. /// /// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same way. +/// +/// [`ListingTable`]: crate::datasource::listing::ListingTable pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType { DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type)) } diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index c69cdb7417f50..6d9e35c32e37d 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -462,7 +462,7 @@ fn make_output_ordering_string(ordering: &[PhysicalSortExpr]) -> String { w } -/// Implements [`FormatReader`] for a parquet file +/// Implements [`FileOpener`] for a parquet file struct ParquetOpener { partition_index: usize, projection: Arc<[usize]>, diff --git a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs index 78c9aca515611..0cfb79141eefa 100644 --- a/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs +++ b/datafusion/core/src/physical_plan/joins/symmetric_hash_join.rs @@ -263,7 +263,7 @@ impl SymmetricHashJoinExec { /// # Error /// This function errors when: /// - It is not possible to join the left and right sides on keys `on`, or - /// - It fails to construct [SortedFilterExpr]s, or + /// - It fails to construct `SortedFilterExpr`s, or /// - It fails to create the [ExprIntervalGraph]. 
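As a side note on the `wrap_partition_type_in_dict` hunk above, here is a usage sketch. The function body is copied verbatim from the diff; the `main` function and the `arrow::datatypes` import path are assumptions made to keep the example runnable:

```rust
use arrow::datatypes::DataType;

/// Copied from the hunk above: partition columns are dictionary-encoded with
/// `UInt16` keys, since every row read from one file repeats the same value.
fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
    DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
}

fn main() {
    // A Utf8 partition column becomes Dictionary(UInt16, Utf8).
    let wrapped = wrap_partition_type_in_dict(DataType::Utf8);
    assert_eq!(
        wrapped,
        DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8))
    );
}
```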
pub fn try_new( left: Arc, diff --git a/datafusion/core/src/physical_plan/joins/utils.rs b/datafusion/core/src/physical_plan/joins/utils.rs index 9e25e0353772b..412e774806efc 100644 --- a/datafusion/core/src/physical_plan/joins/utils.rs +++ b/datafusion/core/src/physical_plan/joins/utils.rs @@ -735,14 +735,15 @@ pub(crate) fn need_produce_result_in_final(join_type: JoinType) -> bool { ) } -/// In the end of join execution, need to use bit map of the matched indices to generate the final left and -/// right indices. +/// At the end of join execution, we need to use the bit map of the +/// matched indices to generate the final left and right indices. /// /// For example: -/// left_bit_map: [true, false, true, true, false] -/// join_type: `Left` /// -/// The result is: ([1,4], [null, null]) +/// 1. left_bit_map: `[true, false, true, true, false]` +/// 2. join_type: `Left` +/// +/// The result is: `([1,4], [null, null])` pub(crate) fn get_final_indices_from_bit_map( left_bit_map: &BooleanBufferBuilder, join_type: JoinType, diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 8ee32b8d04337..4bad535e521a1 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1292,8 +1292,8 @@ impl DefaultPhysicalPlanner { } } -/// Expand and align a GROUPING SET expression. -/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS) +/// Expand and align a GROUPING SET expression. +/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>) /// /// This will take a list of grouping sets and ensure that each group is /// properly aligned for the physical execution plan. We do this by @@ -1301,7 +1301,7 @@ impl DefaultPhysicalPlanner { /// group to the same set of expression types and ordering. /// For example, if we have something like `GROUPING SETS ((a,b,c),(a),(b),(b,c))` /// we would expand this to `GROUPING SETS ((a,b,c),(a,NULL,NULL),(NULL,b,NULL),(NULL,b,c))` -/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS) +/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>) fn merge_grouping_set_physical_expr( grouping_sets: &[Vec], input_dfschema: &DFSchema, @@ -1352,7 +1352,7 @@ fn merge_grouping_set_physical_expr( } /// Expand and align a CUBE expression. This is a special case of GROUPING SETS -/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS) +/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>) fn create_cube_physical_expr( exprs: &[Expr], input_dfschema: &DFSchema, @@ -1399,7 +1399,7 @@ } /// Expand and align a ROLLUP expression. This is a special case of GROUPING SETS -/// (see https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS) +/// (see <https://www.postgresql.org/docs/current/queries-table-expressions.html#QUERIES-GROUPING-SETS>) fn create_rollup_physical_expr( exprs: &[Expr], input_dfschema: &DFSchema, diff --git a/datafusion/core/src/physical_plan/repartition/distributor_channels.rs b/datafusion/core/src/physical_plan/repartition/distributor_channels.rs index 412926fbc61a3..d9466d647cf33 100644 --- a/datafusion/core/src/physical_plan/repartition/distributor_channels.rs +++ b/datafusion/core/src/physical_plan/repartition/distributor_channels.rs @@ -85,7 +85,7 @@ pub fn channels( /// Erroring during [send](DistributionSender::send). /// -/// This occurs when the [receiver](DistributedReceiver) is gone. +/// This occurs when the [receiver](DistributionReceiver) is gone. 
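To make the bit-map example in the `get_final_indices_from_bit_map` doc comment above concrete, here is a hedged sketch. It uses a plain `&[bool]` in place of Arrow's `BooleanBufferBuilder`, handles only the `Left` join case, and the function name is illustrative:

```rust
/// For a Left join, the final indices are the *unmatched* left rows
/// (bit not set), each paired with a null right index.
fn final_indices_for_left_join(left_bit_map: &[bool]) -> (Vec<u64>, Vec<Option<u64>>) {
    let left: Vec<u64> = left_bit_map
        .iter()
        .enumerate()
        .filter(|(_, matched)| !**matched)
        .map(|(i, _)| i as u64)
        .collect();
    let right = vec![None; left.len()];
    (left, right)
}

fn main() {
    // The worked example from the doc comment:
    // left_bit_map = [true, false, true, true, false], join_type = Left
    let (left, right) = final_indices_for_left_join(&[true, false, true, true, false]);
    assert_eq!(left, vec![1, 4]);
    assert_eq!(right, vec![None, None]); // ([1,4], [null, null])
}
```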
#[derive(PartialEq, Eq)] pub struct SendError(pub T); diff --git a/datafusion/core/src/scheduler/pipeline/mod.rs b/datafusion/core/src/scheduler/pipeline/mod.rs index 824a6950e025b..c1838cd1a49b8 100644 --- a/datafusion/core/src/scheduler/pipeline/mod.rs +++ b/datafusion/core/src/scheduler/pipeline/mod.rs @@ -70,6 +70,7 @@ pub mod repartition; /// from the physical plan as a single [`Pipeline`]. Parallelized implementations /// are also possible /// +/// [`ExecutionPlan`]: crate::physical_plan::ExecutionPlan pub trait Pipeline: Send + Sync + std::fmt::Debug { /// Push a [`RecordBatch`] to the given input partition fn push(&self, input: RecordBatch, child: usize, partition: usize) -> Result<()>; diff --git a/datafusion/core/src/scheduler/task.rs b/datafusion/core/src/scheduler/task.rs index ffa9728a4b4a9..bf10186ef2ad1 100644 --- a/datafusion/core/src/scheduler/task.rs +++ b/datafusion/core/src/scheduler/task.rs @@ -128,7 +128,8 @@ impl Task { } } - /// Call [`Pipeline::poll_partition`], attempting to make progress on query execution + /// Call [`Pipeline::poll_partition`][super::pipeline::Pipeline::poll_partition], + /// attempting to make progress on query execution pub fn do_work(self) { assert!(is_worker(), "Task::do_work called outside of worker pool"); if self.context.is_cancelled() { @@ -324,7 +325,9 @@ impl ExecutionContext { struct TaskWaker { /// Store a weak reference to the [`ExecutionContext`] to avoid reference cycles if this - /// [`Waker`] is stored within a [`Pipeline`] owned by the [`ExecutionContext`] + /// [`Waker`] is stored within a `Pipeline` owned by the [`ExecutionContext`] + /// + /// [`Waker`]: std::task::Waker context: Weak, /// A counter that stores the number of times this has been awoken @@ -338,9 +341,8 @@ struct TaskWaker { /// This ensures that a given [`Task`] is not enqueued multiple times /// /// We store an integer, as opposed to a boolean, so that wake ups that - /// occur during [`Pipeline::poll_partition`] can be detected and handled + /// occur during `Pipeline::poll_partition` can be detected and handled /// after it has finished executing - /// wake_count: AtomicUsize, /// The index of the pipeline within `query` to poll diff --git a/datafusion/execution/src/config.rs b/datafusion/execution/src/config.rs index 8f0094fadf508..81b7e8f0a601c 100644 --- a/datafusion/execution/src/config.rs +++ b/datafusion/execution/src/config.rs @@ -94,7 +94,7 @@ impl SessionConfig { /// Customize [`target_partitions`] /// - /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions + /// [`target_partitions`]: datafusion_common::config::ExecutionOptions::target_partitions pub fn with_target_partitions(mut self, n: usize) -> Self { // partition count must be greater than zero assert!(n > 0); @@ -104,7 +104,7 @@ impl SessionConfig { /// Get [`target_partitions`] /// - /// [`target_partitions`]: crate::config::ExecutionOptions::target_partitions + /// [`target_partitions`]: datafusion_common::config::ExecutionOptions::target_partitions pub fn target_partitions(&self) -> usize { self.options.execution.target_partitions } diff --git a/datafusion/execution/src/object_store.rs b/datafusion/execution/src/object_store.rs index c22ec34a60f32..d5b7e48d6f586 100644 --- a/datafusion/execution/src/object_store.rs +++ b/datafusion/execution/src/object_store.rs @@ -117,7 +117,8 @@ impl std::fmt::Display for ObjectStoreUrl { /// 2. 
Systems relying on ad-hoc discovery, without corresponding DDL, can create [`ObjectStore`] /// lazily by providing a custom implementation of [`ObjectStoreRegistry`] /// -/// [`ListingTableUrl`]: crate::datasource::listing::ListingTableUrl +/// +/// [`ListingTableUrl`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTableUrl.html pub trait ObjectStoreRegistry: Send + Sync + std::fmt::Debug + 'static { /// If a store with the same key existed before, it is replaced and returned fn register_store( diff --git a/datafusion/expr/src/tree_node/plan.rs b/datafusion/expr/src/tree_node/plan.rs index e948f90da8475..c7621bc178332 100644 --- a/datafusion/expr/src/tree_node/plan.rs +++ b/datafusion/expr/src/tree_node/plan.rs @@ -22,12 +22,14 @@ use datafusion_common::tree_node::{TreeNodeVisitor, VisitRecursion}; use datafusion_common::{tree_node::TreeNode, Result}; impl TreeNode for LogicalPlan { - /// Compared to the default implementation, we need to invoke [`apply_subqueries`] - /// before visiting its children fn apply(&self, op: &mut F) -> Result where F: FnMut(&Self) -> Result, { + // Note: + // + // Compared to the default implementation, we need to invoke + // [`Self::apply_subqueries`] before visiting its children. match op(self)? { VisitRecursion::Continue => {} // If the recursion should skip, do not apply to its children. And let the recursion continue @@ -61,13 +63,13 @@ impl TreeNode for LogicalPlan { /// visitor.post_visit(Filter) /// visitor.post_visit(Projection) /// ``` - /// - /// Compared to the default implementation, we need to invoke [`visit_subqueries`] - /// before visiting its children fn visit>( &self, visitor: &mut V, ) -> Result { + // Compared to the default implementation, we need to invoke + // [`Self::visit_subqueries`] before visiting its children. + match visitor.pre_visit(self)? { VisitRecursion::Continue => {} // If the recursion should skip, do not apply to its children. And let the recursion continue diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index a61e79fc24d92..3aec826d8121c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -69,7 +69,7 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result { } } -/// The power set (or powerset) of a set S is the set of all subsets of S, \ +/// The [power set] (or powerset) of a set S is the set of all subsets of S, \ /// including the empty set and S itself. /// /// Example: @@ -85,7 +85,7 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result { /// {x, y, z} \ /// and hence the power set of S is {{}, {x}, {y}, {z}, {x, y}, {x, z}, {y, z}, {x, y, z}}. /// -/// Reference: https://en.wikipedia.org/wiki/Power_set +/// [power set]: https://en.wikipedia.org/wiki/Power_set fn powerset(slice: &[T]) -> Result>, String> { if slice.len() >= 64 { return Err("The size of the set must be less than 64.".into()); } diff --git a/datafusion/jit/src/api.rs b/datafusion/jit/src/api.rs index a3989b5e5a9d4..4ac26438bc2b6 100644 --- a/datafusion/jit/src/api.rs +++ b/datafusion/jit/src/api.rs @@ -274,7 +274,7 @@ impl<'a> CodeBlock<'a> { unreachable!() } - /// Enter else block. Try [if_block] first which is much easier to use. + /// Enter else block. Try [`Self::if_block`] first, which is much easier to use. fn enter_else(&mut self) { self.fields.pop_back(); self.fields.push_back(HashMap::new()); @@ -367,7 +367,7 @@ impl<'a> CodeBlock<'a> { Ok(()) } - /// Enter `while` loop block. Try [while_block] first which is much easier to use. 
+ /// Enter `while` loop block. Try [`Self::while_block`] first, which is much easier to use. fn while_loop(&mut self, cond: Expr) -> Result { if cond.get_type() != BOOL { internal_err!("while condition must be bool") @@ -384,7 +384,7 @@ } } - /// Enter `if-then-else`'s then block. Try [if_block] first which is much easier to use. + /// Enter `if-then-else`'s then block. Try [`Self::if_block`] first, which is much easier to use. fn if_else(&mut self, cond: Expr) -> Result { if cond.get_type() != BOOL { internal_err!("if condition must be bool") diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index ed48da7fde98c..4520141de52b5 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -33,7 +33,8 @@ use crate::analyzer::AnalyzerRule; pub const COUNT_STAR: &str = "COUNT(*)"; /// Rewrite `Count(Expr:Wildcard)` to `Count(Expr:Literal)`. -/// Resolve issue: https://github.com/apache/arrow-datafusion/issues/5473. +/// +/// Resolves issue: <https://github.com/apache/arrow-datafusion/issues/5473> #[derive(Default)] pub struct CountWildcardRule {} diff --git a/datafusion/optimizer/src/utils.rs b/datafusion/optimizer/src/utils.rs index b52f1b1146a7b..4e5fa8cd6a5e1 100644 --- a/datafusion/optimizer/src/utils.rs +++ b/datafusion/optimizer/src/utils.rs @@ -487,7 +487,6 @@ pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema { /// from both the subquery and outer table or only from the outer table. /// /// Returns join predicates and subquery(extracted). -/// ``` pub(crate) fn extract_join_filters( maybe_filter: &LogicalPlan, ) -> Result<(Vec, LogicalPlan)> { diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 691b7081623ba..b3e37a8f92468 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -57,8 +57,9 @@ pub(crate) mod variance; /// * knows its accumulator's state's field /// * knows the expressions from which its accumulator will receive values /// -/// The aggregate expressions implement this trait also need to implement the [PartialEq] -/// This allows comparing the equality between the trait objects +/// Any implementation of this trait also needs to implement +/// `PartialEq` to allow comparing equality between the +/// trait objects. pub trait AggregateExpr: Send + Sync + Debug + PartialEq { /// Returns the aggregate expression as [`Any`](std::any::Any) so that it can be /// downcast to a specific implementation. diff --git a/datafusion/physical-expr/src/aggregate/utils.rs b/datafusion/physical-expr/src/aggregate/utils.rs index feabbd9cdc2d5..158ceb316e844 100644 --- a/datafusion/physical-expr/src/aggregate/utils.rs +++ b/datafusion/physical-expr/src/aggregate/utils.rs @@ -83,10 +83,13 @@ pub fn calculate_result_decimal_for_avg( } } -/// Downcast the [`Box`] or [`Arc`] and return -/// the inner trait object as [`Any`](std::any::Any) so that it can be downcast to a specific implementation. -/// This method is used when implementing the [`PartialEq'] for aggregation expressions and allows -/// comparing the equality between the trait objects. +/// Downcast a `Box` or `Arc` +/// and return the inner trait object as [`Any`](std::any::Any) so +/// that it can be downcast to a specific implementation. 
+/// +/// This method is used when implementing `PartialEq` +/// for [`AggregateExpr`] aggregation expressions and allows comparing +/// equality between the trait objects. pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any { if any.is::>() { any.downcast_ref::>() diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index c4d5bb9b5fee4..575050cfff08a 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! InList expression +//! Implementation of `InList` expressions: [`InListExpr`] use ahash::RandomState; use std::any::Any; @@ -134,8 +134,9 @@ where } } -/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there are nulls present -/// or there are more than [`OPTIMIZER_INSET_THRESHOLD`] values +/// Computes an [`ArrayHashSet`] for the provided [`Array`] if there +/// are nulls present or there are more than the configured number of +/// elements. /// /// Note: This is split into a separate function as higher-rank trait bounds currently /// cause type inference to misbehave diff --git a/datafusion/physical-expr/src/sort_expr.rs b/datafusion/physical-expr/src/sort_expr.rs index 1699d25bf2871..f19ae67004df3 100644 --- a/datafusion/physical-expr/src/sort_expr.rs +++ b/datafusion/physical-expr/src/sort_expr.rs @@ -180,6 +180,8 @@ impl PhysicalSortRequirement { /// /// This method takes `&'a PhysicalSortExpr` to make it easy to /// use when implementing [`ExecutionPlan::required_input_ordering`]. + /// + /// [`ExecutionPlan::required_input_ordering`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#method.required_input_ordering pub fn from_sort_exprs<'a>( ordering: impl IntoIterator, ) -> Vec { diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/sql/src/expr/arrow_cast.rs index 043d078d8d09c..607a9fb432f4a 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/sql/src/expr/arrow_cast.rs @@ -29,8 +29,8 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast"; /// Create an [`Expr`] that evaluates the `arrow_cast` function /// -/// This function is not a [`BuiltInScalarFunction`] because the -/// return type of [`BuiltInScalarFunction`] depends only on the +/// This function is not a [`BuiltinScalarFunction`] because the +/// return type of [`BuiltinScalarFunction`] depends only on the /// *types* of the arguments. However, the return type of `arrow_cast` depends on /// the *value* of its second argument. /// @@ -48,6 +48,7 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast"; /// ```sql /// select arrow_cast(column_x, 'Float64') /// ``` +/// [`BuiltinScalarFunction`]: datafusion_expr::BuiltinScalarFunction pub fn create_arrow_cast(mut args: Vec, schema: &DFSchema) -> Result { if args.len() != 2 { return Err(DataFusionError::Plan(format!( diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 4a8719331aac6..67fe8b1e99dbc 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -518,7 +518,8 @@ fn rewrite_placeholder(expr: &mut Expr, other: &Expr, schema: &DFSchema) -> Resu Ok(()) } -/// Find all [`Expr::PlaceHolder`] tokens in a logical plan, and try to infer their type from context +/// Find all [`Expr::Placeholder`] tokens in a logical plan, and try +/// to infer their [`DataType`] from the context of their use. 
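The placeholder inference described above is the kind of rewrite that is easiest to grasp on a toy model, so here is a self-contained sketch. The `Expr` and `DataType` definitions below are illustrative stand-ins, not DataFusion's actual types, and only the `column = $n` case is handled:

```rust
use std::collections::HashMap;

#[derive(Clone, Debug, PartialEq)]
enum DataType {
    Int64,
    Utf8,
}

#[derive(Clone, Debug)]
enum Expr {
    Column(String),
    Placeholder { id: String, data_type: Option<DataType> },
    Eq(Box<Expr>, Box<Expr>),
}

/// Give an untyped placeholder in `column = $n` the column's type from the
/// schema, mirroring (very loosely) what `infer_placeholder_types` does with
/// `Expr::transform`.
fn infer_placeholder_types(expr: Expr, schema: &HashMap<String, DataType>) -> Expr {
    match expr {
        Expr::Eq(left, right) => match (*left, *right) {
            (Expr::Column(name), Expr::Placeholder { id, data_type: None }) => {
                let inferred = schema.get(&name).cloned();
                Expr::Eq(
                    Box::new(Expr::Column(name)),
                    Box::new(Expr::Placeholder { id, data_type: inferred }),
                )
            }
            (l, r) => Expr::Eq(Box::new(l), Box::new(r)),
        },
        other => other,
    }
}

fn main() {
    let schema = HashMap::from([("a".to_string(), DataType::Int64)]);
    let expr = Expr::Eq(
        Box::new(Expr::Column("a".to_string())),
        Box::new(Expr::Placeholder { id: "$1".to_string(), data_type: None }),
    );
    // The placeholder comes back typed as Int64.
    println!("{:?}", infer_placeholder_types(expr, &schema));
}
```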
fn infer_placeholder_types(expr: Expr, schema: &DFSchema) -> Result { expr.transform(&|mut expr| { // Default to assuming the arguments are the same type diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index dc21c819421e2..4b754bcd8a2a8 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -63,7 +63,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.optimizer.allow_symmetric_joins_without_pruning | true | Should DataFusion allow symmetric hash joins for unbounded data sources even when its inputs do not have any ordering or filtering If the flag is not enabled, the SymmetricHashJoin operator will be unable to prune its internal buffers, resulting in certain join types - such as Full, Left, LeftAnti, LeftSemi, Right, RightAnti, and RightSemi - being produced only at the end of the execution. This is not typical in stream processing. Additionally, without proper design for long runner execution, all types of joins may encounter out-of-memory errors. | | datafusion.optimizer.repartition_file_scans | true | When set to true, file groups will be repartitioned to achieve maximum parallelism. Currently supported only for Parquet format in which case multiple row groups from the same file may be read concurrently. If false then each row group is read serially, though different files may be read in parallel. | | datafusion.optimizer.repartition_windows | true | Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level | -| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. With this flag is enabled, plans in the form below "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", would turn into the plan below which performs better in multithreaded environments "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", | +| datafusion.optimizer.repartition_sorts | true | Should DataFusion execute sorts in a per-partition fashion and merge afterwards instead of coalescing first and sorting globally. When this flag is enabled, plans in the form below `text "SortExec: [a@0 ASC]", " CoalescePartitionsExec", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` would turn into the plan below which performs better in multithreaded environments `text "SortPreservingMergeExec: [a@0 ASC]", " SortExec: [a@0 ASC]", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", ` | | datafusion.optimizer.skip_failed_rules | true | When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail | | datafusion.optimizer.max_passes | 3 | Number of times that the optimizer will attempt to optimize the plan | | datafusion.optimizer.top_down_join_key_reordering | true | When set to true, the physical plan optimizer will run a top down process to reorder the join keys |
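Taken together, the link fixes in this diff follow two patterns, both enforced by the new `linux-rustdoc` CI job. A hedged sketch with illustrative names (the `Visitor` item is not DataFusion's API):

```rust
/// Visits nodes in a tree.
///
/// Path-qualified intra-doc links such as [`Self::visit`] or
/// [`crate::Visitor`] are resolved by rustdoc at doc-build time; a broken
/// link is a warning that `RUSTDOCFLAGS="-D warnings"` turns into an error.
///
/// Items from crates that are not direct dependencies, such as
/// [`ExecutionPlan`], instead get an explicit link reference definition:
///
/// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
pub struct Visitor;

impl Visitor {
    /// Visits a single node. See also the type-level docs on [`Visitor`].
    pub fn visit(&self) {}
}
```

The `-A rustdoc::private-intra-doc-links` flag in the workflow keeps one exception: since the job passes `--document-private-items`, links from public docs to private items remain allowed rather than failing the build.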