From 6034ac826e90d032caf4714b7e9087328a064c56 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 28 Sep 2024 16:55:16 -0400 Subject: [PATCH 01/13] Initial work on #12432 to allow for generation of udf docs from embedded documentation in the code --- .github/workflows/rust.yml | 15 ++ datafusion-cli/Cargo.lock | 3 + datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/lib.rs | 8 +- datafusion/expr/src/udaf.rs | 72 ++++++- datafusion/expr/src/udf.rs | 133 ++++++++++++- datafusion/expr/src/udf_docs.rs | 64 ++++++ datafusion/expr/src/udwf.rs | 77 ++++++- datafusion/functions-aggregate/Cargo.toml | 1 + .../functions-aggregate/src/bit_and_or_xor.rs | 79 +++++++- datafusion/functions-window/src/row_number.rs | 19 +- datafusion/functions/Cargo.toml | 1 + datafusion/functions/src/core/coalesce.rs | 22 +- datafusion/functions/src/crypto/sha224.rs | 25 ++- datafusion/functions/src/datetime/to_date.rs | 57 +++++- datafusion/functions/src/encoding/inner.rs | 36 +++- datafusion/functions/src/math/log.rs | 29 ++- datafusion/functions/src/regex/regexplike.rs | 49 ++++- datafusion/functions/src/string/ascii.rs | 22 +- datafusion/functions/src/unicode/rpad.rs | 32 ++- dev/update_config_docs.sh | 6 +- docs/source/user-guide/expressions.md | 4 +- .../src/bin/print_aggregate_functions_docs.rs | 152 ++++++++++++++ .../src/bin/print_scalar_functions_docs.rs | 152 ++++++++++++++ .../src/bin/print_window_functions_docs.rs | 152 ++++++++++++++ .../datafusion/expr/src/udf_docs.rs | 64 ++++++ .../datafusion/dev/update_aggregate_docs.sh | 69 +++++++ .../dev/datafusion/dev/update_scalar_docs.sh | 67 +++++++ .../dev/datafusion/dev/update_window_docs.sh | 188 ++++++++++++++++++ 29 files changed, 1557 insertions(+), 42 deletions(-) create mode 100644 datafusion/expr/src/udf_docs.rs create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs create mode 100644 
wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh create mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index edaa49ec6e7ec..1d28989a21540 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -542,6 +542,21 @@ jobs: # If you encounter an error, run './dev/update_config_docs.sh' and commit ./dev/update_config_docs.sh git diff --exit-code + - name: Check if aggregate_functions.md has been modified + run: | + # If you encounter an error, run './dev/update_aggregate_docs.sh' and commit + ./dev/update_aggregate_docs.sh + git diff --exit-code + - name: Check if scalar_functions.md has been modified + run: | + # If you encounter an error, run './dev/update_scalar_docs.sh' and commit + ./dev/update_scalar_docs.sh + git diff --exit-code + - name: Check if window_functions.md has been modified + run: | + # If you encounter an error, run './dev/update_window_docs.sh' and commit + ./dev/update_window_docs.sh + git diff --exit-code # Verify MSRV for the crates which are directly used by other projects: # - datafusion diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index fbe7d5c04b9bf..179d410e185ee 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1345,6 +1345,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", + "indexmap", "paste", "serde_json", "sqlparser", @@ -1376,6 +1377,7 @@ dependencies = [ "datafusion-expr", 
"hashbrown", "hex", + "indexmap", "itertools", "log", "md-5", @@ -1400,6 +1402,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "half", + "indexmap", "log", "paste", "sqlparser", diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 55387fea22eeb..d7dc1afe4d505 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } +indexmap = { workspace = true } paste = "^1.0" serde_json = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 260065f69af98..aba2b1e38527b 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -34,6 +34,7 @@ mod partition_evaluator; mod table_source; mod udaf; mod udf; +mod udf_docs; mod udwf; pub mod conditional_expressions; @@ -90,9 +91,10 @@ pub use logical_plan::*; pub use partition_evaluator::PartitionEvaluator; pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; -pub use udaf::{AggregateUDF, AggregateUDFImpl, ReversedUDAF}; -pub use udf::{ScalarUDF, ScalarUDFImpl}; -pub use udwf::{WindowUDF, WindowUDFImpl}; +pub use udaf::{aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF}; +pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; +pub use udf_docs::{DocSection, Documentation, DOCUMENTATION_NONE, DOC_SECTION_NONE}; +pub use udwf::{window_doc_sections, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; #[cfg(test)] diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index e3ef672daf5ff..782e62618bf70 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -33,10 +33,11 @@ use 
crate::function::{ AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs, }; use crate::groups_accumulator::GroupsAccumulator; +use crate::udf_docs::DOCUMENTATION_NONE; use crate::utils::format_state_name; use crate::utils::AggregateOrderSensitivity; -use crate::Signature; use crate::{Accumulator, Expr}; +use crate::{Documentation, Signature}; /// Logical representation of a user-defined [aggregate function] (UDAF). /// @@ -248,6 +249,11 @@ impl AggregateUDF { pub fn default_value(&self, data_type: &DataType) -> Result { self.inner.default_value(data_type) } + + /// Returns this UDF's documentation that will be used to generate public documentation + pub fn documentation(&self) -> &Documentation { + self.inner.documentation() + } } impl From for AggregateUDF @@ -274,19 +280,31 @@ where /// # use std::any::Any; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr}; +/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr, Documentation}; /// # use datafusion_expr::{AggregateUDFImpl, AggregateUDF, Accumulator, function::{AccumulatorArgs, StateFieldsArgs}}; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_AGGREGATE; /// # use arrow::datatypes::Schema; /// # use arrow::datatypes::Field; +/// # use indexmap::IndexMap; +/// /// #[derive(Debug, Clone)] /// struct GeoMeanUdf { -/// signature: Signature +/// signature: Signature, +/// documentation: Documentation, /// } /// /// impl GeoMeanUdf { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable), +/// documentation: Documentation { +/// doc_section: DOC_SECTION_AGGREGATE, +/// description: "calculates a geometric mean", +/// syntax_example: "geo_mean(2.0)", +/// sql_example: None, +/// arguments: 
Some(IndexMap::from([("arg_1", "The Float64 number for the geometric mean")])), +/// related_udfs: None, +/// } /// } /// } /// } @@ -298,7 +316,7 @@ where /// fn signature(&self) -> &Signature { &self.signature } /// fn return_type(&self, args: &[DataType]) -> Result { /// if !matches!(args.get(0), Some(&DataType::Float64)) { -/// return plan_err!("add_one only accepts Float64 arguments"); +/// return plan_err!("geo_mean only accepts Float64 arguments"); /// } /// Ok(DataType::Float64) /// } @@ -310,6 +328,9 @@ where /// Field::new("ordering", DataType::UInt32, true) /// ]) /// } +/// fn documentation(&self) -> Documentation { +/// &self.documentation +/// } /// } /// /// // Create a new AggregateUDF from the implementation @@ -564,6 +585,12 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { fn default_value(&self, data_type: &DataType) -> Result { ScalarValue::try_from(data_type) } + + /// Returns the documentation for this Aggregate UDF for use + /// in generating publicly facing documentation. 
+ fn documentation(&self) -> &Documentation { + &DOCUMENTATION_NONE + } } impl PartialEq for dyn AggregateUDFImpl { @@ -710,6 +737,41 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl { fn is_descending(&self) -> Option { self.inner.is_descending() } + + fn documentation(&self) -> &Documentation { + self.inner.documentation() + } +} + +// Aggregate UDF doc sections for use in public documentation +pub mod aggregate_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_GENERAL, + DOC_SECTION_STATISTICAL, + DOC_SECTION_APPROXIMATE, + ] + } + + pub const DOC_SECTION_GENERAL: DocSection = DocSection { + include: true, + label: "General Functions", + description: None, + }; + + pub const DOC_SECTION_STATISTICAL: DocSection = DocSection { + include: true, + label: "Statistical Functions", + description: None, + }; + + pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection { + include: true, + label: "Approximate Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 938e1181d85d4..daa4b33cf3b37 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -20,7 +20,10 @@ use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; -use crate::{ColumnarValue, Expr, ScalarFunctionImplementation, Signature}; +use crate::udf_docs::DOCUMENTATION_NONE; +use crate::{ + ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature, +}; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; use datafusion_expr_common::interval_arithmetic::Interval; @@ -274,6 +277,11 @@ impl ScalarUDF { pub fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + /// Returns this UDF's documentation that will be used to generate public 
documentation + pub fn documentation(&self) -> &Documentation { + self.inner.documentation() + } } impl From for ScalarUDF @@ -299,18 +307,30 @@ where /// ``` /// # use std::any::Any; /// # use arrow::datatypes::DataType; +/// # use indexmap::IndexMap; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility}; +/// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility}; /// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF}; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// /// #[derive(Debug)] /// struct AddOne { -/// signature: Signature +/// signature: Signature, +/// documentation: Documentation, /// } /// /// impl AddOne { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), +/// documentation: Documentation { +/// doc_section: DOC_SECTION_MATH, +/// description: "Add one to an int32", +/// syntax_example: "add_one(2)", +/// sql_example: None, +/// arguments: Some(IndexMap::from([("arg_1", "The int32 number to add one to")])), +/// related_udfs: None, +/// } /// } /// } /// } @@ -328,6 +348,9 @@ where /// } /// // The actual implementation would add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } +/// fn documentation(&self) -> Documentation { +/// &self.documentation +/// } /// } /// /// // Create a new ScalarUDF from the implementation @@ -596,6 +619,12 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { self.signature().hash(hasher); hasher.finish() } + + /// Returns the documentation for this scalar UDF for use + /// in generating publicly facing documentation. + fn documentation(&self) -> &Documentation { + &DOCUMENTATION_NONE + } } /// ScalarUDF that adds an alias to the underlying function. 
It is better to @@ -709,4 +738,100 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.aliases.hash(hasher); hasher.finish() } + + fn documentation(&self) -> &Documentation { + self.inner.documentation() + } +} + +// Scalar UDF doc sections for use in public documentation +pub mod scalar_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_MATH, + DOC_SECTION_CONDITIONAL, + DOC_SECTION_STRING, + DOC_SECTION_BINARY_STRING, + DOC_SECTION_REGEX, + DOC_SECTION_DATETIME, + DOC_SECTION_ARRAY, + DOC_SECTION_STRUCT, + DOC_SECTION_MAP, + DOC_SECTION_HASHING, + DOC_SECTION_OTHER, + ] + } + + pub const DOC_SECTION_MATH: DocSection = DocSection { + include: true, + label: "Math Functions", + description: None, + }; + + pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection { + include: true, + label: "Conditional Functions", + description: None, + }; + + pub const DOC_SECTION_STRING: DocSection = DocSection { + include: true, + label: "String Functions", + description: None, + }; + + pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection { + include: true, + label: "Binary String Functions", + description: None, + }; + + pub const DOC_SECTION_REGEX: DocSection = DocSection { + include: true, + label: "Regular Expression Functions", + description: Some( + r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). 
+The following regular expression functions are supported:"#, + ), + }; + + pub const DOC_SECTION_DATETIME: DocSection = DocSection { + include: true, + label: "Time and Date Functions", + description: None, + }; + + pub const DOC_SECTION_ARRAY: DocSection = DocSection { + include: true, + label: "Array Functions", + description: None, + }; + + pub const DOC_SECTION_STRUCT: DocSection = DocSection { + include: true, + label: "Struct Functions", + description: None, + }; + + pub const DOC_SECTION_MAP: DocSection = DocSection { + include: true, + label: "Map Functions", + description: None, + }; + + pub const DOC_SECTION_HASHING: DocSection = DocSection { + include: true, + label: "Hashing Functions", + description: None, + }; + + pub const DOC_SECTION_OTHER: DocSection = DocSection { + include: true, + label: "Other Functions", + description: None, + }; } diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs new file mode 100644 index 0000000000000..32c03bda385f1 --- /dev/null +++ b/datafusion/expr/src/udf_docs.rs @@ -0,0 +1,64 @@ +use indexmap::IndexMap; + +/// Documentation for use by [`crate::ScalarUDFImpl`], +/// [`crate::AggregateUDFImpl`] and [`crate::WindowUDFImpl`] functions +/// that will be used to generate public documentation. +/// +/// The name of the udf will be pulled from the [`crate::ScalarUDFImpl::name`], +/// [`crate::AggregateUDFImpl::name`] or [`crate::WindowUDFImpl::name`] function +/// as appropriate. +/// +/// All strings in the documentation are required to be +/// in [markdown format](https://www.markdownguide.org/basic-syntax/). +/// +/// Currently, documentation only supports a single language +/// thus all text should be in English. 
+#[derive(Debug, Clone)] +pub struct Documentation { + /// the section in the documentation where the UDF will be documented + pub doc_section: DocSection, + /// the description for the UDF + pub description: &'static str, + pub syntax_example: &'static str, + /// a sql example for the UDF, usually in the form of a sql prompt + /// query and output. It is strongly recommended to provide an + /// example for anything but the most basic UDF's + pub sql_example: Option<&'static str>, + /// arguments for the UDF which will be displayed in insertion + /// order. Key is the argument name, value is a description for + /// the argument + pub arguments: Option>, + /// related functions if any. Values should match the related + /// udf's name exactly. Related udf's must be of the same + /// UDF type (scalar, aggregate or window) for proper linking to + /// occur + pub related_udfs: Option>, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct DocSection { + /// true to include this doc section in the public + /// documentation, false otherwise + pub include: bool, + /// a display label for the doc section. 
For example: "Math Expressions" + pub label: &'static str, + /// an optional description for the doc section + pub description: Option<&'static str>, +} + +pub const DOCUMENTATION_NONE: Documentation = Documentation { + doc_section: DOC_SECTION_NONE, + description: "", + syntax_example: "", + sql_example: None, + arguments: None, + related_udfs: None, +}; + +/// A doc section that indicated the UDF should not +/// be publicly documented +pub const DOC_SECTION_NONE: DocSection = DocSection { + include: false, + label: "", + description: None, +}; diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 7cc57523a14df..f94413f1a3cd5 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -32,8 +32,10 @@ use datafusion_common::{not_impl_err, Result}; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use crate::expr::WindowFunction; +use crate::udf_docs::DOCUMENTATION_NONE; use crate::{ - function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature, + function::WindowFunctionSimplification, Documentation, Expr, PartitionEvaluator, + Signature, }; /// Logical representation of a user-defined window function (UDWF) @@ -172,6 +174,11 @@ impl WindowUDF { pub fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + /// Returns this UDF's documentation that will be used to generate public documentation + pub fn documentation(&self) -> &Documentation { + self.inner.documentation() + } } impl From for WindowUDF @@ -198,28 +205,40 @@ where /// # use std::any::Any; /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt}; +/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation}; /// # use datafusion_expr::{WindowUDFImpl, WindowUDF}; -/// 
use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use indexmap::IndexMap; +/// /// #[derive(Debug, Clone)] /// struct SmoothIt { -/// signature: Signature +/// signature: Signature, +/// documentation: Documentation, /// } /// /// impl SmoothIt { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), +/// documentation: Documentation { +/// doc_section: DOC_SECTION_ANALYTICAL, +/// description: "smooths the windows", +/// syntax_example: "smooth_it(2)", +/// sql_example: None, +/// arguments: Some(IndexMap::from([("arg_1", "The int32 number to smooth by")])), +/// related_udfs: None, +/// } /// } /// } /// } /// -/// /// Implement the WindowUDFImpl trait for AddOne +/// /// Implement the WindowUDFImpl trait for SmoothIt /// impl WindowUDFImpl for SmoothIt { /// fn as_any(&self) -> &dyn Any { self } /// fn name(&self) -> &str { "smooth_it" } /// fn signature(&self) -> &Signature { &self.signature } -/// // The actual implementation would add one to the argument +/// // The actual implementation would smooth the window /// fn partition_evaluator(&self) -> Result> { unimplemented!() } /// fn field(&self, field_args: WindowUDFFieldArgs) -> Result { /// if let Some(DataType::Int32) = field_args.get_input_type(0) { @@ -228,6 +247,9 @@ where /// plan_err!("smooth_it only accepts Int32 arguments") /// } /// } +/// fn documentation(&self) -> &Documentation { +/// &self.documentation +/// } /// } /// /// // Create a new WindowUDF from the implementation @@ -351,6 +373,12 @@ pub trait WindowUDFImpl: Debug + Send + Sync { fn coerce_types(&self, _arg_types: &[DataType]) -> Result> { not_impl_err!("Function {} does not implement coerce_types", self.name()) } + + /// 
Returns the documentation for this window UDF for use + /// in generating publicly facing documentation. + fn documentation(&self) -> &Documentation { + &DOCUMENTATION_NONE + } } impl PartialEq for dyn WindowUDFImpl { @@ -439,6 +467,41 @@ impl WindowUDFImpl for AliasedWindowUDFImpl { fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + fn documentation(&self) -> &Documentation { + self.inner.documentation() + } +} + +// Window UDF doc sections for use in public documentation +pub mod window_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_AGGREGATE, + DOC_SECTION_RANKING, + DOC_SECTION_ANALYTICAL, + ] + } + + pub const DOC_SECTION_AGGREGATE: DocSection = DocSection { + include: true, + label: "Aggregate Functions", + description: Some("All aggregate functions can be used as window functions."), + }; + + pub const DOC_SECTION_RANKING: DocSection = DocSection { + include: true, + label: "Ranking Functions", + description: None, + }; + + pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection { + include: true, + label: "Analytical Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index d78f68a2604e7..1d3ec62a35ddc 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -48,6 +48,7 @@ datafusion-functions-aggregate-common = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } half = { workspace = true } +indexmap = { workspace = true } log = { workspace = true } paste = "1.0.14" sqlparser = { workspace = true } diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index aa65062e3330c..4307a6d68f5d6 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ 
b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -17,6 +17,7 @@ //! Defines `BitAnd`, `BitOr`, `BitXor` and `BitXor DISTINCT` aggregate accumulators +use indexmap::IndexMap; use std::any::Any; use std::collections::HashSet; use std::fmt::{Display, Formatter}; @@ -35,9 +36,11 @@ use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::INTEGERS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; @@ -110,8 +113,9 @@ macro_rules! downcast_bitwise_accumulator { /// `EXPR_FN` identifier used to name the generated expression function. /// `AGGREGATE_UDF_FN` is an identifier used to name the underlying UDAF function. /// `OPR_TYPE` is an expression that evaluates to the type of bitwise operation to be performed. +/// `DOCUMENTATION` documentation for the UDAF macro_rules! make_bitwise_udaf_expr_and_func { - ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr) => { + ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr, $DOCUMENTATION:expr) => { make_udaf_expr!( $EXPR_FN, expr_x, @@ -125,14 +129,65 @@ macro_rules! 
make_bitwise_udaf_expr_and_func { create_func!( $EXPR_FN, $AGGREGATE_UDF_FN, - BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN)) + BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN), $DOCUMENTATION) ); }; } -make_bitwise_udaf_expr_and_func!(bit_and, bit_and_udaf, BitwiseOperationType::And); -make_bitwise_udaf_expr_and_func!(bit_or, bit_or_udaf, BitwiseOperationType::Or); -make_bitwise_udaf_expr_and_func!(bit_xor, bit_xor_udaf, BitwiseOperationType::Xor); +make_bitwise_udaf_expr_and_func!( + bit_and, + bit_and_udaf, + BitwiseOperationType::And, + Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise AND of all non-null input values.", + syntax_example: "bit_and(expression)", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ])), + related_udfs: None, + } +); +make_bitwise_udaf_expr_and_func!( + bit_or, + bit_or_udaf, + BitwiseOperationType::Or, + Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise OR of all non-null input values.", + syntax_example: "bit_or(expression)", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ])), + related_udfs: None, + } +); +make_bitwise_udaf_expr_and_func!( + bit_xor, + bit_xor_udaf, + BitwiseOperationType::Xor, + Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise exclusive OR of all non-null input values.", + syntax_example: "bit_xor(expression)", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "expression", + "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ])), + related_udfs: None, + } +); /// The different types of bitwise operations that can be performed. #[derive(Debug, Clone, Eq, PartialEq)] @@ -155,14 +210,20 @@ struct BitwiseOperation { /// `operation` indicates the type of bitwise operation to be performed. operation: BitwiseOperationType, func_name: &'static str, + documentation: Documentation, } impl BitwiseOperation { - pub fn new(operator: BitwiseOperationType, func_name: &'static str) -> Self { + pub fn new( + operator: BitwiseOperationType, + func_name: &'static str, + documentation: Documentation, + ) -> Self { Self { operation: operator, signature: Signature::uniform(1, INTEGERS.to_vec(), Volatility::Immutable), func_name, + documentation, } } } @@ -239,6 +300,10 @@ impl AggregateUDFImpl for BitwiseOperation { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } struct BitAndAccumulator { diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index 7f348bf9d2a05..a50939639dc2f 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -28,7 +28,10 @@ use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::expr::WindowFunction; -use datafusion_expr::{Expr, PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; +use datafusion_expr::{ + Documentation, Expr, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, +}; use datafusion_functions_window_common::field; use field::WindowUDFFieldArgs; @@ -57,6 +60,7 @@ pub fn row_number_udwf() -> std::sync::Arc { #[derive(Debug)] pub struct RowNumber { signature: Signature, + documentation: 
Documentation, } impl RowNumber { @@ -64,6 +68,15 @@ impl RowNumber { pub fn new() -> Self { Self { signature: Signature::any(0, Volatility::Immutable), + documentation: Documentation { + doc_section: DOC_SECTION_RANKING, + description: + "Number of the current row within its partition, counting from 1.", + syntax_example: "row_number()", + sql_example: None, + arguments: None, + related_udfs: None, + }, } } } @@ -101,6 +114,10 @@ impl WindowUDFImpl for RowNumber { nulls_first: false, }) } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } /// State for the `row_number` built-in window function. diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index ff1b926a9b822..0b21be6821b01 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -76,6 +76,7 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } hashbrown = { workspace = true, optional = true } hex = { version = "0.4", optional = true } +indexmap = { workspace = true } itertools = { workspace = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 2fa6d7c197ad7..d2d4b04872e49 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -22,14 +22,17 @@ use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, ExprSchema, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::type_coercion::binary::type_union_resolution; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use indexmap::IndexMap; use itertools::Itertools; #[derive(Debug)] pub 
struct CoalesceFunc { signature: Signature, + documentation: Documentation, } impl Default for CoalesceFunc { @@ -42,6 +45,19 @@ impl CoalesceFunc { pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), + documentation: Documentation { + doc_section: DOC_SECTION_CONDITIONAL, + description: "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.", + syntax_example: "coalesce(expression1[, ..., expression_n])", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." + ), + ])), + related_udfs: None, + }, } } } @@ -140,6 +156,10 @@ impl ScalarUDFImpl for CoalesceFunc { .unwrap_or(arg_types.first().unwrap().clone()); Ok(vec![new_type; arg_types.len()]) } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } #[cfg(test)] diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 2795c4a250041..813f51aef3356 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -19,12 +19,17 @@ use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_HASHING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use indexmap::IndexMap; use std::any::Any; #[derive(Debug)] pub struct SHA224Func { signature: Signature, + documentation: Documentation, } impl Default for SHA224Func { fn default() -> Self { @@ -41,6 +46,19 @@ impl SHA224Func { vec![Utf8, 
LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), + documentation: Documentation { + doc_section: DOC_SECTION_HASHING, + description: "Computes the SHA-224 hash of a binary string.", + syntax_example: "sha224(expression)", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "expression", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." + ), + ])), + related_udfs: None, + } } } } @@ -60,7 +78,12 @@ impl ScalarUDFImpl for SHA224Func { fn return_type(&self, arg_types: &[DataType]) -> Result { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } + fn invoke(&self, args: &[ColumnarValue]) -> Result { sha224(args) } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 288641b84dd7e..5b72f7d30705d 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -17,19 +17,23 @@ use std::any::Any; +use crate::datetime::common::*; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Date32; use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; - -use crate::datetime::common::*; use datafusion_common::error::DataFusionError; use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use indexmap::IndexMap; #[derive(Debug)] pub struct ToDateFunc { signature: Signature, + documentation: Documentation, } impl Default for ToDateFunc { @@ -42,6 +46,49 @@ impl ToDateFunc { pub fn new() -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), + documentation: 
Documentation { + doc_section: DOC_SECTION_DATETIME, + description: r#"Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. +"#, + syntax_example: "to_date('2017-05-31', '%Y-%m-%d')", + sql_example: Some( + r#"```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +"#), + arguments: Some(IndexMap::from([ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ( + "format_n", + "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. 
If none of the formats successfully parse the expression + an error will be returned.", + ) + ])), + related_udfs: None, + } } } @@ -117,6 +164,10 @@ impl ScalarUDFImpl for ToDateFunc { } } } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } #[cfg(test)] diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 5b80c908cfc31..ba66f554d219c 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -28,16 +28,19 @@ use datafusion_common::{ }; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Documentation}; use std::sync::Arc; use std::{fmt, str::FromStr}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use indexmap::IndexMap; use std::any::Any; #[derive(Debug)] pub struct EncodeFunc { signature: Signature, + documentation: Documentation, } impl Default for EncodeFunc { @@ -50,6 +53,17 @@ impl EncodeFunc { pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), + documentation: Documentation { + doc_section: DOC_SECTION_BINARY_STRING, + description: "Encode binary data into a textual representation.", + syntax_example: "encode(expression, format)", + sql_example: None, + arguments: Some(IndexMap::from([ + ("expression", "Expression containing string or binary data"), + ("format", "Supported formats are: `base64`, `hex`"), + ])), + related_udfs: Some(vec!["decode"]), + }, } } } @@ -100,11 +114,16 @@ impl ScalarUDFImpl for EncodeFunc { ), } } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } #[derive(Debug)] pub struct DecodeFunc { signature: Signature, + documentation: Documentation, } impl Default for DecodeFunc { @@ -117,6 +136,17 @@ impl DecodeFunc { pub fn new() -> Self { Self { 
signature: Signature::user_defined(Volatility::Immutable), + documentation: Documentation { + doc_section: DOC_SECTION_BINARY_STRING, + description: "Decode binary data from textual representation in string.", + syntax_example: "decode(expression, format)", + sql_example: None, + arguments: Some(IndexMap::from([ + ("expression", "Expression containing encoded string data"), + ("format", "Same arguments as [encode](#encode)"), + ])), + related_udfs: Some(vec!["encode"]), + }, } } } @@ -167,6 +197,10 @@ impl ScalarUDFImpl for DecodeFunc { ), } } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } #[derive(Debug, Copy, Clone)] diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index ad7cff1f7149f..66869e803886f 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -29,14 +29,19 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; -use datafusion_expr::{lit, ColumnarValue, Expr, ScalarUDF, TypeSignature::*}; +use datafusion_expr::{ + lit, ColumnarValue, Documentation, Expr, ScalarUDF, TypeSignature::*, +}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use indexmap::IndexMap; #[derive(Debug)] pub struct LogFunc { signature: Signature, + documentation: Documentation, } impl Default for LogFunc { @@ -58,6 +63,24 @@ impl LogFunc { ], Volatility::Immutable, ), + documentation: Documentation { + doc_section: DOC_SECTION_MATH, + description: "Returns the base-x logarithm of a number. 
Can either provide a specified base, or if omitted then takes the base-10 of a number.", + syntax_example: r#"log(base, numeric_expression) +log(numeric_expression)"#, + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "base", + "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ( + "numeric_expression", + "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ])), + related_udfs: None, + } } } } @@ -164,6 +187,10 @@ impl ScalarUDFImpl for LogFunc { Ok(ColumnarValue::Array(arr)) } + fn documentation(&self) -> &Documentation { + &self.documentation + } + /// Simplify the `log` function by the relevant rules: /// 1. Log(a, 1) ===> 0 /// 2. Log(a, Power(a, b)) ===> b diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 20029ba005c49..9e74a86f1e52d 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -25,15 +25,18 @@ use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX; use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use indexmap::IndexMap; use std::any::Any; use std::sync::Arc; #[derive(Debug)] pub struct RegexpLikeFunc { signature: Signature, + documentation: Documentation, } impl Default for RegexpLikeFunc { fn default() -> Self { @@ -54,6 +57,46 @@ impl RegexpLikeFunc { ], Volatility::Immutable, ), + documentation: Documentation { + doc_section: DOC_SECTION_REGEX, + description: "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in 
a string, false otherwise.", + syntax_example: "regexp_like(str, regexp[, flags])", + sql_example: Some( + r#"```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) +"#), + arguments: Some(IndexMap::from([ + ( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." + ), + ( "regexp", + "Regular expression to test against the string expression. Can be a constant, column, or function." + ), + ("flags", + r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . 
to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*?"# + ) + ])), + related_udfs: None, + } } } } @@ -105,6 +148,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { result.map(ColumnarValue::Array) } } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } fn regexp_like_func(args: &[ArrayRef]) -> Result { match args[0].data_type() { diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 68ba3f5ff15f5..ca09ada0fed69 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -20,14 +20,17 @@ use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array}; use arrow::datatypes::DataType; use arrow::error::ArrowError; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use indexmap::IndexMap; use std::any::Any; use std::sync::Arc; #[derive(Debug)] pub struct AsciiFunc { signature: Signature, + documentation: Documentation, } impl Default for AsciiFunc { @@ -45,6 +48,19 @@ impl AsciiFunc { vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), + documentation: Documentation { + doc_section: DOC_SECTION_STRING, + description: "Returns the ASCII value of the first character in a string.", + syntax_example: "ascii(str)", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "str", + "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View." 
+ ) + ])), + related_udfs: Some(vec!["chr"]), + }, } } } @@ -71,6 +87,10 @@ impl ScalarUDFImpl for AsciiFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(ascii, vec![])(args) } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } fn calculate_ascii<'a, V>(array: V) -> Result diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index c1d6f327928f2..1867e7cfecae9 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -25,8 +25,12 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use indexmap::IndexMap; use std::any::Any; use std::fmt::Write; use std::sync::Arc; @@ -36,6 +40,7 @@ use DataType::{LargeUtf8, Utf8, Utf8View}; #[derive(Debug)] pub struct RPadFunc { signature: Signature, + documentation: Documentation, } impl Default for RPadFunc { @@ -65,6 +70,27 @@ impl RPadFunc { ], Volatility::Immutable, ), + documentation: Documentation { + doc_section: DOC_SECTION_STRING, + description: "Pads the right side of a string with another string to a specified string length.", + syntax_example: "rpad(str, n[, padding_str])", + sql_example: None, + arguments: Some(IndexMap::from([ + ( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." + ), + ( + "n", + "String length to pad to." + ), + ( + "padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. 
_Default is a space._" + ), + ])), + related_udfs: Some(vec!["lpad"]), + }, } } } @@ -113,6 +139,10 @@ impl ScalarUDFImpl for RPadFunc { } } } + + fn documentation(&self) -> &Documentation { + &self.documentation + } } pub fn rpad( diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh index 836ba6772eacd..585cb77839f98 100755 --- a/dev/update_config_docs.sh +++ b/dev/update_config_docs.sh @@ -24,7 +24,7 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SOURCE_DIR}/../" && pwd TARGET_FILE="docs/source/user-guide/configs.md" -PRINT_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" +PRINT_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" echo "Inserting header" cat <<'EOF' > "$TARGET_FILE" @@ -67,8 +67,8 @@ Environment variables are read during `SessionConfig` initialisation so they mus EOF -echo "Running CLI and inserting docs table" -$PRINT_DOCS_COMMAND >> "$TARGET_FILE" +echo "Running CLI and inserting config docs table" +$PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE" echo "Running prettier" npx prettier@2.3.2 --write "$TARGET_FILE" diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index c8f0ffbec701e..ababb001f5c5e 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -69,7 +69,7 @@ value ::: :::{note} -Since `&&` and `||` are existed as logical operators in Rust, but those are not overloadable and not works with expression API. +Since `&&` and `||` are logical operators in Rust and cannot be overloaded these are not available in the expression API. 
::: ## Bitwise Expressions @@ -151,7 +151,7 @@ but these operators always return a `bool` which makes them not work with the ex | trunc(x) | truncate toward zero | :::{note} -Unlike to some databases the math functions in Datafusion works the same way as Rust math functions, avoiding failing on corner cases e.g +Unlike some databases, the math functions in DataFusion work the same way as Rust math functions, avoiding failures on corner cases, e.g. ```sql select log(-1), log(0), sqrt(-1); diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs new file mode 100644 index 0000000000000..9f1661cfd6a68 --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::aggregate_doc_sections::doc_sections; +use datafusion_expr::AggregateUDF; +use itertools::Itertools; +use std::fmt::Write as _; +use std::sync::Arc; + +fn main() { + let functions = SessionStateDefaults::default_aggregate_functions(); + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections() { + // make sure there is a function that is in this doc section + if !functions + .iter() + .any(|f| f.documentation().doc_section == doc_section) + { + continue; + } + + // write out section header + let _ = writeln!(&mut docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(&mut docs, "{description}"); + } + + let filtered = functions + .clone() + .into_iter() + .filter(|f| f.documentation().doc_section == doc_section) + .collect_vec(); + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&filtered); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(&mut docs, "- [{name}](#{name})"); + }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = filtered + .iter() + .find(|f| f.name() == name || f.aliases().contains(&name)) + .unwrap(); + let documentation = f.documentation(); + + // if this name is an alias we need to display what it's an alias of + if f.aliases().contains(&name) { + let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); + continue; + } + + // otherwise display the documentation for the function + + // first, the name, description and syntax example + let _ = write!( + &mut docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + f.name(), + documentation.description, + documentation.syntax_example + ); + + // next, arguments + if let Some(args) = 
&documentation.arguments { + let _ = writeln!(&mut docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = documentation.sql_example { + let _ = writeln!( + &mut docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.aliases().is_empty() { + let _ = write!(&mut docs, "#### Aliases"); + + for alias in f.aliases() { + let _ = writeln!(&mut docs, "- {alias}"); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(&mut docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(&mut docs, "- [{related}](#{related})"); + } + } + } + } + + println!("{docs}"); +} + +fn get_names_and_aliases(functions: &[Arc]) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.aliases().is_empty() { + vec![f.name().to_string()] + } else { + let mut names = vec![f.name().to_string()]; + names.extend(f.aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs new file mode 100644 index 0000000000000..b96b42e15948b --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::scalar_doc_sections::doc_sections; +use datafusion_expr::ScalarUDF; +use itertools::Itertools; +use std::fmt::Write as _; +use std::sync::Arc; + +fn main() { + let functions = SessionStateDefaults::default_scalar_functions(); + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections() { + // make sure there is a function that is in this doc section + if !functions + .iter() + .any(|f| f.documentation().doc_section == doc_section) + { + continue; + } + + // write out section header + let _ = writeln!(&mut docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(&mut docs, "{description}"); + } + + let filtered = functions + .clone() + .into_iter() + .filter(|f| f.documentation().doc_section == doc_section) + .collect_vec(); + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&filtered); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(&mut docs, "- [{name}](#{name})"); + }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = filtered + .iter() + .find(|f| f.name() == name || f.aliases().contains(&name)) + .unwrap(); + let documentation = f.documentation(); + + // if this name is an alias we need to display what it's an alias of + if 
f.aliases().contains(&name) { + let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); + continue; + } + + // otherwise display the documentation for the function + + // first, the name, description and syntax example + let _ = write!( + &mut docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + f.name(), + documentation.description, + documentation.syntax_example + ); + + // next, arguments + if let Some(args) = &documentation.arguments { + let _ = writeln!(&mut docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = documentation.sql_example { + let _ = writeln!( + &mut docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.aliases().is_empty() { + let _ = write!(&mut docs, "#### Aliases"); + + for alias in f.aliases() { + let _ = writeln!(&mut docs, "- {alias}"); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(&mut docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(&mut docs, "- [{related}](#{related})"); + } + } + } + } + + println!("{docs}"); +} + +fn get_names_and_aliases(functions: &[Arc]) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.aliases().is_empty() { + vec![f.name().to_string()] + } else { + let mut names = vec![f.name().to_string()]; + names.extend(f.aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs new file mode 100644 index 0000000000000..272f423af2dcb --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) 
under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::window_doc_sections::doc_sections; +use datafusion_expr::WindowUDF; +use itertools::Itertools; +use std::fmt::Write as _; +use std::sync::Arc; + +fn main() { + let functions = SessionStateDefaults::default_window_functions(); + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections() { + // make sure there is a function that is in this doc section + if !functions + .iter() + .any(|f| f.documentation().doc_section == doc_section) + { + continue; + } + + // write out section header + let _ = writeln!(&mut docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(&mut docs, "{description}"); + } + + let filtered = functions + .clone() + .into_iter() + .filter(|f| f.documentation().doc_section == doc_section) + .collect_vec(); + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&filtered); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(&mut docs, "- [{name}](#{name})"); 
+ }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = filtered + .iter() + .find(|f| f.name() == name || f.aliases().contains(&name)) + .unwrap(); + let documentation = f.documentation(); + + // if this name is an alias we need to display what it's an alias of + if f.aliases().contains(&name) { + let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); + continue; + } + + // otherwise display the documentation for the function + + // first, the name, description and syntax example + let _ = write!( + &mut docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + f.name(), + documentation.description, + documentation.syntax_example + ); + + // next, arguments + if let Some(args) = &documentation.arguments { + let _ = writeln!(&mut docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = documentation.sql_example { + let _ = writeln!( + &mut docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.aliases().is_empty() { + let _ = write!(&mut docs, "#### Aliases"); + + for alias in f.aliases() { + let _ = writeln!(&mut docs, "- {alias}"); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(&mut docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(&mut docs, "- [{related}](#{related})"); + } + } + } + } + + println!("{docs}"); +} + +fn get_names_and_aliases(functions: &[Arc]) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.aliases().is_empty() { + vec![f.name().to_string()] + } else { + let mut names = vec![f.name().to_string()]; + names.extend(f.aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs 
b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs new file mode 100644 index 0000000000000..32c03bda385f1 --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs @@ -0,0 +1,64 @@ +use indexmap::IndexMap; + +/// Documentation for use by [`crate::ScalarUDFImpl`], +/// [`crate::AggregateUDFImpl`] and [`crate::WindowUDFImpl`] functions +/// that will be used to generate public documentation. +/// +/// The name of the udf will be pulled from the [`crate::ScalarUDFImpl::name`], +/// [`crate::AggregateUDFImpl::name`] or [`crate::WindowUDFImpl::name`] function +/// as appropriate. +/// +/// All strings in the documentation are required to be +/// in [markdown format](https://www.markdownguide.org/basic-syntax/). +/// +/// Currently, documentation only supports a single language +/// thus all text should be in English. +#[derive(Debug, Clone)] +pub struct Documentation { + /// the section in the documentation where the UDF will be documented + pub doc_section: DocSection, + /// the description for the UDF + pub description: &'static str, + pub syntax_example: &'static str, + /// a sql example for the UDF, usually in the form of a sql prompt + /// query and output. It is strongly recommended to provide an + /// example for anything but the most basic UDF's + pub sql_example: Option<&'static str>, + /// arguments for the UDF which will be displayed in insertion + /// order. Key is the argument name, value is a description for + /// the argument + pub arguments: Option>, + /// related functions if any. Values should match the related + /// udf's name exactly. Related udf's must be of the same + /// UDF type (scalar, aggregate or window) for proper linking to + /// occur + pub related_udfs: Option>, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct DocSection { + /// true to include this doc section in the public + /// documentation, false otherwise + pub include: bool, + /// a display label for the doc section. 
For example: "Math Expressions" + pub label: &'static str, + /// an optional description for the doc section + pub description: Option<&'static str>, +} + +pub const DOCUMENTATION_NONE: Documentation = Documentation { + doc_section: DOC_SECTION_NONE, + description: "", + syntax_example: "", + sql_example: None, + arguments: None, + related_udfs: None, +}; + +/// A doc section that indicated the UDF should not +/// be publicly documented +pub const DOC_SECTION_NONE: DocSection = DocSection { + include: false, + label: "", + description: None, +}; diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh new file mode 100644 index 0000000000000..9ad8074927d27 --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + +TARGET_FILE="docs/source/user-guide/sql/aggregate_functions_new.md" +PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_aggregate_functions_docs" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Aggregate Functions + +Aggregate functions operate on a set of values to compute a single result. +EOF + +echo "Running CLI and inserting aggregate function docs table" +$PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh new file mode 100644 index 0000000000000..5ff5cebad4f1b --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + +TARGET_FILE="docs/source/user-guide/sql/scalar_functions_new.md" +PRINT_SCALAR_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_scalar_functions_docs" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Scalar Functions +EOF + +echo "Running CLI and inserting scalar function docs table" +$PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh new file mode 100644 index 0000000000000..a77fd2fd8cccc --- /dev/null +++ b/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh @@ -0,0 +1,188 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + +TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" +PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_window_functions_docs" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + + +# Window Functions + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. 
The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. 
Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. 
For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is a non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +EOF + +echo "Running CLI and inserting window function docs table" +$PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" From 148efb7b5c78282923a0eee1c0d496b271363efd Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 28 Sep 2024 17:21:48 -0400 Subject: [PATCH 02/13] Add missing license header. --- datafusion/expr/src/udf_docs.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index 32c03bda385f1..2deb48ffdd433 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use indexmap::IndexMap; /// Documentation for use by [`crate::ScalarUDFImpl`], From 065b548774213f6a9f9b744cee41237396258ad2 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 28 Sep 2024 17:23:39 -0400 Subject: [PATCH 03/13] Fixed examples. --- datafusion/expr/src/udaf.rs | 2 +- datafusion/expr/src/udf.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 782e62618bf70..7aeeff3799fae 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -328,7 +328,7 @@ where /// Field::new("ordering", DataType::UInt32, true) /// ]) /// } -/// fn documentation(&self) -> Documentation { +/// fn documentation(&self) -> &Documentation { /// &self.documentation /// } /// } diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index daa4b33cf3b37..b059b04197601 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -348,7 +348,7 @@ where /// } /// // The actual implementation would add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } -/// fn documentation(&self) -> Documentation { +/// fn documentation(&self) -> &Documentation { /// &self.documentation /// } /// } From 8c2569711d638ddcb229e0b050063f40c8dde44e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 28 Sep 2024 19:37:50 -0400 Subject: [PATCH 04/13] 
Fixing a really weird RustRover/wsl ... something. No clue what happened there. --- .../src/bin/print_aggregate_functions_docs.rs | 0 .../src/bin/print_scalar_functions_docs.rs | 0 .../src/bin/print_window_functions_docs.rs | 0 .../dev => dev}/update_aggregate_docs.sh | 0 .../dev => dev}/update_scalar_docs.sh | 0 .../dev => dev}/update_window_docs.sh | 0 .../datafusion/expr/src/udf_docs.rs | 64 ------------------- 7 files changed, 64 deletions(-) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion => datafusion}/core/src/bin/print_aggregate_functions_docs.rs (100%) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion => datafusion}/core/src/bin/print_scalar_functions_docs.rs (100%) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion => datafusion}/core/src/bin/print_window_functions_docs.rs (100%) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/dev => dev}/update_aggregate_docs.sh (100%) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/dev => dev}/update_scalar_docs.sh (100%) rename {wsl.localhost/Ubuntu/opt/dev/datafusion/dev => dev}/update_window_docs.sh (100%) delete mode 100644 wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs b/datafusion/core/src/bin/print_aggregate_functions_docs.rs similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_aggregate_functions_docs.rs rename to datafusion/core/src/bin/print_aggregate_functions_docs.rs diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs b/datafusion/core/src/bin/print_scalar_functions_docs.rs similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_scalar_functions_docs.rs rename to datafusion/core/src/bin/print_scalar_functions_docs.rs diff --git 
a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs b/datafusion/core/src/bin/print_window_functions_docs.rs similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/core/src/bin/print_window_functions_docs.rs rename to datafusion/core/src/bin/print_window_functions_docs.rs diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh b/dev/update_aggregate_docs.sh similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_aggregate_docs.sh rename to dev/update_aggregate_docs.sh diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh b/dev/update_scalar_docs.sh similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_scalar_docs.sh rename to dev/update_scalar_docs.sh diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh b/dev/update_window_docs.sh similarity index 100% rename from wsl.localhost/Ubuntu/opt/dev/datafusion/dev/update_window_docs.sh rename to dev/update_window_docs.sh diff --git a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs b/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs deleted file mode 100644 index 32c03bda385f1..0000000000000 --- a/wsl.localhost/Ubuntu/opt/dev/datafusion/datafusion/expr/src/udf_docs.rs +++ /dev/null @@ -1,64 +0,0 @@ -use indexmap::IndexMap; - -/// Documentation for use by [`crate::ScalarUDFImpl`], -/// [`crate::AggregateUDFImpl`] and [`crate::WindowUDFImpl`] functions -/// that will be used to generate public documentation. -/// -/// The name of the udf will be pulled from the [`crate::ScalarUDFImpl::name`], -/// [`crate::AggregateUDFImpl::name`] or [`crate::WindowUDFImpl::name`] function -/// as appropriate. -/// -/// All strings in the documentation are required to be -/// in [markdown format](https://www.markdownguide.org/basic-syntax/). 
-/// -/// Currently, documentation only supports a single language -/// thus all text should be in English. -#[derive(Debug, Clone)] -pub struct Documentation { - /// the section in the documentation where the UDF will be documented - pub doc_section: DocSection, - /// the description for the UDF - pub description: &'static str, - pub syntax_example: &'static str, - /// a sql example for the UDF, usually in the form of a sql prompt - /// query and output. It is strongly recommended to provide an - /// example for anything but the most basic UDF's - pub sql_example: Option<&'static str>, - /// arguments for the UDF which will be displayed in insertion - /// order. Key is the argument name, value is a description for - /// the argument - pub arguments: Option>, - /// related functions if any. Values should match the related - /// udf's name exactly. Related udf's must be of the same - /// UDF type (scalar, aggregate or window) for proper linking to - /// occur - pub related_udfs: Option>, -} - -#[derive(Debug, Clone, PartialEq)] -pub struct DocSection { - /// true to include this doc section in the public - /// documentation, false otherwise - pub include: bool, - /// a display label for the doc section. 
For example: "Math Expressions" - pub label: &'static str, - /// an optional description for the doc section - pub description: Option<&'static str>, -} - -pub const DOCUMENTATION_NONE: Documentation = Documentation { - doc_section: DOC_SECTION_NONE, - description: "", - syntax_example: "", - sql_example: None, - arguments: None, - related_udfs: None, -}; - -/// A doc section that indicated the UDF should not -/// be publicly documented -pub const DOC_SECTION_NONE: DocSection = DocSection { - include: false, - label: "", - description: None, -}; From f57db3f2f4cb0bb3fb2f4cbb3f6cef692dbe85f5 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sat, 28 Sep 2024 21:02:09 -0400 Subject: [PATCH 05/13] permission change --- dev/update_aggregate_docs.sh | 0 dev/update_scalar_docs.sh | 0 dev/update_window_docs.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 dev/update_aggregate_docs.sh mode change 100644 => 100755 dev/update_scalar_docs.sh mode change 100644 => 100755 dev/update_window_docs.sh diff --git a/dev/update_aggregate_docs.sh b/dev/update_aggregate_docs.sh old mode 100644 new mode 100755 diff --git a/dev/update_scalar_docs.sh b/dev/update_scalar_docs.sh old mode 100644 new mode 100755 diff --git a/dev/update_window_docs.sh b/dev/update_window_docs.sh old mode 100644 new mode 100755 From 0c4bde3c37300e14bdb9c6ecdb62b1389cbc099f Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 29 Sep 2024 09:23:39 -0400 Subject: [PATCH 06/13] Cargo fmt update. 
--- datafusion/expr/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index acc7e0ffb1ed7..60696e217e9bc 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -94,7 +94,7 @@ pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; pub use udaf::{aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF}; pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; pub use udf_docs::{DocSection, Documentation, DOCUMENTATION_NONE, DOC_SECTION_NONE}; -pub use udwf::{ReversedUDWF, window_doc_sections, WindowUDF, WindowUDFImpl}; +pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; #[cfg(test)] From bcf5a764e59446732f8a2b6e258fdb717d8270a8 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 29 Sep 2024 11:54:23 -0400 Subject: [PATCH 07/13] Refactored Documentation to allow it to be used in a const. 
--- datafusion-cli/Cargo.lock | 2 - .../src/bin/print_aggregate_functions_docs.rs | 4 +- .../src/bin/print_scalar_functions_docs.rs | 4 +- .../src/bin/print_window_functions_docs.rs | 6 +- datafusion/expr/Cargo.toml | 1 - datafusion/expr/src/udaf.rs | 21 ++--- datafusion/expr/src/udf.rs | 23 +++-- datafusion/expr/src/udf_docs.rs | 12 +-- datafusion/expr/src/udwf.rs | 21 ++--- .../functions-aggregate/src/bit_and_or_xor.rs | 85 +++++++++-------- datafusion/functions-window/src/row_number.rs | 21 ++--- datafusion/functions/Cargo.toml | 1 - datafusion/functions/src/core/coalesce.rs | 31 +++---- datafusion/functions/src/crypto/sha224.rs | 33 +++---- datafusion/functions/src/datetime/to_date.rs | 91 +++++++++---------- datafusion/functions/src/encoding/inner.rs | 53 ++++++----- datafusion/functions/src/math/log.rs | 41 ++++----- datafusion/functions/src/regex/regexplike.rs | 72 +++++++-------- datafusion/functions/src/string/ascii.rs | 31 +++---- datafusion/functions/src/unicode/rpad.rs | 47 +++++----- 20 files changed, 295 insertions(+), 305 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 179d410e185ee..6d4eeee97675b 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1345,7 +1345,6 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap", "paste", "serde_json", "sqlparser", @@ -1377,7 +1376,6 @@ dependencies = [ "datafusion-expr", "hashbrown", "hex", - "indexmap", "itertools", "log", "md-5", diff --git a/datafusion/core/src/bin/print_aggregate_functions_docs.rs b/datafusion/core/src/bin/print_aggregate_functions_docs.rs index 9f1661cfd6a68..83fe99aa33682 100644 --- a/datafusion/core/src/bin/print_aggregate_functions_docs.rs +++ b/datafusion/core/src/bin/print_aggregate_functions_docs.rs @@ -92,7 +92,7 @@ fn main() { ); // next, arguments - if let Some(args) = &documentation.arguments { + if let Some(args) = 
documentation.arguments { let _ = writeln!(&mut docs, "#### Arguments\n"); for (arg_name, arg_desc) in args { let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); @@ -122,7 +122,7 @@ fn main() { } // finally, any related udfs - if let Some(related_udfs) = &documentation.related_udfs { + if let Some(related_udfs) = documentation.related_udfs { let _ = writeln!(&mut docs, "\n**Related functions**:"); for related in related_udfs { diff --git a/datafusion/core/src/bin/print_scalar_functions_docs.rs b/datafusion/core/src/bin/print_scalar_functions_docs.rs index b96b42e15948b..951500b5f02b6 100644 --- a/datafusion/core/src/bin/print_scalar_functions_docs.rs +++ b/datafusion/core/src/bin/print_scalar_functions_docs.rs @@ -92,7 +92,7 @@ fn main() { ); // next, arguments - if let Some(args) = &documentation.arguments { + if let Some(args) = documentation.arguments { let _ = writeln!(&mut docs, "#### Arguments\n"); for (arg_name, arg_desc) in args { let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); @@ -122,7 +122,7 @@ fn main() { } // finally, any related udfs - if let Some(related_udfs) = &documentation.related_udfs { + if let Some(related_udfs) = documentation.related_udfs { let _ = writeln!(&mut docs, "\n**Related functions**:"); for related in related_udfs { diff --git a/datafusion/core/src/bin/print_window_functions_docs.rs b/datafusion/core/src/bin/print_window_functions_docs.rs index 272f423af2dcb..8a2f793393f51 100644 --- a/datafusion/core/src/bin/print_window_functions_docs.rs +++ b/datafusion/core/src/bin/print_window_functions_docs.rs @@ -92,7 +92,7 @@ fn main() { ); // next, arguments - if let Some(args) = &documentation.arguments { + if let Some(args) = documentation.arguments { let _ = writeln!(&mut docs, "#### Arguments\n"); for (arg_name, arg_desc) in args { let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); @@ -122,10 +122,10 @@ fn main() { } // finally, any related udfs - if let Some(related_udfs) = &documentation.related_udfs { 
+ if let Some(related_udfs) = documentation.related_udfs { let _ = writeln!(&mut docs, "\n**Related functions**:"); - for related in related_udfs { + for &related in related_udfs { let _ = writeln!(&mut docs, "- [{related}](#{related})"); } } diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index d7dc1afe4d505..55387fea22eeb 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -48,7 +48,6 @@ datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } -indexmap = { workspace = true } paste = "^1.0" serde_json = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 7aeeff3799fae..24fa89e45cb1e 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -285,30 +285,29 @@ where /// # use datafusion_expr::window_doc_sections::DOC_SECTION_AGGREGATE; /// # use arrow::datatypes::Schema; /// # use arrow::datatypes::Field; -/// # use indexmap::IndexMap; /// /// #[derive(Debug, Clone)] /// struct GeoMeanUdf { /// signature: Signature, -/// documentation: Documentation, /// } /// /// impl GeoMeanUdf { /// fn new() -> Self { /// Self { /// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable), -/// documentation: Documentation { -/// doc_section: DOC_SECTION_AGGREGATE, -/// description: "calculates a geometric mean", -/// syntax_example: "geo_mean(2.0)", -/// sql_example: None, -/// arguments: Some(IndexMap::from([("arg_1", "The Float64 number for the geometric mean")])), -/// related_udfs: None, -/// } /// } /// } /// } /// +/// const DOCUMENTATION: Documentation = Documentation { +/// doc_section: DOC_SECTION_AGGREGATE, +/// description: "calculates a geometric mean", +/// syntax_example: "geo_mean(2.0)", +/// sql_example: None, +/// arguments: 
Some(&[("arg_1", "The Float64 number for the geometric mean")]), +/// related_udfs: None, +/// }; +/// /// /// Implement the AggregateUDFImpl trait for GeoMeanUdf /// impl AggregateUDFImpl for GeoMeanUdf { /// fn as_any(&self) -> &dyn Any { self } @@ -329,7 +328,7 @@ where /// ]) /// } /// fn documentation(&self) -> &Documentation { -/// &self.documentation +/// &DOCUMENTATION /// } /// } /// diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index b059b04197601..36cc8f64d8a98 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -307,7 +307,6 @@ where /// ``` /// # use std::any::Any; /// # use arrow::datatypes::DataType; -/// # use indexmap::IndexMap; /// # use datafusion_common::{DataFusionError, plan_err, Result}; /// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility}; /// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF}; @@ -316,25 +315,25 @@ where /// #[derive(Debug)] /// struct AddOne { /// signature: Signature, -/// documentation: Documentation, /// } /// /// impl AddOne { /// fn new() -> Self { /// Self { /// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), -/// documentation: Documentation { -/// doc_section: DOC_SECTION_MATH, -/// description: "Add one to an int32", -/// syntax_example: "add_one(2)", -/// sql_example: None, -/// arguments: Some(IndexMap::from([("arg_1", "The int32 number to add one to")])), -/// related_udfs: None, -/// } /// } /// } /// } -/// +/// +/// const DOCUMENTATION: Documentation = Documentation { +/// doc_section: DOC_SECTION_MATH, +/// description: "Add one to an int32", +/// syntax_example: "add_one(2)", +/// sql_example: None, +/// arguments: Some(&[("arg_1", "The int32 number to add one to")]), +/// related_udfs: None, +/// }; +/// /// /// Implement the ScalarUDFImpl trait for AddOne /// impl ScalarUDFImpl for AddOne { /// fn as_any(&self) -> &dyn Any { self } @@ -349,7 +348,7 @@ where /// // The actual implementation would 
add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } /// fn documentation(&self) -> &Documentation { -/// &self.documentation +/// &DOCUMENTATION /// } /// } /// diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index 2deb48ffdd433..faf6492c11eb0 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use indexmap::IndexMap; - /// Documentation for use by [`crate::ScalarUDFImpl`], /// [`crate::AggregateUDFImpl`] and [`crate::WindowUDFImpl`] functions /// that will be used to generate public documentation. @@ -41,15 +39,15 @@ pub struct Documentation { /// query and output. It is strongly recommended to provide an /// example for anything but the most basic UDF's pub sql_example: Option<&'static str>, - /// arguments for the UDF which will be displayed in insertion - /// order. Key is the argument name, value is a description for - /// the argument - pub arguments: Option>, + /// arguments for the UDF which will be displayed in array order. + /// Left member of a pair is the argument name, right is a + /// description for the argument + pub arguments: Option<&'static [(&'static str, &'static str)]>, /// related functions if any. Values should match the related /// udf's name exactly. 
Related udf's must be of the same /// UDF type (scalar, aggregate or window) for proper linking to /// occur - pub related_udfs: Option>, + pub related_udfs: Option<&'static [&'static str]>, } #[derive(Debug, Clone, PartialEq)] diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 0471c0788d32f..7f9c6a5173c9c 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -217,30 +217,29 @@ where /// # use datafusion_expr::{WindowUDFImpl, WindowUDF}; /// # use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; /// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; -/// # use indexmap::IndexMap; /// /// #[derive(Debug, Clone)] /// struct SmoothIt { /// signature: Signature, -/// documentation: Documentation, /// } /// /// impl SmoothIt { /// fn new() -> Self { /// Self { /// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), -/// documentation: Documentation { -/// doc_section: DOC_SECTION_ANALYTICAL, -/// description: "smooths the windows", -/// syntax_example: "smooth_it(2)", -/// sql_example: None, -/// arguments: Some(IndexMap::from([("arg_1", "The int32 number to smooth by")])), -/// related_udfs: None, -/// } /// } /// } /// } /// +/// const DOCUMENTATION: Documentation = Documentation { +/// doc_section: DOC_SECTION_ANALYTICAL, +/// description: "smooths the windows", +/// syntax_example: "smooth_it(2)", +/// sql_example: None, +/// arguments: Some(&[("arg_1", "The int32 number to smooth by")]), +/// related_udfs: None, +/// }; + /// /// Implement the WindowUDFImpl trait for SmoothIt /// impl WindowUDFImpl for SmoothIt { /// fn as_any(&self) -> &dyn Any { self } @@ -256,7 +255,7 @@ where /// } /// } /// fn documentation(&self) -> &Documentation { -/// &self.documentation +/// &DOCUMENTATION /// } /// } /// diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index 4307a6d68f5d6..48d2b9646cc74 100644 
--- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -17,7 +17,6 @@ //! Defines `BitAnd`, `BitOr`, `BitXor` and `BitXor DISTINCT` aggregate accumulators -use indexmap::IndexMap; use std::any::Any; use std::collections::HashSet; use std::fmt::{Display, Formatter}; @@ -134,59 +133,65 @@ macro_rules! make_bitwise_udaf_expr_and_func { }; } +const BIT_AND_DOC: Documentation = Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise AND of all non-null input values.", + syntax_example: "bit_and(expression)", + sql_example: None, + arguments: Some(&[ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ]), + related_udfs: None, +}; + +const BIT_OR_DOC: Documentation = Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise OR of all non-null input values.", + syntax_example: "bit_or(expression)", + sql_example: None, + arguments: Some(&[ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ]), + related_udfs: None, +}; + +const BIT_XOR_DOC: Documentation = Documentation { + doc_section: DOC_SECTION_GENERAL, + description: "Computes the bitwise exclusive OR of all non-null input values.", + syntax_example: "bit_xor(expression)", + sql_example: None, + arguments: Some(&[ + ( + "expression", + "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", + ), + ]), + related_udfs: None, +}; + make_bitwise_udaf_expr_and_func!( bit_and, bit_and_udaf, BitwiseOperationType::And, - Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise AND of all non-null input values.", - syntax_example: "bit_and(expression)", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ])), - related_udfs: None, - } + BIT_AND_DOC ); make_bitwise_udaf_expr_and_func!( bit_or, bit_or_udaf, BitwiseOperationType::Or, - Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise OR of all non-null input values.", - syntax_example: "bit_or(expression)", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ])), - related_udfs: None, - } + BIT_OR_DOC ); make_bitwise_udaf_expr_and_func!( bit_xor, bit_xor_udaf, BitwiseOperationType::Xor, - Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise exclusive OR of all non-null input values.", - syntax_example: "bit_xor(expression)", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ])), - related_udfs: None, - } + BIT_XOR_DOC ); /// The different types of bitwise operations that can be performed. 
diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index a50939639dc2f..46e239203ff08 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -60,7 +60,6 @@ pub fn row_number_udwf() -> std::sync::Arc { #[derive(Debug)] pub struct RowNumber { signature: Signature, - documentation: Documentation, } impl RowNumber { @@ -68,15 +67,6 @@ impl RowNumber { pub fn new() -> Self { Self { signature: Signature::any(0, Volatility::Immutable), - documentation: Documentation { - doc_section: DOC_SECTION_RANKING, - description: - "Number of the current row within its partition, counting from 1.", - syntax_example: "row_number()", - sql_example: None, - arguments: None, - related_udfs: None, - }, } } } @@ -87,6 +77,15 @@ impl Default for RowNumber { } } +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_RANKING, + description: "Number of the current row within its partition, counting from 1.", + syntax_example: "row_number()", + sql_example: None, + arguments: None, + related_udfs: None, +}; + impl WindowUDFImpl for RowNumber { fn as_any(&self) -> &dyn Any { self @@ -116,7 +115,7 @@ impl WindowUDFImpl for RowNumber { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 0b21be6821b01..ff1b926a9b822 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -76,7 +76,6 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } hashbrown = { workspace = true, optional = true } hex = { version = "0.4", optional = true } -indexmap = { workspace = true } itertools = { workspace = true } log = { workspace = true } md-5 = { version = "^0.10.0", optional = true } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 
d2d4b04872e49..c0859bd4ae100 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -26,13 +26,11 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::type_coercion::binary::type_union_resolution; use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use indexmap::IndexMap; use itertools::Itertools; #[derive(Debug)] pub struct CoalesceFunc { signature: Signature, - documentation: Documentation, } impl Default for CoalesceFunc { @@ -45,23 +43,24 @@ impl CoalesceFunc { pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), - documentation: Documentation { - doc_section: DOC_SECTION_CONDITIONAL, - description: "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.", - syntax_example: "coalesce(expression1[, ..., expression_n])", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "expression1, expression_n", - "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." - ), - ])), - related_udfs: None, - }, } } } +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_CONDITIONAL, + description: "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.", + syntax_example: "coalesce(expression1[, ..., expression_n])", + sql_example: None, + arguments: Some(&[ + ( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. 
Pass as many expression arguments as necessary." + ), + ]), + related_udfs: None, +}; + impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { self @@ -158,7 +157,7 @@ impl ScalarUDFImpl for CoalesceFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 813f51aef3356..6104f3a98d341 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -23,14 +23,13 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_HASHING; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; -use indexmap::IndexMap; use std::any::Any; #[derive(Debug)] pub struct SHA224Func { signature: Signature, - documentation: Documentation, } + impl Default for SHA224Func { fn default() -> Self { Self::new() @@ -46,22 +45,24 @@ impl SHA224Func { vec![Utf8, LargeUtf8, Binary, LargeBinary], Volatility::Immutable, ), - documentation: Documentation { - doc_section: DOC_SECTION_HASHING, - description: "Computes the SHA-224 hash of a binary string.", - syntax_example: "sha224(expression)", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "expression", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ])), - related_udfs: None, - } } } } + +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_HASHING, + description: "Computes the SHA-224 hash of a binary string.", + syntax_example: "sha224(expression)", + sql_example: None, + arguments: Some(&[ + ( + "expression", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." 
+ ), + ]), + related_udfs: None, +}; + impl ScalarUDFImpl for SHA224Func { fn as_any(&self) -> &dyn Any { self @@ -84,6 +85,6 @@ impl ScalarUDFImpl for SHA224Func { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 5b72f7d30705d..f0b53b965dc1e 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -28,12 +28,10 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; -use indexmap::IndexMap; #[derive(Debug)] pub struct ToDateFunc { signature: Signature, - documentation: Documentation, } impl Default for ToDateFunc { @@ -46,49 +44,6 @@ impl ToDateFunc { pub fn new() -> Self { Self { signature: Signature::variadic_any(Volatility::Immutable), - documentation: Documentation { - doc_section: DOC_SECTION_DATETIME, - description: r#"Converts a value to a date (`YYYY-MM-DD`). -Supports strings, integer and double types as input. -Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. -Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). -Returns the corresponding date. - -Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. 
-"#, - syntax_example: "to_date('2017-05-31', '%Y-%m-%d')", - sql_example: Some( - r#"```sql -> select to_date('2023-01-31'); -+-----------------------------+ -| to_date(Utf8("2023-01-31")) | -+-----------------------------+ -| 2023-01-31 | -+-----------------------------+ -> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); -+---------------------------------------------------------------+ -| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | -+---------------------------------------------------------------+ -| 2023-01-31 | -+---------------------------------------------------------------+ -``` - -Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) -"#), - arguments: Some(IndexMap::from([ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." - ), - ( - "format_n", - "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order - they appear with the first successful one being returned. If none of the formats successfully parse the expression - an error will be returned.", - ) - ])), - related_udfs: None, - } } } @@ -124,6 +79,50 @@ Additional examples can be found [here](https://github.com/apache/datafusion/blo } } +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_DATETIME, + description: r#"Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. 
+ +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. +"#, + syntax_example: "to_date('2017-05-31', '%Y-%m-%d')", + sql_example: Some( + r#"```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +"#), + arguments: Some(&[ + ( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ( + "format_n", + "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. 
If none of the formats successfully parse the expression + an error will be returned.", + ) + ]), + related_udfs: None, +}; + impl ScalarUDFImpl for ToDateFunc { fn as_any(&self) -> &dyn Any { self @@ -166,7 +165,7 @@ impl ScalarUDFImpl for ToDateFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index ba66f554d219c..f41a10f6ace7f 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -34,13 +34,11 @@ use std::{fmt, str::FromStr}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use indexmap::IndexMap; use std::any::Any; #[derive(Debug)] pub struct EncodeFunc { signature: Signature, - documentation: Documentation, } impl Default for EncodeFunc { @@ -53,21 +51,22 @@ impl EncodeFunc { pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), - documentation: Documentation { - doc_section: DOC_SECTION_BINARY_STRING, - description: "Encode binary data into a textual representation.", - syntax_example: "encode(expression, format)", - sql_example: None, - arguments: Some(IndexMap::from([ - ("expression", "Expression containing string or binary data"), - ("format", "Supported formats are: `base64`, `hex`"), - ])), - related_udfs: Some(vec!["decode"]), - }, } } } +const ENCODE_DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_BINARY_STRING, + description: "Encode binary data into a textual representation.", + syntax_example: "encode(expression, format)", + sql_example: None, + arguments: Some(&[ + ("expression", "Expression containing string or binary data"), + ("format", "Supported formats are: `base64`, `hex`"), + ]), + related_udfs: Some(&["decode"]), +}; + impl ScalarUDFImpl for EncodeFunc { fn as_any(&self) -> &dyn Any { self @@ -116,14 
+115,13 @@ impl ScalarUDFImpl for EncodeFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &ENCODE_DOCUMENTATION } } #[derive(Debug)] pub struct DecodeFunc { signature: Signature, - documentation: Documentation, } impl Default for DecodeFunc { @@ -136,21 +134,22 @@ impl DecodeFunc { pub fn new() -> Self { Self { signature: Signature::user_defined(Volatility::Immutable), - documentation: Documentation { - doc_section: DOC_SECTION_BINARY_STRING, - description: "Decode binary data from textual representation in string.", - syntax_example: "decode(expression, format)", - sql_example: None, - arguments: Some(IndexMap::from([ - ("expression", "Expression containing encoded string data"), - ("format", "Same arguments as [encode](#encode)"), - ])), - related_udfs: Some(vec!["encode"]), - }, } } } +const DECODE_DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_BINARY_STRING, + description: "Decode binary data from textual representation in string.", + syntax_example: "decode(expression, format)", + sql_example: None, + arguments: Some(&[ + ("expression", "Expression containing encoded string data"), + ("format", "Same arguments as [encode](#encode)"), + ]), + related_udfs: Some(&["encode"]), +}; + impl ScalarUDFImpl for DecodeFunc { fn as_any(&self) -> &dyn Any { self @@ -199,7 +198,7 @@ impl ScalarUDFImpl for DecodeFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DECODE_DOCUMENTATION } } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index 66869e803886f..5925b9a47bf26 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -36,12 +36,10 @@ use datafusion_expr::{ lit, ColumnarValue, Documentation, Expr, ScalarUDF, TypeSignature::*, }; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use indexmap::IndexMap; #[derive(Debug)] pub struct LogFunc { signature: Signature, - documentation: Documentation, } impl 
Default for LogFunc { @@ -50,6 +48,25 @@ impl Default for LogFunc { } } +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_MATH, + description: "Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.", + syntax_example: r#"log(base, numeric_expression) +log(numeric_expression)"#, + sql_example: None, + arguments: Some(&[ + ( + "base", + "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ( + "numeric_expression", + "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + ]), + related_udfs: None, +}; + impl LogFunc { pub fn new() -> Self { use DataType::*; @@ -63,24 +80,6 @@ impl LogFunc { ], Volatility::Immutable, ), - documentation: Documentation { - doc_section: DOC_SECTION_MATH, - description: "Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.", - syntax_example: r#"log(base, numeric_expression) -log(numeric_expression)"#, - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "base", - "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." - ), - ( - "numeric_expression", - "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." 
- ), - ])), - related_udfs: None, - } } } } @@ -188,7 +187,7 @@ impl ScalarUDFImpl for LogFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } /// Simplify the `log` function by the relevant rules: diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 9e74a86f1e52d..41349e6e80dfb 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -29,40 +29,26 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use indexmap::IndexMap; use std::any::Any; use std::sync::Arc; #[derive(Debug)] pub struct RegexpLikeFunc { signature: Signature, - documentation: Documentation, } + impl Default for RegexpLikeFunc { fn default() -> Self { Self::new() } } -impl RegexpLikeFunc { - pub fn new() -> Self { - use DataType::*; - Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), - ], - Volatility::Immutable, - ), - documentation: Documentation { - doc_section: DOC_SECTION_REGEX, - description: "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.", - syntax_example: "regexp_like(str, regexp[, flags])", - sql_example: Some( - r#"```sql +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_REGEX, + description: "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.", + syntax_example: "regexp_like(str, regexp[, flags])", + sql_example: Some( + r#"```sql select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); +--------------------------------------------------------+ | 
regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | @@ -78,25 +64,39 @@ SELECT regexp_like('aBc', '(b|d)', 'i'); ``` Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) "#), - arguments: Some(IndexMap::from([ - ( - "str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ( "regexp", - "Regular expression to test against the string expression. Can be a constant, column, or function." - ), - ("flags", - r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + arguments: Some(&[ + ( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." + ), + ( "regexp", + "Regular expression to test against the string expression. Can be a constant, column, or function." + ), + ("flags", + r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: - **i**: case-insensitive: letters match both upper and lower case - **m**: multi-line mode: ^ and $ match begin/end of line - **s**: allow . 
to match \n - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used - **U**: swap the meaning of x* and x*?"# - ) - ])), - related_udfs: None, - } + ) + ]), + related_udfs: None, +}; + +impl RegexpLikeFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8]), + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8, Utf8]), + ], + Volatility::Immutable, + ), } } } @@ -150,7 +150,7 @@ impl ScalarUDFImpl for RegexpLikeFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } fn regexp_like_func(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index ca09ada0fed69..3e0b321804720 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -23,14 +23,26 @@ use datafusion_common::{internal_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use indexmap::IndexMap; use std::any::Any; use std::sync::Arc; +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_STRING, + description: "Returns the ASCII value of the first character in a string.", + syntax_example: "ascii(str)", + sql_example: None, + arguments: Some(&[ + ( + "str", + "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View." 
+ ) + ]), + related_udfs: Some(&["chr"]), +}; + #[derive(Debug)] pub struct AsciiFunc { signature: Signature, - documentation: Documentation, } impl Default for AsciiFunc { @@ -48,19 +60,6 @@ impl AsciiFunc { vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), - documentation: Documentation { - doc_section: DOC_SECTION_STRING, - description: "Returns the ASCII value of the first character in a string.", - syntax_example: "ascii(str)", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "str", - "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View." - ) - ])), - related_udfs: Some(vec!["chr"]), - }, } } } @@ -89,7 +88,7 @@ impl ScalarUDFImpl for AsciiFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 1867e7cfecae9..e72490e7afb72 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -30,7 +30,6 @@ use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; -use indexmap::IndexMap; use std::any::Any; use std::fmt::Write; use std::sync::Arc; @@ -40,7 +39,6 @@ use DataType::{LargeUtf8, Utf8, Utf8View}; #[derive(Debug)] pub struct RPadFunc { signature: Signature, - documentation: Documentation, } impl Default for RPadFunc { @@ -49,6 +47,28 @@ impl Default for RPadFunc { } } +const DOCUMENTATION: Documentation = Documentation { + doc_section: DOC_SECTION_STRING, + description: "Pads the right side of a string with another string to a specified string length.", + syntax_example: "rpad(str, n[, padding_str])", + sql_example: None, + arguments: Some(&[ + ( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." 
+ ), + ( + "n", + "String length to pad to." + ), + ( + "padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._" + ), + ]), + related_udfs: Some(&["lpad"]), +}; + impl RPadFunc { pub fn new() -> Self { use DataType::*; @@ -70,27 +90,6 @@ impl RPadFunc { ], Volatility::Immutable, ), - documentation: Documentation { - doc_section: DOC_SECTION_STRING, - description: "Pads the right side of a string with another string to a specified string length.", - syntax_example: "rpad(str, n[, padding_str])", - sql_example: None, - arguments: Some(IndexMap::from([ - ( - "str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ( - "n", - "String length to pad to." - ), - ( - "padding_str", - "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._" - ), - ])), - related_udfs: Some(vec!["lpad"]), - }, } } } @@ -141,7 +140,7 @@ impl ScalarUDFImpl for RPadFunc { } fn documentation(&self) -> &Documentation { - &self.documentation + &DOCUMENTATION } } From a236784e23b2ee406130160bca8d13e358beeb89 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 29 Sep 2024 19:53:41 -0400 Subject: [PATCH 08/13] Add documentation for syntax_example --- datafusion/expr/src/udf_docs.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index faf6492c11eb0..2b3c1a34ac5a1 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -34,6 +34,7 @@ pub struct Documentation { pub doc_section: DocSection, /// the description for the UDF pub description: &'static str, + /// a brief example of the syntax. For example "ascii(str)" pub syntax_example: &'static str, /// a sql example for the UDF, usually in the form of a sql prompt /// query and output. 
It is strongly recommended to provide an From 9050171a8340848e2dee77eef02ceac394169918 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 1 Oct 2024 21:11:54 -0400 Subject: [PATCH 09/13] Refactoring Documentation based on PR feedback. --- .github/workflows/rust.yml | 16 +- .../src/bin/print_aggregate_functions_docs.rs | 152 ---------- .../core/src/bin/print_functions_docs.rs | 265 ++++++++++++++++ .../src/bin/print_scalar_functions_docs.rs | 152 ---------- .../src/bin/print_window_functions_docs.rs | 152 ---------- datafusion/expr/src/lib.rs | 2 +- datafusion/expr/src/udaf.rs | 46 +-- datafusion/expr/src/udf.rs | 48 +-- datafusion/expr/src/udf_docs.rs | 159 ++++++++-- datafusion/expr/src/udwf.rs | 48 +-- .../functions-aggregate/src/bit_and_or_xor.rs | 102 ++++--- datafusion/functions-window/src/row_number.rs | 34 ++- datafusion/functions/src/core/coalesce.rs | 37 +-- datafusion/functions/src/crypto/sha224.rs | 32 +- datafusion/functions/src/datetime/to_date.rs | 51 ++-- datafusion/functions/src/encoding/inner.rs | 62 ++-- datafusion/functions/src/math/log.rs | 41 ++- datafusion/functions/src/regex/regexplike.rs | 49 ++- datafusion/functions/src/string/ascii.rs | 38 +-- datafusion/functions/src/unicode/rpad.rs | 47 ++- dev/update_aggregate_docs.sh | 69 ----- dev/update_function_docs.sh | 284 ++++++++++++++++++ dev/update_scalar_docs.sh | 67 ----- .../user-guide/sql/aggregate_functions_new.md | 74 +++++ docs/source/user-guide/sql/index.rst | 3 + .../user-guide/sql/scalar_functions_new.md | 248 +++++++++++++++ .../user-guide/sql/window_functions_new.md | 51 +--- 27 files changed, 1372 insertions(+), 957 deletions(-) delete mode 100644 datafusion/core/src/bin/print_aggregate_functions_docs.rs create mode 100644 datafusion/core/src/bin/print_functions_docs.rs delete mode 100644 datafusion/core/src/bin/print_scalar_functions_docs.rs delete mode 100644 datafusion/core/src/bin/print_window_functions_docs.rs delete mode 100755 dev/update_aggregate_docs.sh create mode 
100755 dev/update_function_docs.sh delete mode 100755 dev/update_scalar_docs.sh create mode 100644 docs/source/user-guide/sql/aggregate_functions_new.md create mode 100644 docs/source/user-guide/sql/scalar_functions_new.md rename dev/update_window_docs.sh => docs/source/user-guide/sql/window_functions_new.md (81%) mode change 100755 => 100644 diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1d28989a21540..4f8a2f67aa51a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -542,20 +542,10 @@ jobs: # If you encounter an error, run './dev/update_config_docs.sh' and commit ./dev/update_config_docs.sh git diff --exit-code - - name: Check if aggregate_functions.md has been modified + - name: Check if any of the xyz_functions.md has been modified run: | - # If you encounter an error, run './dev/update_aggregate_docs.sh' and commit - ./dev/update_aggregate_docs.sh - git diff --exit-code - - name: Check if scalar_functions.md has been modified - run: | - # If you encounter an error, run './dev/update_scalar_docs.sh' and commit - ./dev/update_scalar_docs.sh - git diff --exit-code - - name: Check if window_functions.md has been modified - run: | - # If you encounter an error, run './dev/update_window_docs.sh' and commit - ./dev/update_window_docs.sh + # If you encounter an error, run './dev/update_function_docs.sh' and commit + ./dev/update_function_docs.sh git diff --exit-code # Verify MSRV for the crates which are directly used by other projects: diff --git a/datafusion/core/src/bin/print_aggregate_functions_docs.rs b/datafusion/core/src/bin/print_aggregate_functions_docs.rs deleted file mode 100644 index 83fe99aa33682..0000000000000 --- a/datafusion/core/src/bin/print_aggregate_functions_docs.rs +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::execution::SessionStateDefaults; -use datafusion_expr::aggregate_doc_sections::doc_sections; -use datafusion_expr::AggregateUDF; -use itertools::Itertools; -use std::fmt::Write as _; -use std::sync::Arc; - -fn main() { - let functions = SessionStateDefaults::default_aggregate_functions(); - let mut docs = "".to_string(); - - // doc sections only includes sections that have 'include' == true - for doc_section in doc_sections() { - // make sure there is a function that is in this doc section - if !functions - .iter() - .any(|f| f.documentation().doc_section == doc_section) - { - continue; - } - - // write out section header - let _ = writeln!(&mut docs, "## {} ", doc_section.label); - - if let Some(description) = doc_section.description { - let _ = writeln!(&mut docs, "{description}"); - } - - let filtered = functions - .clone() - .into_iter() - .filter(|f| f.documentation().doc_section == doc_section) - .collect_vec(); - - // names is a sorted list of function names and aliases since we display - // both in the documentation - let names = get_names_and_aliases(&filtered); - - // write out the list of function names and aliases - names.iter().for_each(|name| { - let _ = writeln!(&mut docs, "- [{name}](#{name})"); - }); - - // write out each function and 
alias in the order of the sorted name list - for name in names { - let f = filtered - .iter() - .find(|f| f.name() == name || f.aliases().contains(&name)) - .unwrap(); - let documentation = f.documentation(); - - // if this name is an alias we need to display what it's an alias of - if f.aliases().contains(&name) { - let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); - continue; - } - - // otherwise display the documentation for the function - - // first, the name, description and syntax example - let _ = write!( - &mut docs, - r#" -### `{}` - -{} - -``` -{} -``` -"#, - f.name(), - documentation.description, - documentation.syntax_example - ); - - // next, arguments - if let Some(args) = documentation.arguments { - let _ = writeln!(&mut docs, "#### Arguments\n"); - for (arg_name, arg_desc) in args { - let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); - } - } - - // next, sql example if provided - if let Some(example) = documentation.sql_example { - let _ = writeln!( - &mut docs, - r#" -#### Example - -{} -"#, - example - ); - } - - // next, aliases - if !f.aliases().is_empty() { - let _ = write!(&mut docs, "#### Aliases"); - - for alias in f.aliases() { - let _ = writeln!(&mut docs, "- {alias}"); - } - } - - // finally, any related udfs - if let Some(related_udfs) = documentation.related_udfs { - let _ = writeln!(&mut docs, "\n**Related functions**:"); - - for related in related_udfs { - let _ = writeln!(&mut docs, "- [{related}](#{related})"); - } - } - } - } - - println!("{docs}"); -} - -fn get_names_and_aliases(functions: &[Arc]) -> Vec { - functions - .iter() - .flat_map(|f| { - if f.aliases().is_empty() { - vec![f.name().to_string()] - } else { - let mut names = vec![f.name().to_string()]; - names.extend(f.aliases().iter().cloned()); - names - } - }) - .sorted() - .collect_vec() -} diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs new file mode 100644 index 
0000000000000..92737b244a647 --- /dev/null +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::{ + aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF, + DocSection, Documentation, ScalarUDF, WindowUDF, +}; +use itertools::Itertools; +use std::env::args; +use std::fmt::Write as _; + +fn main() { + let args: Vec = args().collect(); + + if args.len() != 2 { + panic!( + "Usage: {} type (one of 'aggregate', 'scalar', 'window')", + args[0] + ); + } + + let function_type = args[1].trim().to_lowercase(); + let docs = match function_type.as_str() { + "aggregate" => print_aggregate_docs(), + "scalar" => print_scalar_docs(), + "window" => print_window_docs(), + _ => { + panic!("Unknown function type: {}", function_type) + } + }; + + println!("{docs}"); +} + +fn print_aggregate_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_aggregate_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, aggregate_doc_sections::doc_sections()) +} + +fn print_scalar_docs() -> String { + let 
mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_scalar_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, scalar_doc_sections::doc_sections()) +} + +fn print_window_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_window_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, window_doc_sections::doc_sections()) +} + +fn print_docs( + providers: Vec>, + doc_sections: Vec, +) -> String { + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections { + // make sure there is a function that is in this doc section + if !&providers.iter().any(|f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) { + continue; + } + + let providers: Vec<&Box> = providers + .iter() + .filter(|&f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) + .collect::>(); + + // write out section header + let _ = writeln!(docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(docs, "{description}"); + } + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&providers); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(docs, "- [{name}](#{name})"); + }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = providers + .iter() + .find(|f| f.get_name() == name || f.get_aliases().contains(&name)) + .unwrap(); + + let name = f.get_name(); + let aliases = f.get_aliases(); + let documentation = f.get_documentation(); + + // if this name is an alias we need to display what 
it's an alias of + if aliases.contains(&name) { + let _ = write!(docs, "_Alias of [{name}](#{name})._"); + continue; + } + + // otherwise display the documentation for the function + let Some(documentation) = documentation else { + unreachable!() + }; + + // first, the name, description and syntax example + let _ = write!( + docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + name, documentation.description, documentation.syntax_example + ); + + // next, arguments + if let Some(args) = &documentation.arguments { + let _ = writeln!(docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = &documentation.sql_example { + let _ = writeln!( + docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.get_aliases().is_empty() { + let _ = write!(docs, "#### Aliases"); + + for alias in f.get_aliases() { + let _ = writeln!(docs, "- {alias}"); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(docs, "- [{related}](#{related})"); + } + } + } + } + + docs +} + +trait DocProvider { + fn get_name(&self) -> String; + fn get_aliases(&self) -> Vec; + fn get_documentation(&self) -> Option<&Documentation>; +} + +impl DocProvider for AggregateUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl DocProvider for ScalarUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl 
DocProvider for WindowUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +#[allow(clippy::borrowed_box)] +#[allow(clippy::ptr_arg)] +fn get_names_and_aliases(functions: &Vec<&Box>) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.get_aliases().is_empty() { + vec![f.get_name().to_string()] + } else { + let mut names = vec![f.get_name().to_string()]; + names.extend(f.get_aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/datafusion/core/src/bin/print_scalar_functions_docs.rs b/datafusion/core/src/bin/print_scalar_functions_docs.rs deleted file mode 100644 index 951500b5f02b6..0000000000000 --- a/datafusion/core/src/bin/print_scalar_functions_docs.rs +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use datafusion::execution::SessionStateDefaults; -use datafusion_expr::scalar_doc_sections::doc_sections; -use datafusion_expr::ScalarUDF; -use itertools::Itertools; -use std::fmt::Write as _; -use std::sync::Arc; - -fn main() { - let functions = SessionStateDefaults::default_scalar_functions(); - let mut docs = "".to_string(); - - // doc sections only includes sections that have 'include' == true - for doc_section in doc_sections() { - // make sure there is a function that is in this doc section - if !functions - .iter() - .any(|f| f.documentation().doc_section == doc_section) - { - continue; - } - - // write out section header - let _ = writeln!(&mut docs, "## {} ", doc_section.label); - - if let Some(description) = doc_section.description { - let _ = writeln!(&mut docs, "{description}"); - } - - let filtered = functions - .clone() - .into_iter() - .filter(|f| f.documentation().doc_section == doc_section) - .collect_vec(); - - // names is a sorted list of function names and aliases since we display - // both in the documentation - let names = get_names_and_aliases(&filtered); - - // write out the list of function names and aliases - names.iter().for_each(|name| { - let _ = writeln!(&mut docs, "- [{name}](#{name})"); - }); - - // write out each function and alias in the order of the sorted name list - for name in names { - let f = filtered - .iter() - .find(|f| f.name() == name || f.aliases().contains(&name)) - .unwrap(); - let documentation = f.documentation(); - - // if this name is an alias we need to display what it's an alias of - if f.aliases().contains(&name) { - let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); - continue; - } - - // otherwise display the documentation for the function - - // first, the name, description and syntax example - let _ = write!( - &mut docs, - r#" -### `{}` - -{} - -``` -{} -``` -"#, - f.name(), - documentation.description, - documentation.syntax_example - ); - - // next, arguments - if let Some(args) = 
documentation.arguments { - let _ = writeln!(&mut docs, "#### Arguments\n"); - for (arg_name, arg_desc) in args { - let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); - } - } - - // next, sql example if provided - if let Some(example) = documentation.sql_example { - let _ = writeln!( - &mut docs, - r#" -#### Example - -{} -"#, - example - ); - } - - // next, aliases - if !f.aliases().is_empty() { - let _ = write!(&mut docs, "#### Aliases"); - - for alias in f.aliases() { - let _ = writeln!(&mut docs, "- {alias}"); - } - } - - // finally, any related udfs - if let Some(related_udfs) = documentation.related_udfs { - let _ = writeln!(&mut docs, "\n**Related functions**:"); - - for related in related_udfs { - let _ = writeln!(&mut docs, "- [{related}](#{related})"); - } - } - } - } - - println!("{docs}"); -} - -fn get_names_and_aliases(functions: &[Arc]) -> Vec { - functions - .iter() - .flat_map(|f| { - if f.aliases().is_empty() { - vec![f.name().to_string()] - } else { - let mut names = vec![f.name().to_string()]; - names.extend(f.aliases().iter().cloned()); - names - } - }) - .sorted() - .collect_vec() -} diff --git a/datafusion/core/src/bin/print_window_functions_docs.rs b/datafusion/core/src/bin/print_window_functions_docs.rs deleted file mode 100644 index 8a2f793393f51..0000000000000 --- a/datafusion/core/src/bin/print_window_functions_docs.rs +++ /dev/null @@ -1,152 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::execution::SessionStateDefaults; -use datafusion_expr::window_doc_sections::doc_sections; -use datafusion_expr::WindowUDF; -use itertools::Itertools; -use std::fmt::Write as _; -use std::sync::Arc; - -fn main() { - let functions = SessionStateDefaults::default_window_functions(); - let mut docs = "".to_string(); - - // doc sections only includes sections that have 'include' == true - for doc_section in doc_sections() { - // make sure there is a function that is in this doc section - if !functions - .iter() - .any(|f| f.documentation().doc_section == doc_section) - { - continue; - } - - // write out section header - let _ = writeln!(&mut docs, "## {} ", doc_section.label); - - if let Some(description) = doc_section.description { - let _ = writeln!(&mut docs, "{description}"); - } - - let filtered = functions - .clone() - .into_iter() - .filter(|f| f.documentation().doc_section == doc_section) - .collect_vec(); - - // names is a sorted list of function names and aliases since we display - // both in the documentation - let names = get_names_and_aliases(&filtered); - - // write out the list of function names and aliases - names.iter().for_each(|name| { - let _ = writeln!(&mut docs, "- [{name}](#{name})"); - }); - - // write out each function and alias in the order of the sorted name list - for name in names { - let f = filtered - .iter() - .find(|f| f.name() == name || f.aliases().contains(&name)) - .unwrap(); - let documentation = f.documentation(); - - // if this name is an alias we need to display what it's an alias of - if 
f.aliases().contains(&name) { - let _ = write!(&mut docs, "_Alias of [{name}](#{name})._"); - continue; - } - - // otherwise display the documentation for the function - - // first, the name, description and syntax example - let _ = write!( - &mut docs, - r#" -### `{}` - -{} - -``` -{} -``` -"#, - f.name(), - documentation.description, - documentation.syntax_example - ); - - // next, arguments - if let Some(args) = documentation.arguments { - let _ = writeln!(&mut docs, "#### Arguments\n"); - for (arg_name, arg_desc) in args { - let _ = writeln!(&mut docs, "- **{arg_name}**: {arg_desc}"); - } - } - - // next, sql example if provided - if let Some(example) = documentation.sql_example { - let _ = writeln!( - &mut docs, - r#" -#### Example - -{} -"#, - example - ); - } - - // next, aliases - if !f.aliases().is_empty() { - let _ = write!(&mut docs, "#### Aliases"); - - for alias in f.aliases() { - let _ = writeln!(&mut docs, "- {alias}"); - } - } - - // finally, any related udfs - if let Some(related_udfs) = documentation.related_udfs { - let _ = writeln!(&mut docs, "\n**Related functions**:"); - - for &related in related_udfs { - let _ = writeln!(&mut docs, "- [{related}](#{related})"); - } - } - } - } - - println!("{docs}"); -} - -fn get_names_and_aliases(functions: &[Arc]) -> Vec { - functions - .iter() - .flat_map(|f| { - if f.aliases().is_empty() { - vec![f.name().to_string()] - } else { - let mut names = vec![f.name().to_string()]; - names.extend(f.aliases().iter().cloned()); - names - } - }) - .sorted() - .collect_vec() -} diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 60696e217e9bc..bb312c52b5c0e 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -93,7 +93,7 @@ pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; pub use udaf::{aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF}; pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; -pub use 
udf_docs::{DocSection, Documentation, DOCUMENTATION_NONE, DOC_SECTION_NONE}; +pub use udf_docs::{DocSection, Documentation, DocumentationBuilder}; pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 24fa89e45cb1e..5c23926abb971 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -33,7 +33,6 @@ use crate::function::{ AccumulatorArgs, AggregateFunctionSimplification, StateFieldsArgs, }; use crate::groups_accumulator::GroupsAccumulator; -use crate::udf_docs::DOCUMENTATION_NONE; use crate::utils::format_state_name; use crate::utils::AggregateOrderSensitivity; use crate::{Accumulator, Expr}; @@ -250,8 +249,11 @@ impl AggregateUDF { self.inner.default_value(data_type) } - /// Returns this UDF's documentation that will be used to generate public documentation - pub fn documentation(&self) -> &Documentation { + /// Returns the documentation for this Aggregate UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. 
+ pub fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } @@ -278,6 +280,7 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; /// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr, Documentation}; @@ -299,14 +302,19 @@ where /// } /// } /// -/// const DOCUMENTATION: Documentation = Documentation { -/// doc_section: DOC_SECTION_AGGREGATE, -/// description: "calculates a geometric mean", -/// syntax_example: "geo_mean(2.0)", -/// sql_example: None, -/// arguments: Some(&[("arg_1", "The Float64 number for the geometric mean")]), -/// related_udfs: None, -/// }; +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_AGGREGATE) +/// .with_description("calculates a geometric mean") +/// .with_syntax_example("geo_mean(2.0)") +/// .with_argument("arg1", "The Float64 number for the geometric mean") +/// .build() +/// .unwrap() +/// }) +/// } /// /// /// Implement the AggregateUDFImpl trait for GeoMeanUdf /// impl AggregateUDFImpl for GeoMeanUdf { @@ -327,8 +335,8 @@ where /// Field::new("ordering", DataType::UInt32, true) /// ]) /// } -/// fn documentation(&self) -> &Documentation { -/// &DOCUMENTATION +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) /// } /// } /// @@ -585,10 +593,12 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { ScalarValue::try_from(data_type) } - /// Returns the documentation for this Aggregate UDF for use - /// in generating publicly facing documentation. - fn documentation(&self) -> &Documentation { - &DOCUMENTATION_NONE + /// Returns the documentation for this Scalar UDF. 
+ /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None } } @@ -737,7 +747,7 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl { self.inner.is_descending() } - fn documentation(&self) -> &Documentation { + fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 36cc8f64d8a98..3759fb18f56df 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -20,7 +20,6 @@ use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; -use crate::udf_docs::DOCUMENTATION_NONE; use crate::{ ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature, }; @@ -278,8 +277,11 @@ impl ScalarUDF { self.inner.coerce_types(arg_types) } - /// Returns this UDF's documentation that will be used to generate public documentation - pub fn documentation(&self) -> &Documentation { + /// Returns the documentation for this Scalar UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. 
+ pub fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } @@ -306,6 +308,7 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; /// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility}; @@ -325,15 +328,20 @@ where /// } /// } /// -/// const DOCUMENTATION: Documentation = Documentation { -/// doc_section: DOC_SECTION_MATH, -/// description: "Add one to an int32", -/// syntax_example: "add_one(2)", -/// sql_example: None, -/// arguments: Some(&[("arg_1", "The int32 number to add one to")]), -/// related_udfs: None, -/// }; -/// +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg1", "The int32 number to add one to") +/// .build() +/// .unwrap() +/// }) +/// } +/// /// /// Implement the ScalarUDFImpl trait for AddOne /// impl ScalarUDFImpl for AddOne { /// fn as_any(&self) -> &dyn Any { self } @@ -347,8 +355,8 @@ where /// } /// // The actual implementation would add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } -/// fn documentation(&self) -> &Documentation { -/// &DOCUMENTATION +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) /// } /// } /// @@ -619,10 +627,12 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { hasher.finish() } - /// Returns the documentation for this scalar UDF for use - /// in generating publicly facing documentation. - fn documentation(&self) -> &Documentation { - &DOCUMENTATION_NONE + /// Returns the documentation for this Scalar UDF. 
+ /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None } } @@ -738,7 +748,7 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { hasher.finish() } - fn documentation(&self) -> &Documentation { + fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index 2b3c1a34ac5a1..280910b87199d 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. -/// Documentation for use by [`crate::ScalarUDFImpl`], -/// [`crate::AggregateUDFImpl`] and [`crate::WindowUDFImpl`] functions +use datafusion_common::exec_err; +use datafusion_common::Result; + +/// Documentation for use by [`ScalarUDFImpl`](crate::ScalarUDFImpl), +/// [`AggregateUDFImpl`](crate::AggregateUDFImpl) and [`WindowUDFImpl`](crate::WindowUDFImpl) functions /// that will be used to generate public documentation. /// -/// The name of the udf will be pulled from the [`crate::ScalarUDFImpl::name`], -/// [`crate::AggregateUDFImpl::name`] or [`crate::WindowUDFImpl::name`] function -/// as appropriate. +/// The name of the udf will be pulled from the [`ScalarUDFImpl::name`](crate::ScalarUDFImpl::name), +/// [`AggregateUDFImpl::name`](crate::AggregateUDFImpl::name) or [`WindowUDFImpl::name`](crate::WindowUDFImpl::name) +/// function as appropriate. /// /// All strings in the documentation are required to be /// in [markdown format](https://www.markdownguide.org/basic-syntax/). @@ -33,22 +36,29 @@ pub struct Documentation { /// the section in the documentation where the UDF will be documented pub doc_section: DocSection, /// the description for the UDF - pub description: &'static str, + pub description: String, /// a brief example of the syntax. 
For example "ascii(str)" - pub syntax_example: &'static str, + pub syntax_example: String, /// a sql example for the UDF, usually in the form of a sql prompt /// query and output. It is strongly recommended to provide an /// example for anything but the most basic UDF's - pub sql_example: Option<&'static str>, + pub sql_example: Option, /// arguments for the UDF which will be displayed in array order. /// Left member of a pair is the argument name, right is a /// description for the argument - pub arguments: Option<&'static [(&'static str, &'static str)]>, + pub arguments: Option>, /// related functions if any. Values should match the related /// udf's name exactly. Related udf's must be of the same /// UDF type (scalar, aggregate or window) for proper linking to /// occur - pub related_udfs: Option<&'static [&'static str]>, + pub related_udfs: Option>, +} + +impl Documentation { + /// Returns a new [`DocumentationBuilder`] with no options set. + pub fn builder() -> DocumentationBuilder { + DocumentationBuilder::new() + } } #[derive(Debug, Clone, PartialEq)] @@ -62,19 +72,116 @@ pub struct DocSection { pub description: Option<&'static str>, } -pub const DOCUMENTATION_NONE: Documentation = Documentation { - doc_section: DOC_SECTION_NONE, - description: "", - syntax_example: "", - sql_example: None, - arguments: None, - related_udfs: None, -}; - -/// A doc section that indicated the UDF should not -/// be publicly documented -pub const DOC_SECTION_NONE: DocSection = DocSection { - include: false, - label: "", - description: None, -}; +/// A builder to be used for building [`Documentation`]'s. 
+/// +/// Example: +/// +/// ```rust +/// # use datafusion_expr::Documentation; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// # use datafusion_common::Result; +/// # +/// # fn main() -> Result<()> { +/// let documentation = Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg_1", "The int32 number to add one to") +/// .build()?; +/// Ok(()) +/// # } +pub struct DocumentationBuilder { + pub doc_section: Option, + pub description: Option, + pub syntax_example: Option, + pub sql_example: Option, + pub arguments: Option>, + pub related_udfs: Option>, +} + +impl DocumentationBuilder { + pub fn new() -> Self { + Self { + doc_section: None, + description: None, + syntax_example: None, + sql_example: None, + arguments: None, + related_udfs: None, + } + } + + pub fn with_doc_section(mut self, doc_section: DocSection) -> Self { + self.doc_section = Some(doc_section); + self + } + + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = Some(description.into()); + self + } + + pub fn with_syntax_example(mut self, syntax_example: impl Into) -> Self { + self.syntax_example = Some(syntax_example.into()); + self + } + + pub fn with_sql_example(mut self, sql_example: impl Into) -> Self { + self.sql_example = Some(sql_example.into()); + self + } + + pub fn with_argument( + mut self, + arg_name: impl Into, + arg_description: impl Into, + ) -> Self { + let mut args = self.arguments.unwrap_or_default(); + args.push((arg_name.into(), arg_description.into())); + self.arguments = Some(args); + self + } + + pub fn with_related_udf(mut self, related_udf: impl Into) -> Self { + let mut related = self.related_udfs.unwrap_or_default(); + related.push(related_udf.into()); + self.related_udfs = Some(related); + self + } + + pub fn build(self) -> Result { + let Self { + doc_section, + description, + syntax_example, 
+ sql_example, + arguments, + related_udfs, + } = self; + + if doc_section.is_none() { + return exec_err!("Documentation must have a doc section"); + } + if description.is_none() { + return exec_err!("Documentation must have a description"); + } + if syntax_example.is_none() { + return exec_err!("Documentation must have a syntax_example"); + } + + Ok(Documentation { + doc_section: doc_section.unwrap(), + description: description.unwrap(), + syntax_example: syntax_example.unwrap(), + sql_example, + arguments, + related_udfs, + }) + } +} + +impl Default for DocumentationBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 7f9c6a5173c9c..6459e8f3f7d17 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -32,7 +32,6 @@ use datafusion_common::{not_impl_err, Result}; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use crate::expr::WindowFunction; -use crate::udf_docs::DOCUMENTATION_NONE; use crate::{ function::WindowFunctionSimplification, Documentation, Expr, PartitionEvaluator, Signature, @@ -183,8 +182,11 @@ impl WindowUDF { self.inner.reverse_expr() } - /// Returns this UDF's documentation that will be used to generate public documentation - pub fn documentation(&self) -> &Documentation { + /// Returns the documentation for this Window UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. 
+ pub fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } @@ -211,6 +213,7 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::{DataFusionError, plan_err, Result}; /// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation}; @@ -231,15 +234,20 @@ where /// } /// } /// -/// const DOCUMENTATION: Documentation = Documentation { -/// doc_section: DOC_SECTION_ANALYTICAL, -/// description: "smooths the windows", -/// syntax_example: "smooth_it(2)", -/// sql_example: None, -/// arguments: Some(&[("arg_1", "The int32 number to smooth by")]), -/// related_udfs: None, -/// }; - +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_ANALYTICAL) +/// .with_description("smooths the windows") +/// .with_syntax_example("smooth_it(2)") +/// .with_argument("arg1", "The int32 number to smooth by") +/// .build() +/// .unwrap() +/// }) +/// } +/// /// /// Implement the WindowUDFImpl trait for SmoothIt /// impl WindowUDFImpl for SmoothIt { /// fn as_any(&self) -> &dyn Any { self } @@ -254,8 +262,8 @@ where /// plan_err!("smooth_it only accepts Int32 arguments") /// } /// } -/// fn documentation(&self) -> &Documentation { -/// &DOCUMENTATION +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) /// } /// } /// @@ -387,10 +395,12 @@ pub trait WindowUDFImpl: Debug + Send + Sync { ReversedUDWF::NotSupported } - /// Returns the documentation for this window UDF for use - /// in generating publicly facing documentation. - fn documentation(&self) -> &Documentation { - &DOCUMENTATION_NONE + /// Returns the documentation for this Window UDF. 
+ /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None } } @@ -493,7 +503,7 @@ impl WindowUDFImpl for AliasedWindowUDFImpl { self.inner.coerce_types(arg_types) } - fn documentation(&self) -> &Documentation { + fn documentation(&self) -> Option<&Documentation> { self.inner.documentation() } } diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index 48d2b9646cc74..ce36e09bc25b5 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -42,6 +42,7 @@ use datafusion_expr::{ use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; +use std::sync::OnceLock; /// This macro helps create group accumulators based on bitwise operations typically used internally /// and might not be necessary for users to call directly. @@ -133,65 +134,74 @@ macro_rules! make_bitwise_udaf_expr_and_func { }; } -const BIT_AND_DOC: Documentation = Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise AND of all non-null input values.", - syntax_example: "bit_and(expression)", - sql_example: None, - arguments: Some(&[ - ( - "expression", - "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ]), - related_udfs: None, -}; +static BIT_AND_DOC: OnceLock = OnceLock::new(); + +fn get_bit_and_doc() -> &'static Documentation { + BIT_AND_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise AND of all non-null input values.") + .with_syntax_example("bit_and(expression)") + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} -const BIT_OR_DOC: Documentation = Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise OR of all non-null input values.", - syntax_example: "bit_or(expression)", - sql_example: None, - arguments: Some(&[ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ]), - related_udfs: None, -}; +static BIT_OR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_or_doc() -> &'static Documentation { + BIT_OR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise OR of all non-null input values.") + .with_syntax_example("bit_or(expression)") + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} -const BIT_XOR_DOC: Documentation = Documentation { - doc_section: DOC_SECTION_GENERAL, - description: "Computes the bitwise exclusive OR of all non-null input values.", - syntax_example: "bit_xor(expression)", - sql_example: None, - arguments: Some(&[ - ( - "expression", - "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", - ), - ]), - related_udfs: None, -}; +static BIT_XOR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_xor_doc() -> &'static Documentation { + BIT_XOR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise exclusive OR of all non-null input values.") + .with_syntax_example("bit_xor(expression)") + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .build() + .unwrap() + }) +} make_bitwise_udaf_expr_and_func!( bit_and, bit_and_udaf, BitwiseOperationType::And, - BIT_AND_DOC + get_bit_and_doc() ); make_bitwise_udaf_expr_and_func!( bit_or, bit_or_udaf, BitwiseOperationType::Or, - BIT_OR_DOC + get_bit_or_doc() ); make_bitwise_udaf_expr_and_func!( bit_xor, bit_xor_udaf, BitwiseOperationType::Xor, - BIT_XOR_DOC + get_bit_xor_doc() ); /// The different types of bitwise operations that can be performed. @@ -215,14 +225,14 @@ struct BitwiseOperation { /// `operation` indicates the type of bitwise operation to be performed. 
operation: BitwiseOperationType, func_name: &'static str, - documentation: Documentation, + documentation: &'static Documentation, } impl BitwiseOperation { pub fn new( operator: BitwiseOperationType, func_name: &'static str, - documentation: Documentation, + documentation: &'static Documentation, ) -> Self { Self { operation: operator, @@ -306,8 +316,8 @@ impl AggregateUDFImpl for BitwiseOperation { ReversedUDAF::Identical } - fn documentation(&self) -> &Documentation { - &self.documentation + fn documentation(&self) -> Option<&Documentation> { + Some(self.documentation) } } diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index 46e239203ff08..326e1d49ea466 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -17,10 +17,6 @@ //! Defines physical expression for `row_number` that can evaluated at runtime during query execution -use std::any::Any; -use std::fmt::Debug; -use std::ops::Range; - use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::UInt64Array; use datafusion_common::arrow::compute::SortOptions; @@ -34,6 +30,10 @@ use datafusion_expr::{ }; use datafusion_functions_window_common::field; use field::WindowUDFFieldArgs; +use std::any::Any; +use std::fmt::Debug; +use std::ops::Range; +use std::sync::OnceLock; /// Create a [`WindowFunction`](Expr::WindowFunction) expression for /// `row_number` user-defined window function. 
@@ -77,14 +77,20 @@ impl Default for RowNumber { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_RANKING, - description: "Number of the current row within its partition, counting from 1.", - syntax_example: "row_number()", - sql_example: None, - arguments: None, - related_udfs: None, -}; +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_row_number_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Number of the current row within its partition, counting from 1.", + ) + .with_syntax_example("row_number()") + .build() + .unwrap() + }) +} impl WindowUDFImpl for RowNumber { fn as_any(&self) -> &dyn Any { @@ -114,8 +120,8 @@ impl WindowUDFImpl for RowNumber { }) } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_row_number_doc()) } } diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index c0859bd4ae100..d8ff44798f8a7 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-use std::any::Any; - use arrow::array::{new_null_array, BooleanArray}; use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; @@ -27,6 +25,8 @@ use datafusion_expr::type_coercion::binary::type_union_resolution; use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use itertools::Itertools; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct CoalesceFunc { @@ -47,19 +47,22 @@ impl CoalesceFunc { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_CONDITIONAL, - description: "Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.", - syntax_example: "coalesce(expression1[, ..., expression_n])", - sql_example: None, - arguments: Some(&[ - ( - "expression1, expression_n", - "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." - ), - ]), - related_udfs: None, -}; +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_coalesce_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") + .with_syntax_example("coalesce(expression1[, ..., expression_n])") + .with_argument( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." 
+ ) + .build() + .unwrap() + }) +} impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { @@ -156,8 +159,8 @@ impl ScalarUDFImpl for CoalesceFunc { Ok(vec![new_type; arg_types.len()]) } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_coalesce_doc()) } } diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 6104f3a98d341..df3045f22cf52 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -24,6 +24,7 @@ use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct SHA224Func { @@ -49,19 +50,20 @@ impl SHA224Func { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_HASHING, - description: "Computes the SHA-224 hash of a binary string.", - syntax_example: "sha224(expression)", - sql_example: None, - arguments: Some(&[ - ( - "expression", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ]), - related_udfs: None, -}; +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_sha224_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_HASHING) + .with_description("Computes the SHA-224 hash of a binary string.") + .with_syntax_example("sha224(expression)") + .with_argument("expression", + "String expression to operate on. 
Can be a constant, column, or function, and any combination of string operators.") + .build() + .unwrap() + }) +} impl ScalarUDFImpl for SHA224Func { fn as_any(&self) -> &dyn Any { @@ -84,7 +86,7 @@ impl ScalarUDFImpl for SHA224Func { sha224(args) } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_sha224_doc()) } } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index f0b53b965dc1e..176d7f8bbcbf6 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use crate::datetime::common::*; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Date32; @@ -28,6 +26,8 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct ToDateFunc { @@ -79,19 +79,22 @@ impl ToDateFunc { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_DATETIME, - description: r#"Converts a value to a date (`YYYY-MM-DD`). +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_to_date_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_DATETIME) + .with_description(r#"Converts a value to a date (`YYYY-MM-DD`). Supports strings, integer and double types as input. Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). Returns the corresponding date. 
Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. -"#, - syntax_example: "to_date('2017-05-31', '%Y-%m-%d')", - sql_example: Some( - r#"```sql +"#) + .with_syntax_example("to_date('2017-05-31', '%Y-%m-%d')") + .with_sql_example(r#"```sql > select to_date('2023-01-31'); +-----------------------------+ | to_date(Utf8("2023-01-31")) | @@ -107,21 +110,21 @@ Note: `to_date` returns Date32, which represents its values as the number of day ``` Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) -"#), - arguments: Some(&[ - ( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." - ), - ( - "format_n", - "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order +"#) + .with_argument( + "expression", + "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", + ) + .with_argument( + "format_n", + "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. 
If none of the formats successfully parse the expression an error will be returned.", - ) - ]), - related_udfs: None, -}; + ) + .build() + .unwrap() + }) +} impl ScalarUDFImpl for ToDateFunc { fn as_any(&self) -> &dyn Any { @@ -164,8 +167,8 @@ impl ScalarUDFImpl for ToDateFunc { } } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_to_date_doc()) } } diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index f41a10f6ace7f..7dcf6d60618fe 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -29,7 +29,7 @@ use datafusion_common::{ use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::{ColumnarValue, Documentation}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use std::{fmt, str::FromStr}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING; @@ -55,17 +55,21 @@ impl EncodeFunc { } } -const ENCODE_DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_BINARY_STRING, - description: "Encode binary data into a textual representation.", - syntax_example: "encode(expression, format)", - sql_example: None, - arguments: Some(&[ - ("expression", "Expression containing string or binary data"), - ("format", "Supported formats are: `base64`, `hex`"), - ]), - related_udfs: Some(&["decode"]), -}; +static ENCODE_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_encode_doc() -> &'static Documentation { + ENCODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Encode binary data into a textual representation.") + .with_syntax_example("encode(expression, format)") + .with_argument("expression", "Expression containing string or binary data") + .with_argument("format", "Supported formats are: `base64`, `hex`") + 
.with_related_udf("decode") + .build() + .unwrap() + }) +} impl ScalarUDFImpl for EncodeFunc { fn as_any(&self) -> &dyn Any { @@ -114,8 +118,8 @@ impl ScalarUDFImpl for EncodeFunc { } } - fn documentation(&self) -> &Documentation { - &ENCODE_DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_encode_doc()) } } @@ -138,17 +142,21 @@ impl DecodeFunc { } } -const DECODE_DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_BINARY_STRING, - description: "Decode binary data from textual representation in string.", - syntax_example: "decode(expression, format)", - sql_example: None, - arguments: Some(&[ - ("expression", "Expression containing encoded string data"), - ("format", "Same arguments as [encode](#encode)"), - ]), - related_udfs: Some(&["encode"]), -}; +static DECODE_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_decode_doc() -> &'static Documentation { + DECODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Decode binary data from textual representation in string.") + .with_syntax_example("decode(expression, format)") + .with_argument("expression", "Expression containing encoded string data") + .with_argument("format", "Same arguments as [encode](#encode)") + .with_related_udf("encode") + .build() + .unwrap() + }) +} impl ScalarUDFImpl for DecodeFunc { fn as_any(&self) -> &dyn Any { @@ -197,8 +205,8 @@ impl ScalarUDFImpl for DecodeFunc { } } - fn documentation(&self) -> &Documentation { - &DECODE_DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_decode_doc()) } } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index 5925b9a47bf26..889e3761d26cc 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -18,7 +18,7 @@ //! Math function: `log()`. 
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use super::power::PowerFunc; @@ -48,24 +48,23 @@ impl Default for LogFunc { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_MATH, - description: "Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.", - syntax_example: r#"log(base, numeric_expression) -log(numeric_expression)"#, - sql_example: None, - arguments: Some(&[ - ( - "base", - "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." - ), - ( - "numeric_expression", - "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators." - ), - ]), - related_udfs: None, -}; +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_log_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MATH) + .with_description("Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.") + .with_syntax_example(r#"log(base, numeric_expression) +log(numeric_expression)"#) + .with_argument("base", + "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_argument("numeric_expression", + "Numeric expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.") + .build() + .unwrap() + }) +} impl LogFunc { pub fn new() -> Self { @@ -186,8 +185,8 @@ impl ScalarUDFImpl for LogFunc { Ok(ColumnarValue::Array(arr)) } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_log_doc()) } /// Simplify the `log` function by the relevant rules: diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 41349e6e80dfb..c57b0eaf2a367 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -30,7 +30,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct RegexpLikeFunc { @@ -43,12 +43,15 @@ impl Default for RegexpLikeFunc { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_REGEX, - description: "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.", - syntax_example: "regexp_like(str, regexp[, flags])", - sql_example: Some( - r#"```sql +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_regexp_like_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_REGEX) + .with_description("Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.") + .with_syntax_example("regexp_like(str, regexp[, flags])") + .with_sql_example(r#"```sql select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); +--------------------------------------------------------+ | regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | @@ -63,26 
+66,22 @@ SELECT regexp_like('aBc', '(b|d)', 'i'); +--------------------------------------------------+ ``` Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) -"#), - arguments: Some(&[ - ( - "str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ( "regexp", - "Regular expression to test against the string expression. Can be a constant, column, or function." - ), - ("flags", - r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: +"#) + .with_argument("str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.") + .with_argument("regexp", + "Regular expression to test against the string expression. Can be a constant, column, or function.") + .with_argument("flags", + r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: - **i**: case-insensitive: letters match both upper and lower case - **m**: multi-line mode: ^ and $ match begin/end of line - **s**: allow . 
to match \n - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used - - **U**: swap the meaning of x* and x*?"# - ) - ]), - related_udfs: None, -}; + - **U**: swap the meaning of x* and x*?"#) + .build() + .unwrap() + }) +} impl RegexpLikeFunc { pub fn new() -> Self { @@ -149,8 +148,8 @@ impl ScalarUDFImpl for RegexpLikeFunc { } } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_regexp_like_doc()) } } fn regexp_like_func(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 3e0b321804720..d01c6631e9dde 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -24,21 +24,25 @@ use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; - -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_STRING, - description: "Returns the ASCII value of the first character in a string.", - syntax_example: "ascii(str)", - sql_example: None, - arguments: Some(&[ - ( - "str", - "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View." - ) - ]), - related_udfs: Some(&["chr"]), -}; +use std::sync::{Arc, OnceLock}; + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ascii_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the ASCII value of the first character in a string.") + .with_syntax_example("ascii(str)") + .with_argument( + "str", + "String expression to operate on. 
Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View.", + ) + .with_related_udf("chr") + .build() + .unwrap() + }) +} #[derive(Debug)] pub struct AsciiFunc { @@ -87,8 +91,8 @@ impl ScalarUDFImpl for AsciiFunc { make_scalar_function(ascii, vec![])(args) } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_ascii_doc()) } } diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index e72490e7afb72..ce221b44f42be 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -32,7 +32,7 @@ use datafusion_expr::{ }; use std::any::Any; use std::fmt::Write; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; @@ -47,27 +47,26 @@ impl Default for RPadFunc { } } -const DOCUMENTATION: Documentation = Documentation { - doc_section: DOC_SECTION_STRING, - description: "Pads the right side of a string with another string to a specified string length.", - syntax_example: "rpad(str, n[, padding_str])", - sql_example: None, - arguments: Some(&[ - ( - "str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators." - ), - ( - "n", - "String length to pad to." - ), - ( - "padding_str", - "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. 
_Default is a space._" - ), - ]), - related_udfs: Some(&["lpad"]), -}; +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the right side of a string with another string to a specified string length.") + .with_syntax_example("rpad(str, n[, padding_str])") + .with_argument( + "str", + "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.", + ) + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("lpad") + .build() + .unwrap() + }) +} impl RPadFunc { pub fn new() -> Self { @@ -139,8 +138,8 @@ impl ScalarUDFImpl for RPadFunc { } } - fn documentation(&self) -> &Documentation { - &DOCUMENTATION + fn documentation(&self) -> Option<&Documentation> { + Some(get_rpad_doc()) } } diff --git a/dev/update_aggregate_docs.sh b/dev/update_aggregate_docs.sh deleted file mode 100755 index 9ad8074927d27..0000000000000 --- a/dev/update_aggregate_docs.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -e - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "${SOURCE_DIR}/../" && pwd - -TARGET_FILE="docs/source/user-guide/sql/aggregate_functions_new.md" -PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_aggregate_functions_docs" - -echo "Inserting header" -cat <<'EOF' > "$TARGET_FILE" - - - - -# Aggregate Functions - -Aggregate functions operate on a set of values to compute a single result. -EOF - -echo "Running CLI and inserting aggregate function docs table" -$PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" - -echo "Running prettier" -npx prettier@2.3.2 --write "$TARGET_FILE" - -echo "'$TARGET_FILE' successfully updated!" diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh new file mode 100755 index 0000000000000..a4236eefc8c8d --- /dev/null +++ b/dev/update_function_docs.sh @@ -0,0 +1,284 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + + +TARGET_FILE="docs/source/user-guide/sql/aggregate_functions_new.md" +PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- aggregate" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. +EOF + +echo "Running CLI and inserting aggregate function docs table" +$PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + +TARGET_FILE="docs/source/user-guide/sql/scalar_functions_new.md" +PRINT_SCALAR_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- scalar" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. +EOF + +echo "Running CLI and inserting scalar function docs table" +$PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" 
+ +TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" +PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- window" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + + +# Window Functions (NEW) + +This page is a WIP and will replace the Window Functions page once completed. + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. 
The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to compute the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. 
Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. 
For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is a non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +EOF + +echo "Running CLI and inserting window function docs table" +$PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + diff --git a/dev/update_scalar_docs.sh b/dev/update_scalar_docs.sh deleted file mode 100755 index 5ff5cebad4f1b..0000000000000 --- a/dev/update_scalar_docs.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -e - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "${SOURCE_DIR}/../" && pwd - -TARGET_FILE="docs/source/user-guide/sql/scalar_functions_new.md" -PRINT_SCALAR_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_scalar_functions_docs" - -echo "Inserting header" -cat <<'EOF' > "$TARGET_FILE" - - - - -# Scalar Functions -EOF - -echo "Running CLI and inserting scalar function docs table" -$PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" - -echo "Running prettier" -npx prettier@2.3.2 --write "$TARGET_FILE" - -echo "'$TARGET_FILE' successfully updated!" diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md new file mode 100644 index 0000000000000..8303c50c2471f --- /dev/null +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -0,0 +1,74 @@ + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. + +## General Functions + +- [bit_and](#bit_and) +- [bit_or](#bit_or) +- [bit_xor](#bit_xor) + +### `bit_and` + +Computes the bitwise AND of all non-null input values. + +``` +bit_and(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +### `bit_or` + +Computes the bitwise OR of all non-null input values. 
+ +``` +bit_or(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +### `bit_xor` + +Computes the bitwise exclusive OR of all non-null input values. + +``` +bit_xor(expression) +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index 04d1fc228f816..6eb451c83b964 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -30,7 +30,10 @@ SQL Reference information_schema operators aggregate_functions + aggregate_functions_new window_functions + window_functions_new scalar_functions + scalar_functions_new sql_status write_options diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md new file mode 100644 index 0000000000000..a341cbd81c7ee --- /dev/null +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -0,0 +1,248 @@ + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. + +## Math Functions + +- [log](#log) + +### `log` + +Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number. + +``` +log(base, numeric_expression) +log(numeric_expression) +``` + +#### Arguments + +- **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + +## Conditional Functions + +- [coalesce](#coalesce) + +### `coalesce` + +Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. 
This function is often used to substitute a default value for _null_ values. + +``` +coalesce(expression1[, ..., expression_n]) +``` + +#### Arguments + +- **expression1, expression_n**: Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary. + +## String Functions + +- [ascii](#ascii) +- [rpad](#rpad) + +### `ascii` + +Returns the ASCII value of the first character in a string. + +``` +ascii(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View. + +**Related functions**: + +- [chr](#chr) + +### `rpad` + +Pads the right side of a string with another string to a specified string length. + +``` +rpad(str, n[, padding_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ + +**Related functions**: + +- [lpad](#lpad) + +## Binary String Functions + +- [decode](#decode) +- [encode](#encode) + +### `decode` + +Decode binary data from textual representation in string. + +``` +decode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing encoded string data +- **format**: Same arguments as [encode](#encode) + +**Related functions**: + +- [encode](#encode) + +### `encode` + +Encode binary data into a textual representation. 
+ +``` +encode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing string or binary data +- **format**: Supported formats are: `base64`, `hex` + +**Related functions**: + +- [decode](#decode) + +## Regular Expression Functions + +Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). +The following regular expression functions are supported: + +- [regexp_like](#regexp_like) + +### `regexp_like` + +Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise. + +``` +regexp_like(str, regexp[, flags]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **regexp**: Regular expression to test against the string expression. Can be a constant, column, or function. +- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*? 
+ +#### Example + +```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) + +## Time and Date Functions + +- [to_date](#to_date) + +### `to_date` + +Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. + +``` +to_date('2017-05-31', '%Y-%m-%d') +``` + +#### Arguments + +- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. 
+ +#### Example + +```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) + +## Hashing Functions + +- [sha224](#sha224) + +### `sha224` + +Computes the SHA-224 hash of a binary string. + +``` +sha224(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. diff --git a/dev/update_window_docs.sh b/docs/source/user-guide/sql/window_functions_new.md old mode 100755 new mode 100644 similarity index 81% rename from dev/update_window_docs.sh rename to docs/source/user-guide/sql/window_functions_new.md index a77fd2fd8cccc..1ab6740a6f874 --- a/dev/update_window_docs.sh +++ b/docs/source/user-guide/sql/window_functions_new.md @@ -1,33 +1,3 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -set -e - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "${SOURCE_DIR}/../" && pwd - -TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" -PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_window_functions_docs" - -echo "Inserting header" -cat <<'EOF' > "$TARGET_FILE" +# Window Functions (NEW) -# Window Functions +This page is a WIP and will replace the Window Functions page once completed. A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result @@ -177,12 +148,14 @@ RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must All [aggregate functions](aggregate_functions.md) can be used as window functions. -EOF +## Ranking Functions -echo "Running CLI and inserting window function docs table" -$PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" +- [row_number](#row_number) -echo "Running prettier" -npx prettier@2.3.2 --write "$TARGET_FILE" +### `row_number` -echo "'$TARGET_FILE' successfully updated!" +Number of the current row within its partition, counting from 1. 
+ +``` +row_number() +``` From 1094e52daad18f4757917560f4afde928f759713 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 1 Oct 2024 21:16:44 -0400 Subject: [PATCH 10/13] Cargo fmt update. --- datafusion/expr/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index c1249d5f2caaa..849d9604808ca 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -91,7 +91,9 @@ pub use logical_plan::*; pub use partition_evaluator::PartitionEvaluator; pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; -pub use udaf::{aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs}; +pub use udaf::{ + aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs, +}; pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; pub use udf_docs::{DocSection, Documentation, DocumentationBuilder}; pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; From 2af36d0927955b7df80b42b29ae595598c06aa97 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 1 Oct 2024 21:54:11 -0400 Subject: [PATCH 11/13] Doc update --- docs/source/user-guide/sql/scalar_functions_new.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index a341cbd81c7ee..ae2744c1650e6 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -95,6 +95,7 @@ rpad(str, n[, padding_str]) #### Arguments - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **n**: String length to pad to. - **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. 
_Default is a space._ **Related functions**: From 2b4aa5353e3fcc543816cf6e448432639500b117 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 1 Oct 2024 21:58:59 -0400 Subject: [PATCH 12/13] Fixed copy/paste error. --- datafusion/expr/src/udaf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 7b572717ac7f6..6e48054bcf3d6 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -632,7 +632,7 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { ScalarValue::try_from(data_type) } - /// Returns the documentation for this Scalar UDF. + /// Returns the documentation for this Aggregate UDF. /// /// Documentation can be accessed programmatically as well as /// generating publicly facing documentation. From 91e095ac19472d144ee5c03efa0cee15a0c329d1 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 1 Oct 2024 22:12:32 -0400 Subject: [PATCH 13/13] Minor text updates. --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4f8a2f67aa51a..4527d047e4c07 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -521,7 +521,7 @@ jobs: run: taplo format --check config-docs-check: - name: check configs.md is up-to-date + name: check configs.md and ***_functions.md is up-to-date needs: [ linux-build-lib ] runs-on: ubuntu-latest container: @@ -542,7 +542,7 @@ jobs: # If you encounter an error, run './dev/update_config_docs.sh' and commit ./dev/update_config_docs.sh git diff --exit-code - - name: Check if any of the xyz_functions.md has been modified + - name: Check if any of the ***_functions.md has been modified run: | # If you encounter an error, run './dev/update_function_docs.sh' and commit ./dev/update_function_docs.sh