diff --git a/rust/arrow/src/json/writer.rs b/rust/arrow/src/json/writer.rs index dbb70cf897e..94305a1b1e8 100644 --- a/rust/arrow/src/json/writer.rs +++ b/rust/arrow/src/json/writer.rs @@ -329,7 +329,7 @@ fn set_column_for_json_rows( } /// Converts an arrow [`RecordBatch`] into a `Vec` of Serde JSON -/// [`serde_json::map::JsonMap`]s (objects) +/// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( batches: &[RecordBatch], ) -> Vec<JsonMap<String, Value>> { diff --git a/rust/datafusion/README.md b/rust/datafusion/README.md index 2b69b8a3dec..4dd0c3e3f7e 100644 --- a/rust/datafusion/README.md +++ b/rust/datafusion/README.md @@ -58,6 +58,69 @@ Here are some of the projects known to use DataFusion: (if you know of another project, please submit a PR to add a link!) +## Example Usage + +Run a SQL query against data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let mut ctx = ExecutionContext::new(); + let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + + let mut ctx = ExecutionContext::new(); + ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?; + + // create a plan to run a SQL query + let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?; + + // execute and print results + let results: Vec<RecordBatch> = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Use the DataFrame API to process data stored in a CSV: + +```rust +use datafusion::prelude::*; +use arrow::util::pretty::print_batches; +use arrow::record_batch::RecordBatch; + +#[tokio::main] +async fn main() -> datafusion::error::Result<()> { + // create the dataframe + let mut ctx = ExecutionContext::new(); + let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; + + let df = df.filter(col("a").lt_eq(col("b")))? + .aggregate(&[col("a")], &[min(col("b"))])? 
+ .limit(100)?; + + // execute and print results + let results: Vec<RecordBatch> = df.collect().await?; + print_batches(&results)?; + Ok(()) +} +``` + +Both of these examples will produce + +```text ++---+--------+ +| a | MIN(b) | ++---+--------+ +| 1 | 2      | ++---+--------+ +``` + + ## Using DataFusion as a library diff --git a/rust/datafusion/src/lib.rs b/rust/datafusion/src/lib.rs index f0fcc4f1d29..41c2491ecd8 100644 --- a/rust/datafusion/src/lib.rs +++ b/rust/datafusion/src/lib.rs @@ -31,7 +31,8 @@ //! as well as a query optimizer and execution engine capable of parallel execution //! against partitioned data sources (CSV and Parquet) using threads. //! -//! Below is an example of how to execute a query against a CSV using [`DataFrames`](dataframe::DataFrame): +//! Below is an example of how to execute a query against data stored +//! in a CSV file using a [`DataFrame`](dataframe::DataFrame): //! //! ```rust //! # use datafusion::prelude::*; @@ -52,6 +53,19 @@ //! //! // execute the plan //! let results: Vec<RecordBatch> = df.collect().await?; +//! +//! // format the results +//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! +//! let expected = vec![ +//! "+---+--------+", +//! "| a | MIN(b) |", +//! "+---+--------+", +//! "| 1 | 2      |", +//! "+---+--------+" +//! ]; +//! +//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected); //! # Ok(()) //! # } //! ``` @@ -74,6 +88,19 @@ //! //! // execute the plan //! let results: Vec<RecordBatch> = df.collect().await?; +//! +//! // format the results +//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?; +//! +//! let expected = vec![ +//! "+---+--------+", +//! "| a | MIN(b) |", +//! "+---+--------+", +//! "| 1 | 2      |", +//! "+---+--------+" +//! ]; +//! +//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected); //! # Ok(()) //! # } //! 
``` diff --git a/rust/datafusion/src/physical_plan/regex_expressions.rs b/rust/datafusion/src/physical_plan/regex_expressions.rs index 8df9a822f31..6482424e105 100644 --- a/rust/datafusion/src/physical_plan/regex_expressions.rs +++ b/rust/datafusion/src/physical_plan/regex_expressions.rs @@ -54,8 +54,9 @@ fn regex_replace_posix_groups(replacement: &str) -> String { .into_owned() } -/// Replaces substring(s) matching a POSIX regular expression -/// regexp_replace('Thomas', '.[mN]a.', 'M') = 'ThM' +/// Replaces substring(s) matching a POSIX regular expression. +/// +/// example: `regexp_replace('Thomas', '.[mN]a.', 'M') = 'ThM'` pub fn regexp_replace(args: &[ArrayRef]) -> Result<ArrayRef> { // creating Regex is expensive so create hashmap for memoization let mut patterns: HashMap<String, Regex> = HashMap::new();