From de56ba200a35dec6a199980379eeb2087c4f38b0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Feb 2024 10:46:29 -0500 Subject: [PATCH 01/10] Clean up documentation for datafusion-substrait and datafusion-proto --- datafusion/proto/README.md | 121 ++---------------------- datafusion/proto/REAME-dev.md | 29 ++++++ datafusion/proto/examples/expr_serde.rs | 33 ------- datafusion/proto/src/lib.rs | 92 +++++++++++++++++- datafusion/substrait/README.md | 36 +++++-- 5 files changed, 157 insertions(+), 154 deletions(-) create mode 100644 datafusion/proto/REAME-dev.md delete mode 100644 datafusion/proto/examples/expr_serde.rs diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md index 8d25f193fa6b1..0b84810931327 100644 --- a/datafusion/proto/README.md +++ b/datafusion/proto/README.md @@ -17,121 +17,14 @@ under the License. --> -# Apache Arrow DataFusion Proto +# `datafusion-proto`: Apache Arrow DataFusion Proto Serialization / Deserialization -Apache Arrow [DataFusion][df] is an extensible query execution framework, -written in Rust, that uses Apache Arrow as its in-memory format. +This crate contains code to convert Apache Arrow [DataFusion] plans to and from +bytes, which can be useful for sending plans over the network, for example +when building a distributed query engine. -This crate provides support format for serializing and deserializing the -following structures to and from bytes: +See [API Docs] for details and examples. -1. [`LogicalPlan`]'s (including [`Expr`]), -2. [`ExecutionPlan`]s (including [`PhysiscalExpr`]) +[datafusion]: https://arrow.apache.org/datafusion +[API Docs]: http://docs.rs/datafusion-substrait/latest/datafusion-substrait -This format can be useful for sending plans over the network, for example when -building a distributed query engine. - -Internally, this crate is implemented by converting the plans to [protocol -buffers] using [prost]. - -[protocol buffers]: https://developers.google.com/protocol-buffers -[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html -[`expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/expr/enum.Expr.html -[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html -[`physiscalexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/trait.PhysicalExpr.html -[prost]: https://docs.rs/prost/latest/prost/ - -## See Also - -The binary format created by this crate supports the full range of DataFusion -plans, but is DataFusion specific. See [datafusion-substrait] which can encode -many DataFusion plans using the [substrait.io] standard. - -[datafusion-substrait]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait -[substrait.io]: https://substrait.io - -# Examples - -## Serializing Expressions - -Based on [examples/expr_serde.rs](examples/expr_serde.rs) - -```rust -use datafusion_common::Result; -use datafusion_expr::{col, lit, Expr}; -use datafusion_proto::bytes::Serializeable; - -fn main() -> Result<()> { - // Create a new `Expr` a < 32 - let expr = col("a").lt(lit(5i32)); - - // Convert it to an opaque form - let bytes = expr.to_bytes()?; - - // Decode bytes from somewhere (over network, etc.) - let decoded_expr = Expr::from_bytes(&bytes)?; - assert_eq!(expr, decoded_expr); - Ok(()) -} -``` - -## Serializing Logical Plans - -Based on [examples/logical_plan_serde.rs](examples/logical_plan_serde.rs) - -```rust -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) - .await - ?; - let plan = ctx.table("t1").await?.into_optimized_plan()?; - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); - Ok(()) -} -``` - -## Serializing Physical Plans - -Based on [examples/physical_plan_serde.rs](examples/physical_plan_serde.rs) - -```rust -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{physical_plan_from_bytes,physical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) - .await - ?; - let logical_plan = ctx.table("t1").await?.into_optimized_plan()?; - let physical_plan = ctx.create_physical_plan(&logical_plan).await?; - let bytes = physical_plan_to_bytes(physical_plan.clone())?; - let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip)); - Ok(()) -} - -``` - -## Generated Code - -The prost/tonic code can be generated by running, which in turn invokes the Rust binary located in [gen](./gen) - -This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a -valid installation of [protoc](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation). - -```bash -./regen.sh -``` - -[df]: https://crates.io/crates/datafusion diff --git a/datafusion/proto/REAME-dev.md b/datafusion/proto/REAME-dev.md new file mode 100644 index 0000000000000..faa21f16497f3 --- /dev/null +++ b/datafusion/proto/REAME-dev.md @@ -0,0 +1,29 @@ + + +## Generated Code + +The prost/tonic code can be generated by running, which in turn invokes the Rust binary located in [gen](./gen) + +This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a +valid installation of [protoc](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation). + +```bash +./regen.sh +``` diff --git a/datafusion/proto/examples/expr_serde.rs b/datafusion/proto/examples/expr_serde.rs deleted file mode 100644 index 9da64f87e2b1e..0000000000000 --- a/datafusion/proto/examples/expr_serde.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion_common::Result; -use datafusion_expr::{col, lit, Expr}; -use datafusion_proto::bytes::Serializeable; - -fn main() -> Result<()> { - // Create a new `Expr` a < 32 - let expr = col("a").lt(lit(5i32)); - - // Convert it to an opaque form - let bytes = expr.to_bytes()?; - - // Decode bytes from somewhere (over network, etc.) - let decoded_expr = Expr::from_bytes(&bytes)?; - assert_eq!(expr, decoded_expr); - Ok(()) -} diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 872c26408ebe4..b5ef9c35b680e 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -15,7 +15,97 @@ // specific language governing permissions and limitations // under the License. -//! Serde code for logical plans and expressions. +//! Serialization / Deserializing plans to bytes +//! +//! This crate provides support for serializing and deserializing the +//! following structures to and from bytes: +//! +//! 1. [`LogicalPlan`]'s (including [`Expr`]), +//! +//! 2. [`ExecutionPlan`]s (including [`PhysiscalExpr`]) +//! +//! +//! Internally, this crate is implemented by converting the plans to [protocol +//! buffers] using [prost]. +//! +//! [protocol buffers]: https://developers.google.com/protocol-buffers +//! [prost]: https://docs.rs/prost/latest/prost/ +//! +//! ## See Also +//! +//! The binary format created by this crate supports the full range of DataFusion +//! plans, but is DataFusion specific. See [datafusion-substrait] which can encode +//! many DataFusion plans using the [substrait.io] standard. +//! +//! [datafusion-substrait]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait +//! [substrait.io]: https://substrait.io +//! +//! # Examples +//! +//! ## Serializing Expressions (`Expr`) +//! ``` +//! use datafusion_common::Result; +//! use datafusion_expr::{col, lit, Expr}; +//! use datafusion_proto::bytes::Serializeable; +//! +//! // Create a new `Expr` a < 32 +//! let expr = col("a").lt(lit(5i32)); +//! +//! // Convert it to an opaque form +//! let bytes = expr.to_bytes()?; +//! +//! // Decode bytes from somewhere (over network, etc.) +//! let decoded_expr = Expr::from_bytes(&bytes)?; +//! assert_eq!(expr, decoded_expr); +//! Ok(()) +//! ``` +//! +//! ## Serializing [`LogicalPlan`]s + +Based on [examples/logical_plan_serde.rs](examples/logical_plan_serde.rs) + +```rust +use datafusion::prelude::*; +use datafusion_common::Result; +use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) + .await + ?; + let plan = ctx.table("t1").await?.into_optimized_plan()?; + let bytes = logical_plan_to_bytes(&plan)?; + let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; + assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); + Ok(()) +} +``` + +## Serializing Physical Plans + + +```rust +use datafusion::prelude::*; +use datafusion_common::Result; +use datafusion_proto::bytes::{physical_plan_from_bytes,physical_plan_to_bytes}; + +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) + .await + ?; + let logical_plan = ctx.table("t1").await?.into_optimized_plan()?; + let physical_plan = ctx.create_physical_plan(&logical_plan).await?; + let bytes = physical_plan_to_bytes(physical_plan.clone())?; + let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; + assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip)); + Ok(()) +} + +``` pub mod bytes; pub mod common; diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md index 17591cdd62326..339802024ac3d 100644 --- a/datafusion/substrait/README.md +++ b/datafusion/substrait/README.md @@ -17,18 +17,42 @@ under the License. --> -# DataFusion + Substrait +# Apache Arrow DataFusion Substrait + +This crate contains a [Substrait] producer and consumer for Apache Arrow +[DataFusion] plans. See [API Docs] for details and examples. + +[API Docs]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait + +[DataFusion] is an extensible query execution framework, +written in Rust, that uses Apache Arrow as its in-memory format. [Substrait](https://substrait.io/) provides a cross-language serialization format for relational algebra, based on protocol buffers. -This repository provides a Substrait producer and consumer for DataFusion: +This crate provides support format for serializing (the producer) and deserializing the +following structures to and from Substrait protobuf: -- The producer converts a DataFusion logical plan into a Substrait protobuf. -- The consumer converts a Substrait protobuf into a DataFusion logical plan. +1. [`LogicalPlan`]'s (including [`Expr`]), +2. [`ExecutionPlan`]s (including [`PhysiscalExpr`]) -Potential uses of this crate: +Potential uses of this crate: +- Build systems that run Substrait plans using DataFusion. +- Connect DataFusion based systems to the broader ecosystem - Replace the current [DataFusion protobuf definition](https://github.com/apache/arrow-datafusion/blob/main/datafusion/proto/proto/datafusion.proto) used in Ballista for passing query plan fragments to executors - Make it easier to pass query plans over FFI boundaries, such as from Python to Rust -- Allow Apache Calcite query plans to be executed in DataFusion +- Run plans created by Apache Calcite to run in DataFusion + +## See Also + +Substrait does not (yet) support the full range of plans and expressions than +that DataFusion offers. See [datafusion-proto] for a crate which can + + +[substrait]: https://substrait.io/ +[datafusion]: https://arrow.apache.org/datafusion +[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html +[`expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/expr/enum.Expr.html +[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html +[`physiscalexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/trait.PhysicalExpr.html From 260716cabb9d0ad13b722d92b9369e0d5d0f2ccc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Feb 2024 10:53:48 -0500 Subject: [PATCH 02/10] Port datafusion-proto examples to rustdocs --- .../proto/examples/logical_plan_serde.rs | 32 ----- .../proto/examples/physical_plan_serde.rs | 36 ----- datafusion/proto/src/lib.rs | 128 +++++++++--------- 3 files changed, 63 insertions(+), 133 deletions(-) delete mode 100644 datafusion/proto/examples/logical_plan_serde.rs delete mode 100644 datafusion/proto/examples/physical_plan_serde.rs diff --git a/datafusion/proto/examples/logical_plan_serde.rs b/datafusion/proto/examples/logical_plan_serde.rs deleted file mode 100644 index 9f468638c150d..0000000000000 --- a/datafusion/proto/examples/logical_plan_serde.rs +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "testdata/test.csv", CsvReadOptions::default()) - .await?; - let plan = ctx.table("t1").await?.into_optimized_plan()?; - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!(format!("{plan:?}"), format!("{logical_round_trip:?}")); - Ok(()) -} diff --git a/datafusion/proto/examples/physical_plan_serde.rs b/datafusion/proto/examples/physical_plan_serde.rs deleted file mode 100644 index 72e216074a16d..0000000000000 --- a/datafusion/proto/examples/physical_plan_serde.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{physical_plan_from_bytes, physical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "testdata/test.csv", CsvReadOptions::default()) - .await?; - let dataframe = ctx.table("t1").await?; - let physical_plan = dataframe.create_physical_plan().await?; - let bytes = physical_plan_to_bytes(physical_plan.clone())?; - let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!( - format!("{physical_plan:?}"), - format!("{physical_round_trip:?}") - ); - Ok(()) -} diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index b5ef9c35b680e..9365a64ced0e4 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -15,15 +15,18 @@ // specific language governing permissions and limitations // under the License. -//! Serialization / Deserializing plans to bytes +//! Serialize / Deserialize DataFusion Plans to bytes //! //! This crate provides support for serializing and deserializing the //! following structures to and from bytes: //! //! 1. [`LogicalPlan`]'s (including [`Expr`]), +//! 2. [`ExecutionPlan`]s (including [`PhysicalExpr`]) //! -//! 2. [`ExecutionPlan`]s (including [`PhysiscalExpr`]) -//! +//! [`LogicalPlan`]: datafusion_expr::LogicalPlan +//! [`Expr`]: datafusion_expr::Expr +//! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan +//! [`PhysicalExpr`]: datafusion::physical_expr::PhysicalExpr //! //! Internally, this crate is implemented by converting the plans to [protocol //! buffers] using [prost]. @@ -34,79 +37,74 @@ //! ## See Also //! //! The binary format created by this crate supports the full range of DataFusion -//! plans, but is DataFusion specific. See [datafusion-substrait] which can encode -//! many DataFusion plans using the [substrait.io] standard. +//! plans, but is DataFusion specific. See [datafusion-substrait] for a crate +//! which can encode many DataFusion plans using the [substrait.io] standard. //! //! [datafusion-substrait]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait //! [substrait.io]: https://substrait.io //! -//! # Examples +//! # Example: Serializing [`Expr`]s +//! ``` +//! # use datafusion_common::Result; +//! # use datafusion_expr::{col, lit, Expr}; +//! # use datafusion_proto::bytes::Serializeable; +//! # fn main() -> Result<()>{ +//! // Create a new `Expr` a < 32 +//! let expr = col("a").lt(lit(5i32)); +//! +//! // Convert it to bytes (for sending over the network, etc.) +//! let bytes = expr.to_bytes()?; //! -//! ## Serializing Expressions (`Expr`) +//! // Decode bytes from somewhere (over network, etc.) back to Expr +//! let decoded_expr = Expr::from_bytes(&bytes)?; +//! assert_eq!(expr, decoded_expr); +//! # Ok(()) +//! # } //! ``` -//! use datafusion_common::Result; -//! use datafusion_expr::{col, lit, Expr}; -//! use datafusion_proto::bytes::Serializeable; //! -//! // Create a new `Expr` a < 32 -//! let expr = col("a").lt(lit(5i32)); +//! # Example: Serializing [`LogicalPlan`]s +//! ``` +//! # use datafusion::prelude::*; +//! # use datafusion_common::Result; +//! # use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; +//! # #[tokio::main] +//! # async fn main() -> Result<()>{ +//! // Create a plan that scans table 't' +//! let ctx = SessionContext::new(); +//! ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()).await?; +//! let plan = ctx.table("t1").await?.into_optimized_plan()?; //! -//! // Convert it to an opaque form -//! let bytes = expr.to_bytes()?; +//! // Convert the plan into bytes (for sending over the network, etc.) +//! let bytes = logical_plan_to_bytes(&plan)?; //! -//! // Decode bytes from somewhere (over network, etc.) -//! let decoded_expr = Expr::from_bytes(&bytes)?; -//! assert_eq!(expr, decoded_expr); -//! Ok(()) +//! // Decode bytes from somewhere (over network, etc.) back to LogicalPlan +//! let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; +//! assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); +//! # Ok(()) +//! # } //! ``` +//! # Example: Serializing [`ExecutionPlan`]s //! -//! ## Serializing [`LogicalPlan`]s - -Based on [examples/logical_plan_serde.rs](examples/logical_plan_serde.rs) - -```rust -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{logical_plan_from_bytes, logical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) - .await - ?; - let plan = ctx.table("t1").await?.into_optimized_plan()?; - let bytes = logical_plan_to_bytes(&plan)?; - let logical_round_trip = logical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); - Ok(()) -} -``` - -## Serializing Physical Plans - - -```rust -use datafusion::prelude::*; -use datafusion_common::Result; -use datafusion_proto::bytes::{physical_plan_from_bytes,physical_plan_to_bytes}; - -#[tokio::main] -async fn main() -> Result<()> { - let ctx = SessionContext::new(); - ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()) - .await - ?; - let logical_plan = ctx.table("t1").await?.into_optimized_plan()?; - let physical_plan = ctx.create_physical_plan(&logical_plan).await?; - let bytes = physical_plan_to_bytes(physical_plan.clone())?; - let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; - assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip)); - Ok(()) -} - -``` - +//! ``` +//! # use datafusion::prelude::*; +//! # use datafusion_common::Result; +//! # use datafusion_proto::bytes::{physical_plan_from_bytes,physical_plan_to_bytes}; +//! # #[tokio::main] +//! # async fn main() -> Result<()>{ +//! // Create a plan that scans table 't' +//! let ctx = SessionContext::new(); +//! ctx.register_csv("t1", "tests/testdata/test.csv", CsvReadOptions::default()).await?; +//! let physical_plan = ctx.table("t1").await?.create_physical_plan().await?; +//! +//! // Convert the plan into bytes (for sending over the network, etc.) +//! let bytes = physical_plan_to_bytes(physical_plan.clone())?; +//! +//! // Decode bytes from somewhere (over network, etc.) back to ExecutionPlan +//! let physical_round_trip = physical_plan_from_bytes(&bytes, &ctx)?; +//! assert_eq!(format!("{:?}", physical_plan), format!("{:?}", physical_round_trip)); +//! # Ok(()) +//! # } +//! ``` pub mod bytes; pub mod common; pub mod generated; From 87af4b2f30e6d445966c49335db469acfe4ca8d4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 17 Feb 2024 11:20:17 -0500 Subject: [PATCH 03/10] Cleanup substrait docs and examples --- datafusion/proto/src/lib.rs | 2 +- datafusion/substrait/README.md | 33 +------------------- datafusion/substrait/src/lib.rs | 55 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 9365a64ced0e4..c2f3abc06e34c 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -34,7 +34,7 @@ //! [protocol buffers]: https://developers.google.com/protocol-buffers //! [prost]: https://docs.rs/prost/latest/prost/ //! -//! ## See Also +//! # See Also //! //! The binary format created by this crate supports the full range of DataFusion //! plans, but is DataFusion specific. See [datafusion-substrait] for a crate diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md index 339802024ac3d..ed51ae55c625a 100644 --- a/datafusion/substrait/README.md +++ b/datafusion/substrait/README.md @@ -22,37 +22,6 @@ This crate contains a [Substrait] producer and consumer for Apache Arrow [DataFusion] plans. See [API Docs] for details and examples. +[Substrait]: https://substrait.io [API Docs]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait -[DataFusion] is an extensible query execution framework, -written in Rust, that uses Apache Arrow as its in-memory format. - -[Substrait](https://substrait.io/) provides a cross-language serialization format for relational algebra, based on -protocol buffers. - -This crate provides support format for serializing (the producer) and deserializing the -following structures to and from Substrait protobuf: - -1. [`LogicalPlan`]'s (including [`Expr`]), -2. [`ExecutionPlan`]s (including [`PhysiscalExpr`]) - - -Potential uses of this crate: -- Build systems that run Substrait plans using DataFusion. -- Connect DataFusion based systems to the broader ecosystem -- Replace the current [DataFusion protobuf definition](https://github.com/apache/arrow-datafusion/blob/main/datafusion/proto/proto/datafusion.proto) used in Ballista for passing query plan fragments to executors -- Make it easier to pass query plans over FFI boundaries, such as from Python to Rust -- Run plans created by Apache Calcite to run in DataFusion - -## See Also - -Substrait does not (yet) support the full range of plans and expressions than -that DataFusion offers. See [datafusion-proto] for a crate which can - - -[substrait]: https://substrait.io/ -[datafusion]: https://arrow.apache.org/datafusion -[`logicalplan`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html -[`expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/expr/enum.Expr.html -[`executionplan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html -[`physiscalexpr`]: https://docs.rs/datafusion/latest/datafusion/physical_expr/trait.PhysicalExpr.html diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index 432553ec79037..f39379b03d2f3 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -15,6 +15,61 @@ // specific language governing permissions and limitations // under the License. +//! Serialize / Deserialize DataFusion Plans to [Substrait.io] +//! +//! This crate provides support for serializing and deserializing DataFusion plans +//! to and from the generated types in [substrait::proto] from the [substrait] crate. +//! +//! [Substrait.io] provides a cross-language serialization format for relational +//! algebra (e.g. query plans and expressions), based on protocol buffers. +//! +//! [Substrait.io]: https://substrait.io/ +//! +//! [`LogicalPlan`]: datafusion::logical_expr::LogicalPlan +//! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan +//! +//! Potential uses of this crate: +//! * Use DataFusion run Substrait plans created by other systems (e.g. Apache Calcite) +//! * Use DataFusion to create plans to run on other systems +//! * Pass query plans over FFI boundaries, such as from Python to Rust +//! +//! # See Also +//! +//! Substrait does not (yet) support the full range of plans and expressions +//! that DataFusion offers. See the [datafusion-proto] crate for a DataFusion +//! specific format that does support of the full range. +//! +//! [datafusion-proto]: https://docs.rs/datafusion-proto/latest/datafusion_proto +//! +//! Note that generated types such as [`substrait::proto::Plan`] and +//! [`substrait::proto::Rel`] can be serialized / deserialized to bytes, json and +//! other formats using [prost] and the rest of the rust protobuf ecosystem. +//! +//! # Example: Serializing [`LogicalPlan`]s +//! ``` +//! # use datafusion::prelude::*; +//! # use datafusion::error::Result; +//! # #[tokio::main(flavor = "current_thread")] +//! # async fn main() -> Result<()>{ +//! # use std::sync::Arc; +//! # use datafusion::arrow::array::{Int32Array, RecordBatch}; +//! # use datafusion_substrait::logical_plan; +//! // Create a plan that scans table 't' +//! let ctx = SessionContext::new(); +//! let batch = RecordBatch::try_from_iter(vec![("x", Arc::new(Int32Array::from(vec![42])) as _)])?; +//! ctx.register_batch("t", batch)?; +//! let df = ctx.sql("SELECT x from t").await?; +//! let plan = df.into_optimized_plan()?; +//! +//! // Convert the plan into a substrait (protobuf) Plan +//! let substrait_plan = logical_plan::producer::to_substrait_plan(&plan, &ctx)?; +//! +//! // Receive a substrait protobuf from somewhere, and turn it into a LogicalPlan +//! let logical_round_trip = logical_plan::consumer::from_substrait_plan(&ctx, &substrait_plan).await?; +//! assert_eq!(format!("{:?}", plan), format!("{:?}", logical_round_trip)); +//! # Ok(()) +//! # } +//! ``` pub mod logical_plan; pub mod physical_plan; pub mod serializer; From b83722f5168d33041dd4686606ae9398b4dda96e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 18 Feb 2024 03:05:56 -0500 Subject: [PATCH 04/10] prettier --- datafusion/proto/README.md | 5 ++--- datafusion/substrait/README.md | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md index 0b84810931327..727dbfb5dce91 100644 --- a/datafusion/proto/README.md +++ b/datafusion/proto/README.md @@ -21,10 +21,9 @@ This crate contains code to convert Apache Arrow [DataFusion] plans to and from bytes, which can be useful for sending plans over the network, for example -when building a distributed query engine. +when building a distributed query engine. See [API Docs] for details and examples. [datafusion]: https://arrow.apache.org/datafusion -[API Docs]: http://docs.rs/datafusion-substrait/latest/datafusion-substrait - +[api docs]: http://docs.rs/datafusion-substrait/latest/datafusion-substrait diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md index ed51ae55c625a..2f0b7f98cc9b5 100644 --- a/datafusion/substrait/README.md +++ b/datafusion/substrait/README.md @@ -22,6 +22,5 @@ This crate contains a [Substrait] producer and consumer for Apache Arrow [DataFusion] plans. See [API Docs] for details and examples. -[Substrait]: https://substrait.io -[API Docs]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait - +[substrait]: https://substrait.io +[api docs]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait From d06417c3179335cdb4545bd4829e3b2a2f633463 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:40:39 -0500 Subject: [PATCH 05/10] Apply suggestions from code review Co-authored-by: Jeffrey Vo Co-authored-by: Ruihang Xia --- datafusion/proto/README.md | 2 +- datafusion/proto/REAME-dev.md | 2 +- datafusion/substrait/src/lib.rs | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md index 727dbfb5dce91..0efd0c62dec68 100644 --- a/datafusion/proto/README.md +++ b/datafusion/proto/README.md @@ -17,7 +17,7 @@ under the License. --> -# `datafusion-proto`: Apache Arrow DataFusion Proto Serialization / Deserialization +# `datafusion-proto`: Apache Arrow DataFusion Protobuf Serialization / Deserialization This crate contains code to convert Apache Arrow [DataFusion] plans to and from bytes, which can be useful for sending plans over the network, for example diff --git a/datafusion/proto/REAME-dev.md b/datafusion/proto/REAME-dev.md index faa21f16497f3..2dd0701a75813 100644 --- a/datafusion/proto/REAME-dev.md +++ b/datafusion/proto/REAME-dev.md @@ -19,7 +19,7 @@ ## Generated Code -The prost/tonic code can be generated by running, which in turn invokes the Rust binary located in [gen](./gen) +The prost/tonic code can be generated by running `./regen.sh`, which in turn invokes the Rust binary located in [gen](./gen) This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a valid installation of [protoc](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation). diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index f39379b03d2f3..c40a178a8de81 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -17,8 +17,9 @@ //! Serialize / Deserialize DataFusion Plans to [Substrait.io] //! -//! This crate provides support for serializing and deserializing DataFusion plans -//! to and from the generated types in [substrait::proto] from the [substrait] crate. +//! This crate provides support for serializing and deserializing both DataFusion +//! [`LogicalPlan`] and [`ExecutionPlan`] to and from the generated types in +//! [substrait::proto] from the [substrait] crate. //! //! [Substrait.io] provides a cross-language serialization format for relational //! algebra (e.g. query plans and expressions), based on protocol buffers. @@ -29,7 +30,7 @@ //! [`ExecutionPlan`]: datafusion::physical_plan::ExecutionPlan //! //! Potential uses of this crate: -//! * Use DataFusion run Substrait plans created by other systems (e.g. Apache Calcite) +//! * Use DataFusion to run Substrait plans created by other systems (e.g. Apache Calcite) //! * Use DataFusion to create plans to run on other systems //! * Pass query plans over FFI boundaries, such as from Python to Rust //! @@ -42,8 +43,8 @@ //! [datafusion-proto]: https://docs.rs/datafusion-proto/latest/datafusion_proto //! //! Note that generated types such as [`substrait::proto::Plan`] and -//! [`substrait::proto::Rel`] can be serialized / deserialized to bytes, json and -//! other formats using [prost] and the rest of the rust protobuf ecosystem. +//! [`substrait::proto::Rel`] can be serialized / deserialized to bytes, JSON and +//! other formats using [prost] and the rest of the Rust protobuf ecosystem. //! //! # Example: Serializing [`LogicalPlan`]s //! ``` From f596397fea805d92cb5f18bc5c2d2e11f9967ae6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:42:39 -0500 Subject: [PATCH 06/10] Fix readme links --- datafusion/proto/README.md | 2 +- datafusion/substrait/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/proto/README.md b/datafusion/proto/README.md index 0efd0c62dec68..ca6ae7fc68f47 100644 --- a/datafusion/proto/README.md +++ b/datafusion/proto/README.md @@ -26,4 +26,4 @@ when building a distributed query engine. See [API Docs] for details and examples. [datafusion]: https://arrow.apache.org/datafusion -[api docs]: http://docs.rs/datafusion-substrait/latest/datafusion-substrait +[api docs]: http://docs.rs/datafusion-proto/latest diff --git a/datafusion/substrait/README.md b/datafusion/substrait/README.md index 2f0b7f98cc9b5..a9f2ba4c3c524 100644 --- a/datafusion/substrait/README.md +++ b/datafusion/substrait/README.md @@ -23,4 +23,4 @@ This crate contains a [Substrait] producer and consumer for Apache Arrow [DataFusion] plans. See [API Docs] for details and examples. [substrait]: https://substrait.io -[api docs]: https://docs.rs/datafusion-substrait/latest/datafusion_substrait +[api docs]: https://docs.rs/datafusion-substrait/latest From aa09db69f7504f671ea5b30e9b62f7e05e24702b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:45:30 -0500 Subject: [PATCH 07/10] Update protoc docs --- datafusion/proto/{REAME-dev.md => README-dev.md} | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) rename datafusion/proto/{REAME-dev.md => README-dev.md} (80%) diff --git a/datafusion/proto/REAME-dev.md b/datafusion/proto/README-dev.md similarity index 80% rename from datafusion/proto/REAME-dev.md rename to datafusion/proto/README-dev.md index 2dd0701a75813..a2b2965b50e48 100644 --- a/datafusion/proto/REAME-dev.md +++ b/datafusion/proto/README-dev.md @@ -22,8 +22,11 @@ The prost/tonic code can be generated by running `./regen.sh`, which in turn invokes the Rust binary located in [gen](./gen) This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a -valid installation of [protoc](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation). +valid installation of [protoc] (see [installation instructions] for details). ```bash ./regen.sh ``` + +[protoc]: https://github.com/protocolbuffers/protobuf#protocol-compiler-installation +[installation instructions]: https://arrow.apache.org/datafusion/contributor-guide/#protoc-installation \ No newline at end of file From 7d91abf8ec26402eb3e6618033cd8695640af6ef Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:49:14 -0500 Subject: [PATCH 08/10] Add compatibilty note --- datafusion/proto/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index c2f3abc06e34c..928e3dc35f596 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -34,6 +34,12 @@ //! [protocol buffers]: https://developers.google.com/protocol-buffers //! [prost]: https://docs.rs/prost/latest/prost/ //! +//! # Version Compatibility +//! +//! The serialized form are not guaranteed to be compatible across +//! DataFusion versions. A plan serialized with one version of DataFusion +//! may not be able to deserialized with a different version. +//! //! # See Also //! //! The binary format created by this crate supports the full range of DataFusion From 708c7cd1151dea6e762f77af7e5951a5504df86f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:53:36 -0500 Subject: [PATCH 09/10] add plan serialization use --- datafusion/proto/src/lib.rs | 2 +- datafusion/substrait/src/lib.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index 928e3dc35f596..5d60b9b57454e 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -38,7 +38,7 @@ //! //! The serialized form are not guaranteed to be compatible across //! DataFusion versions. A plan serialized with one version of DataFusion -//! may not be able to deserialized with a different version. +//! may not be able to deserialized with a different version. //! //! # See Also //! diff --git a/datafusion/substrait/src/lib.rs b/datafusion/substrait/src/lib.rs index c40a178a8de81..454f0e7b7cb99 100644 --- a/datafusion/substrait/src/lib.rs +++ b/datafusion/substrait/src/lib.rs @@ -33,6 +33,7 @@ //! * Use DataFusion to run Substrait plans created by other systems (e.g. Apache Calcite) //! * Use DataFusion to create plans to run on other systems //! * Pass query plans over FFI boundaries, such as from Python to Rust +//! * Pass query plans across node boundaries //! //! # See Also //! From 875e8031d9ff13defd0460d19cb795e27c483bbf Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Feb 2024 01:54:04 -0500 Subject: [PATCH 10/10] prettier --- datafusion/proto/README-dev.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/proto/README-dev.md b/datafusion/proto/README-dev.md index a2b2965b50e48..b793b47e76a6d 100644 --- a/datafusion/proto/README-dev.md +++ b/datafusion/proto/README-dev.md @@ -29,4 +29,4 @@ valid installation of [protoc] (see [installation instructions] for details). ``` [protoc]: https://github.com/protocolbuffers/protobuf#protocol-compiler-installation -[installation instructions]: https://arrow.apache.org/datafusion/contributor-guide/#protoc-installation \ No newline at end of file +[installation instructions]: https://arrow.apache.org/datafusion/contributor-guide/#protoc-installation