From d6db347236a19389d5fe12d037ee69d85bf8b42d Mon Sep 17 00:00:00 2001 From: Miro Date: Tue, 12 Aug 2025 15:11:40 +0800 Subject: [PATCH 1/8] feat: Make parquet_encryption a non-default feature --- datafusion-cli/Cargo.toml | 1 + datafusion/common/src/config.rs | 4 ++++ datafusion/core/Cargo.toml | 1 - 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 3a0f658a57742..1daa851f96e37 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -47,6 +47,7 @@ datafusion = { workspace = true, features = [ "encoding_expressions", "nested_expressions", "parquet", + "parquet_encryption", "recursive_protection", "regex_expressions", "unicode_expressions", diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 939d13d9690e5..b2e9ba6571c97 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -671,6 +671,8 @@ config_namespace! { config_namespace! { /// Options for configuring Parquet Modular Encryption + /// + /// To use Parquet encryption, you must enable the `parquet_encryption` feature flag, as it is not activated by default. pub struct ParquetEncryptionOptions { /// Optional file decryption properties pub file_decryption: Option, default = None @@ -1880,6 +1882,8 @@ pub struct TableParquetOptions { /// ``` pub key_value_metadata: HashMap>, /// Options for configuring Parquet modular encryption + /// + /// To use Parquet encryption, you must enable the `parquet_encryption` feature flag, as it is not activated by default. 
/// See ConfigFileEncryptionProperties and ConfigFileDecryptionProperties in datafusion/common/src/config.rs /// These can be set via 'format.crypto', for example: /// ```sql diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 53a17c1b93c7a..84542af68dc58 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -62,7 +62,6 @@ default = [ "unicode_expressions", "compression", "parquet", - "parquet_encryption", "recursive_protection", ] encoding_expressions = ["datafusion-functions/encoding_expressions"] From 58c004bb4046e55e429e67627baa6a68c8466e2a Mon Sep 17 00:00:00 2001 From: Miro Date: Tue, 12 Aug 2025 19:36:21 +0800 Subject: [PATCH 2/8] fix: enable parquet_encryption for sqllogictest --- datafusion/sqllogictest/Cargo.toml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index b9aef40777b71..cd90103c3e6ba 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -43,7 +43,10 @@ bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.5.43", features = ["derive", "env"] } -datafusion = { workspace = true, default-features = true, features = ["avro"] } +datafusion = { workspace = true, default-features = true, features = [ + "avro", + "parquet_encryption", +] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } futures = { workspace = true } @@ -53,7 +56,10 @@ itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } postgres-protocol = { version = "0.6.7", optional = true } -postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true } +postgres-types = { version = "0.2.8", features = [ + "derive", + "with-chrono-0_4", +], optional = true } 
rust_decimal = { version = "1.37.2", features = ["tokio-pg"] } # When updating the following dependency verify that sqlite test file regeneration works correctly # by running the regenerate_sqlite_files.sh script. @@ -61,7 +67,9 @@ sqllogictest = "0.28.3" sqlparser = { workspace = true } tempfile = { workspace = true } testcontainers = { workspace = true, optional = true } -testcontainers-modules = { workspace = true, features = ["postgres"], optional = true } +testcontainers-modules = { workspace = true, features = [ + "postgres", +], optional = true } thiserror = "2.0.12" tokio = { workspace = true } tokio-postgres = { version = "0.7.12", optional = true } From 346700902ef2bd787dd0e997c8ae952fb0a3e5d7 Mon Sep 17 00:00:00 2001 From: Miro Date: Tue, 12 Aug 2025 21:18:14 +0800 Subject: [PATCH 3/8] fix: enable parquet_encryption for datafusion-examples --- datafusion-examples/Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 409fc12bcbc5b..0095148586014 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -66,7 +66,9 @@ bytes = { workspace = true } dashmap = { workspace = true } # note only use main datafusion crate for examples base64 = "0.22.1" -datafusion = { workspace = true, default-features = true } +datafusion = { workspace = true, default-features = true, features = [ + "parquet_encryption", +] } datafusion-ffi = { workspace = true } datafusion-proto = { workspace = true } env_logger = { workspace = true } From 4e13422f602524f36855bced9174054db2fbe763 Mon Sep 17 00:00:00 2001 From: Miro Date: Tue, 12 Aug 2025 21:50:58 +0800 Subject: [PATCH 4/8] fmt --- datafusion-examples/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 0095148586014..7d0cc091777a5 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ 
-67,7 +67,7 @@ dashmap = { workspace = true } # note only use main datafusion crate for examples base64 = "0.22.1" datafusion = { workspace = true, default-features = true, features = [ - "parquet_encryption", + "parquet_encryption", ] } datafusion-ffi = { workspace = true } datafusion-proto = { workspace = true } From 3848234d89d4f04225e356e4a00e33772891ec28 Mon Sep 17 00:00:00 2001 From: Miro Date: Thu, 14 Aug 2025 23:22:35 +0800 Subject: [PATCH 5/8] fix: update features list in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8c92d0780fa50..9c3f6190c0966 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,6 @@ Default features: - `datetime_expressions`: date and time functions such as `to_timestamp` - `encoding_expressions`: `encode` and `decode` functions - `parquet`: support for reading the [Apache Parquet] format -- `parquet_encryption`: support for using [Parquet Modular Encryption] - `regex_expressions`: regular expression functions, such as `regexp_match` - `unicode_expressions`: Include unicode aware functions such as `character_length` - `unparser`: enables support to reverse LogicalPlans back into SQL @@ -128,6 +127,7 @@ Optional features: - `avro`: support for reading the [Apache Avro] format - `backtrace`: include backtrace information in error messages +- `parquet_encryption`: support for using [Parquet Modular Encryption] - `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature From ab269f15ca8e80e2c8dd3d1f610fa32afee64665 Mon Sep 17 00:00:00 2001 From: Miro Date: Fri, 15 Aug 2025 23:26:51 +0800 Subject: [PATCH 6/8] update: add parquet_encryption checks to CI workflow --- .github/workflows/rust.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 2ec1ee7c0f5da..8754b12801adb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ 
-220,6 +220,8 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=unicode_expressions - name: Check parquet encryption (parquet_encryption) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet_encryption + - name: Check parquet encryption (parquet, parquet_encryption) + run: cargo check --profile ci --no-default-features -p datafusion --features=parquet,parquet_encryption # Check datafusion-functions crate features # @@ -294,7 +296,7 @@ jobs: --lib \ --tests \ --bins \ - --features serde,avro,json,backtrace,integration-tests + --features serde,avro,json,backtrace,integration-tests,parquet_encryption - name: Verify Working Directory Clean run: git diff --exit-code From 655d17fe396c92985de4b5f8b0e278b3af07490f Mon Sep 17 00:00:00 2001 From: Miro Date: Mon, 18 Aug 2025 12:45:05 +0800 Subject: [PATCH 7/8] update parquet_encryption dependencies in Cargo.toml and CI workflow --- .github/workflows/rust.yml | 2 -- datafusion/core/Cargo.toml | 35 ++++++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8754b12801adb..fd86424476f01 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -220,8 +220,6 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=unicode_expressions - name: Check parquet encryption (parquet_encryption) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet_encryption - - name: Check parquet encryption (parquet, parquet_encryption) - run: cargo check --profile ci --no-default-features -p datafusion --features=parquet,parquet_encryption # Check datafusion-functions crate features # diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 63959964af015..f963592238566 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -19,7 +19,13 @@ name = "datafusion" description 
= "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" keywords = ["arrow", "query", "sql"] -include = ["benches/*.rs", "src/**/*.rs", "Cargo.toml", "LICENSE.txt", "NOTICE.txt"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", + "LICENSE.txt", + "NOTICE.txt", +] readme = "../../README.md" version = { workspace = true } edition = { workspace = true } @@ -66,20 +72,25 @@ default = [ ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) -force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"] +force_hash_collisions = [ + "datafusion-physical-plan/force_hash_collisions", + "datafusion-common/force_hash_collisions", +] math_expressions = ["datafusion-functions/math_expressions"] -parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"] -parquet_encryption = [ +parquet = [ + "datafusion-common/parquet", "dep:parquet", + "datafusion-datasource-parquet", +] +parquet_encryption = [ + "parquet", "parquet/encryption", "datafusion-common/parquet_encryption", "datafusion-datasource-parquet/parquet_encryption", "dep:hex", ] pyarrow = ["datafusion-common/pyarrow", "parquet"] -regex_expressions = [ - "datafusion-functions/regex_expressions", -] +regex_expressions = ["datafusion-functions/regex_expressions"] recursive_protection = [ "datafusion-common/recursive_protection", "datafusion-expr/recursive_protection", @@ -144,7 +155,9 @@ parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } rand = { workspace = true } regex = { workspace = true } -serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } +serde = { version = "1.0", default-features = false, features = [ + "derive", +], optional = true } sqlparser = { workspace = true } tempfile = { 
workspace = true } tokio = { workspace = true } @@ -174,7 +187,11 @@ rstest = { workspace = true } serde_json = { workspace = true } sysinfo = "0.37.0" test-utils = { path = "../../test-utils" } -tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } +tokio = { workspace = true, features = [ + "rt-multi-thread", + "parking_lot", + "fs", +] } [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.30.1", features = ["fs"] } From 161f7af90f40337a532a7b10b9d7eb09d865cc79 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 22 Aug 2025 15:22:01 -0400 Subject: [PATCH 8/8] revert unnecessary cargo changes --- datafusion-examples/Cargo.toml | 4 +--- datafusion/core/Cargo.toml | 33 ++++++++---------------------- datafusion/sqllogictest/Cargo.toml | 10 ++------- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index f0898642293f7..d847db6165b68 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -66,9 +66,7 @@ bytes = { workspace = true } dashmap = { workspace = true } # note only use main datafusion crate for examples base64 = "0.22.1" -datafusion = { workspace = true, default-features = true, features = [ - "parquet_encryption", -] } +datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] } datafusion-ffi = { workspace = true } datafusion-physical-expr-adapter = { workspace = true } datafusion-proto = { workspace = true } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index f963592238566..b87a3c93d8757 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -19,13 +19,7 @@ name = "datafusion" description = "DataFusion is an in-memory query engine that uses Apache Arrow as the memory model" keywords = ["arrow", "query", "sql"] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", - "LICENSE.txt", - "NOTICE.txt", -] +include =
["benches/*.rs", "src/**/*.rs", "Cargo.toml", "LICENSE.txt", "NOTICE.txt"] readme = "../../README.md" version = { workspace = true } edition = { workspace = true } @@ -72,16 +66,9 @@ default = [ ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) -force_hash_collisions = [ - "datafusion-physical-plan/force_hash_collisions", - "datafusion-common/force_hash_collisions", -] +force_hash_collisions = ["datafusion-physical-plan/force_hash_collisions", "datafusion-common/force_hash_collisions"] math_expressions = ["datafusion-functions/math_expressions"] -parquet = [ - "datafusion-common/parquet", - "dep:parquet", - "datafusion-datasource-parquet", -] +parquet = ["datafusion-common/parquet", "dep:parquet", "datafusion-datasource-parquet"] parquet_encryption = [ "parquet", "parquet/encryption", @@ -90,7 +77,9 @@ parquet_encryption = [ "dep:hex", ] pyarrow = ["datafusion-common/pyarrow", "parquet"] -regex_expressions = ["datafusion-functions/regex_expressions"] +regex_expressions = [ + "datafusion-functions/regex_expressions", +] recursive_protection = [ "datafusion-common/recursive_protection", "datafusion-expr/recursive_protection", @@ -155,9 +144,7 @@ parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } rand = { workspace = true } regex = { workspace = true } -serde = { version = "1.0", default-features = false, features = [ - "derive", -], optional = true } +serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } sqlparser = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } @@ -187,11 +174,7 @@ rstest = { workspace = true } serde_json = { workspace = true } sysinfo = "0.37.0" test-utils = { path = "../../test-utils" } -tokio = { workspace = true, features = [ - "rt-multi-thread", - "parking_lot", - "fs", -] } +tokio = { workspace = true, 
features = ["rt-multi-thread", "parking_lot", "fs"] } [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.30.1", features = ["fs"] } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index ba4e9e7a62b09..1e38c147e0e18 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -43,10 +43,7 @@ bigdecimal = { workspace = true } bytes = { workspace = true, optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.5.44", features = ["derive", "env"] } -datafusion = { workspace = true, default-features = true, features = [ - "avro", - "parquet_encryption", -] } +datafusion = { workspace = true, default-features = true, features = ["avro", "parquet_encryption"] } datafusion-spark = { workspace = true, default-features = true } datafusion-substrait = { workspace = true, default-features = true } futures = { workspace = true } @@ -56,10 +53,7 @@ itertools = { workspace = true } log = { workspace = true } object_store = { workspace = true } postgres-protocol = { version = "0.6.7", optional = true } -postgres-types = { version = "0.2.8", features = [ - "derive", - "with-chrono-0_4", -], optional = true } +postgres-types = { version = "0.2.8", features = ["derive", "with-chrono-0_4"], optional = true } rust_decimal = { version = "1.37.2", features = ["tokio-pg"] } # When updating the following dependency verify that sqlite test file regeneration works correctly # by running the regenerate_sqlite_files.sh script.