-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Expose parquet reader settings using normal DataFusion ConfigOptions
#3822
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,8 +21,10 @@ use arrow::datatypes::DataType; | |
| use datafusion_common::ScalarValue; | ||
| use itertools::Itertools; | ||
| use log::warn; | ||
| use parking_lot::RwLock; | ||
| use std::collections::HashMap; | ||
| use std::env; | ||
| use std::sync::Arc; | ||
|
|
||
| /// Configuration option "datafusion.optimizer.filter_null_join_keys" | ||
| pub const OPT_FILTER_NULL_JOIN_KEYS: &str = "datafusion.optimizer.filter_null_join_keys"; | ||
|
|
@@ -43,13 +45,25 @@ pub const OPT_COALESCE_BATCHES: &str = "datafusion.execution.coalesce_batches"; | |
| pub const OPT_COALESCE_TARGET_BATCH_SIZE: &str = | ||
| "datafusion.execution.coalesce_target_batch_size"; | ||
|
|
||
| /// Configuration option "datafusion.execution.time_zone" | ||
| pub const OPT_TIME_ZONE: &str = "datafusion.execution.time_zone"; | ||
|
|
||
| /// Configuration option "datafusion.execution.parquet.pushdown_filters" | ||
| pub const OPT_PARQUET_PUSHDOWN_FILTERS: &str = | ||
| "datafusion.execution.parquet.pushdown_filters"; | ||
|
|
||
| /// Configuration option "datafusion.execution.parquet.reorder_filters" | ||
| pub const OPT_PARQUET_REORDER_FILTERS: &str = | ||
| "datafusion.execution.parquet.reorder_filters"; | ||
|
|
||
| /// Configuration option "datafusion.execution.parquet.enable_page_index" | ||
| pub const OPT_PARQUET_ENABLE_PAGE_INDEX: &str = | ||
| "datafusion.execution.parquet.enable_page_index"; | ||
|
|
||
| /// Configuration option "datafusion.optimizer.skip_failed_rules" | ||
| pub const OPT_OPTIMIZER_SKIP_FAILED_RULES: &str = | ||
| "datafusion.optimizer.skip_failed_rules"; | ||
|
|
||
| /// Configuration option "datafusion.execution.time_zone" | ||
| pub const OPT_TIME_ZONE: &str = "datafusion.execution.time_zone"; | ||
|
|
||
| /// Definition of a configuration option | ||
| pub struct ConfigDefinition { | ||
| /// key used to identifier this configuration option | ||
|
|
@@ -173,11 +187,11 @@ impl BuiltInConfigs { | |
| false, | ||
| ), | ||
| ConfigDefinition::new_u64( | ||
| OPT_BATCH_SIZE, | ||
| "Default batch size while creating new batches, it's especially useful for \ | ||
buffer-in-memory batches since creating tiny batches would result in too much metadata \ | ||
| memory consumption.", | ||
| 8192, | ||
| OPT_BATCH_SIZE, | ||
| "Default batch size while creating new batches, it's especially useful for \ | ||
buffer-in-memory batches since creating tiny batches would result in too much metadata \ | ||
| memory consumption.", | ||
| 8192, | ||
| ), | ||
| ConfigDefinition::new_bool( | ||
| OPT_COALESCE_BATCHES, | ||
|
|
@@ -191,23 +205,43 @@ impl BuiltInConfigs { | |
| ConfigDefinition::new_u64( | ||
| OPT_COALESCE_TARGET_BATCH_SIZE, | ||
| format!("Target batch size when coalescing batches. Uses in conjunction with the \ | ||
| configuration setting '{}'.", OPT_COALESCE_BATCHES), | ||
| configuration setting '{}'.", OPT_COALESCE_BATCHES), | ||
| 4096, | ||
| ), | ||
| ConfigDefinition::new_string( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved this to be with the other settings |
||
| OPT_TIME_ZONE, | ||
| "The session time zone which some functions require \ | ||
| e.g. EXTRACT(HOUR from SOME_TIME) shifts the underlying datetime according to the time zone, | ||
| then extract the hour.", | ||
| "UTC".into() | ||
| ), | ||
| ConfigDefinition::new_bool( | ||
| OPT_PARQUET_PUSHDOWN_FILTERS, | ||
| "If true, filter expressions are applied during the parquet decoding operation to \ | ||
| reduce the number of rows decoded.", | ||
| false, | ||
| ), | ||
| ConfigDefinition::new_bool( | ||
| OPT_PARQUET_REORDER_FILTERS, | ||
| "If true, filter expressions evaluated during the parquet decoding operation \ | ||
| will be reordered heuristically to minimize the cost of evaluation. If false, \ | ||
| the filters are applied in the same order as written in the query.", | ||
| false, | ||
| ), | ||
| ConfigDefinition::new_bool( | ||
| OPT_PARQUET_ENABLE_PAGE_INDEX, | ||
| "If true, uses parquet data page level metadata (Page Index) statistics \ | ||
| to reduce the number of rows decoded.", | ||
| false, | ||
| ), | ||
| ConfigDefinition::new_bool( | ||
| OPT_OPTIMIZER_SKIP_FAILED_RULES, | ||
| "When set to true, the logical plan optimizer will produce warning \ | ||
| messages if any optimization rules produce errors and then proceed to the next \ | ||
| rule. When set to false, any rules that produce errors will cause the query to fail.", | ||
| true | ||
| ), | ||
| ConfigDefinition::new_string( | ||
| OPT_TIME_ZONE, | ||
| "The session time zone which some functions require \ | ||
| e.g. EXTRACT(HOUR from SOME_TIME) shifts the underlying datetime according to the time zone, | ||
| then extract the hour", | ||
| "UTC".into() | ||
| )] | ||
| ] | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -255,8 +289,16 @@ impl ConfigOptions { | |
| Self { options } | ||
| } | ||
|
|
||
| /// Create new ConfigOptions struct, taking values from environment variables where possible. | ||
| /// For example, setting `DATAFUSION_EXECUTION_BATCH_SIZE` to control `datafusion.execution.batch_size`. | ||
| /// Create a new [`ConfigOptions`] wrapped in an RwLock and Arc | ||
| pub fn into_shareable(self) -> Arc<RwLock<Self>> { | ||
| Arc::new(RwLock::new(self)) | ||
| } | ||
|
|
||
| /// Create new ConfigOptions struct, taking values from | ||
| /// environment variables where possible. | ||
| /// | ||
| /// For example, setting `DATAFUSION_EXECUTION_BATCH_SIZE` will | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will add some documentation about this to the datafusion-cli docs as I couldn't find it when I was looking
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| /// control `datafusion.execution.batch_size`. | ||
| pub fn from_env() -> Self { | ||
| let built_in = BuiltInConfigs::new(); | ||
| let mut options = HashMap::with_capacity(built_in.config_definitions.len()); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,12 @@ use crate::datasource::{ | |
| listing::ListingOptions, | ||
| }; | ||
|
|
||
| /// CSV file read option | ||
| /// Options that control the reading of CSV files. | ||
| /// | ||
| /// Note this structure is supplied when a datasource is created and | ||
| /// can not vary from statement to statement. For settings that | ||
| /// can vary statement to statement see | ||
| /// [`ConfigOptions`](crate::config::ConfigOptions). | ||
| #[derive(Clone)] | ||
| pub struct CsvReadOptions<'a> { | ||
| /// Does the CSV file have a header? | ||
|
|
@@ -150,7 +155,12 @@ impl<'a> CsvReadOptions<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Parquet read options | ||
| /// Options that control the reading of Parquet files. | ||
| /// | ||
| /// Note this structure is supplied when a datasource is created and | ||
| /// can not vary from statement to statement. For settings that | ||
| /// can vary statement to statement see | ||
| /// [`ConfigOptions`](crate::config::ConfigOptions). | ||
| #[derive(Clone)] | ||
| pub struct ParquetReadOptions<'a> { | ||
| /// File extension; only files with this extension are selected for data input. | ||
|
|
@@ -160,10 +170,12 @@ pub struct ParquetReadOptions<'a> { | |
| pub table_partition_cols: Vec<String>, | ||
| /// Should DataFusion parquet reader use the predicate to prune data, | ||
| /// overridden by value on execution::context::SessionConfig | ||
| // TODO move this into ConfigOptions | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will do this as a follow on PR |
||
| pub parquet_pruning: bool, | ||
| /// Tell the parquet reader to skip any metadata that may be in | ||
| /// the file Schema. This can help avoid schema conflicts due to | ||
| /// metadata. Defaults to true. | ||
| // TODO move this into ConfigOptions | ||
| pub skip_metadata: bool, | ||
| } | ||
|
|
||
|
|
@@ -217,7 +229,12 @@ impl<'a> ParquetReadOptions<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Avro read options | ||
| /// Options that control the reading of AVRO files. | ||
| /// | ||
| /// Note this structure is supplied when a datasource is created and | ||
| /// can not vary from statement to statement. For settings that | ||
| /// can vary statement to statement see | ||
| /// [`ConfigOptions`](crate::config::ConfigOptions). | ||
| #[derive(Clone)] | ||
| pub struct AvroReadOptions<'a> { | ||
| /// The data source schema. | ||
|
|
@@ -261,7 +278,12 @@ impl<'a> AvroReadOptions<'a> { | |
| } | ||
| } | ||
|
|
||
| /// Line-delimited JSON read options | ||
| /// Options that control the reading of Line-delimited JSON files (NDJson) | ||
| /// | ||
| /// Note this structure is supplied when a datasource is created and | ||
| /// can not vary from statement to statement. For settings that | ||
| /// can vary statement to statement see | ||
| /// [`ConfigOptions`](crate::config::ConfigOptions). | ||
| #[derive(Clone)] | ||
| pub struct NdJsonReadOptions<'a> { | ||
| /// The data source schema. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this was replicated for the benchmark code as I felt such a struct was the easiest to understand for this matrix strategy