diff --git a/Cargo.toml b/Cargo.toml index cc94b4292a50d..acdd1ae5ba750 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -177,3 +177,18 @@ large_futures = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_qualifications = "deny" + +# Temp patch to main of arrow-rs +[patch.crates-io] +arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-cast = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-data = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-ipc = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-select = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } \ No newline at end of file diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9552542befd85..4968cff4bab79 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -175,8 +175,7 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-arith", "arrow-array", @@ -196,23 +195,20 @@ dependencies = [ [[package]] name = "arrow-arith" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", "num", ] [[package]] name = "arrow-array" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "ahash", "arrow-buffer", @@ -228,8 +224,7 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "bytes", "half", @@ -239,8 +234,7 @@ dependencies = [ [[package]] name = "arrow-cast" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", @@ -260,27 +254,22 @@ dependencies = [ [[package]] name = "arrow-csv" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-buffer", "arrow-schema", @@ -291,12 +280,10 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", "flatbuffers", @@ -306,8 +293,7 @@ dependencies = [ [[package]] name = "arrow-json" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", @@ -326,25 +312,20 @@ dependencies = [ [[package]] name = "arrow-ord" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half", - "num", ] [[package]] name = "arrow-row" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -355,14 +336,12 @@ dependencies = [ [[package]] name = "arrow-schema" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" [[package]] name = "arrow-select" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "ahash", "arrow-array", @@ -375,8 +354,7 @@ dependencies = [ [[package]] name = "arrow-string" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "arrow-array", "arrow-buffer", @@ -614,9 +592,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +checksum = "8aa8ff1492fd9fb99ae28e8467af0dbbb7c31512b16fabf1a0f10d7bb6ef78bb" dependencies = [ "futures-util", "pin-project-lite", @@ -673,9 +651,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.4" +version = "1.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" +checksum = "431a10d0e07e09091284ef04453dae4069283aa108d209974d67e77ae1caa658" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -717,9 +695,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.9" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +checksum = "8ecbf4d5dfb169812e2b240a4350f15ad3c6b03a54074e5712818801615f2dc5" dependencies = [ "base64-simd", "bytes", @@ -2648,9 +2626,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] @@ -2873,8 +2851,7 @@ dependencies = [ [[package]] name = "parquet" version = "53.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" dependencies = [ "ahash", "arrow-array", @@ -4724,3 +4701,8 @@ dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow-flight" +version = "53.3.0" +source = "git+https://github.com/apache/arrow-rs.git?rev=54dccadccc6b599b93a46aef5f03dfd4d9b7349e#54dccadccc6b599b93a46aef5f03dfd4d9b7349e" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 4cdc2120a0298..691c75c26ef3d 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -69,3 +69,18 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.22" + +# Temp patch to main of arrow-rs +[patch.crates-io] +arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-cast = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-data = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-ipc = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-select = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } +parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "78994df5316106b4db2601ba4f641359b2b8084c" } \ No newline at end of file diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 82909404e455f..d1d4394fad5a7 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -60,7 +60,7 @@ libc = "0.2.140" object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.22.0", optional = true } +pyo3 = { version = "0.23.3", optional = true } recursive = { workspace = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3a07a238a4c96..5d3f2e6a84fa1 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -446,10 +446,6 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option, default = Some("page".into()) - /// (writing) Sets max statistics size for any column. If NULL, uses - /// default parquet writer setting - pub max_statistics_size: Option, default = Some(4096) - /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. @@ -1621,10 +1617,6 @@ config_namespace_with_hashmap! { /// Sets bloom filter number of distinct values. If NULL, uses /// default parquet options pub bloom_filter_ndv: Option, default = None - - /// Sets max statistics size for the column path. If NULL, uses - /// default parquet options - pub max_statistics_size: Option, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index dd9d67d6bb47f..fab50750eb41f 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -26,7 +26,7 @@ use parquet::{ basic::{BrotliLevel, GzipLevel, ZstdLevel}, file::properties::{ EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, }, format::KeyValue, schema::types::ColumnPath, @@ -129,11 +129,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder = builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - - if let Some(max_statistics_size) = options.max_statistics_size { - builder = - builder.set_column_max_statistics_size(path, max_statistics_size); - } } Ok(builder) @@ -154,7 +149,6 @@ impl ParquetOptions { dictionary_enabled, dictionary_page_size_limit, statistics_enabled, - max_statistics_size, max_row_group_size, created_by, column_index_truncate_length, @@ -190,9 +184,6 @@ impl ParquetOptions { .and_then(|s| parse_statistics_string(s).ok()) .unwrap_or(DEFAULT_STATISTICS_ENABLED), ) - .set_max_statistics_size( - max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE), - ) .set_max_row_group_size(*max_row_group_size) .set_created_by(created_by.clone()) .set_column_index_truncate_length(*column_index_truncate_length) @@ -395,7 +386,6 @@ mod tests { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), statistics_enabled: Some("none".into()), - max_statistics_size: Some(72), encoding: Some("RLE".into()), bloom_filter_enabled: Some(true), bloom_filter_fpp: Some(0.72), @@ -419,7 +409,6 @@ mod tests { dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, statistics_enabled: Some("chunk".into()), - max_statistics_size: Some(42), max_row_group_size: 42, created_by: "wordy".into(), column_index_truncate_length: Some(42), @@ -473,7 +462,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv), - max_statistics_size: Some(props.max_statistics_size(&col)), } } @@ -523,7 +511,6 @@ mod tests { compression: default_col_props.compression, dictionary_enabled: default_col_props.dictionary_enabled, statistics_enabled: default_col_props.statistics_enabled, - max_statistics_size: default_col_props.max_statistics_size, bloom_filter_on_write: default_col_props .bloom_filter_enabled .unwrap_or_default(), diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs index bdcf831c7884b..60dde7861104a 100644 --- a/datafusion/common/src/pyarrow.rs +++ b/datafusion/common/src/pyarrow.rs @@ -23,7 +23,7 @@ use arrow_array::Array; use pyo3::exceptions::PyException; use pyo3::prelude::PyErr; use pyo3::types::{PyAnyMethods, PyList}; -use pyo3::{Bound, FromPyObject, IntoPy, PyAny, PyObject, PyResult, Python}; +use pyo3::{Bound, FromPyObject, IntoPyObject, PyAny, PyObject, PyResult, Python}; use crate::{DataFusionError, ScalarValue}; @@ -40,8 +40,8 @@ impl FromPyArrow for ScalarValue { let val = value.call_method0("as_py")?; // construct pyarrow array from the python value and pyarrow type - let factory = py.import_bound("pyarrow")?.getattr("array")?; - let args = PyList::new_bound(py, [val]); + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [val])?; let array = factory.call1((args, typ))?; // convert the pyarrow array to rust array using C data interface @@ -69,14 +69,25 @@ impl<'source> FromPyObject<'source> for ScalarValue { } } -impl IntoPy for ScalarValue { - fn into_py(self, py: Python) -> PyObject { - self.to_pyarrow(py).unwrap() +impl<'source> IntoPyObject<'source> for ScalarValue { + type Target = PyAny; + + type Output = Bound<'source, Self::Target>; + + type Error = PyErr; + + fn into_pyobject(self, py: Python<'source>) -> Result { + let array = self.to_array()?; + // convert to pyarrow array using C data interface + let pyarray = array.to_data().to_pyarrow(py)?; + let pyarray_bound = pyarray.bind(py); + pyarray_bound.call_method1("__getitem__", (0,)) } } #[cfg(test)] mod tests { + use pyo3::ffi::c_str; use pyo3::prepare_freethreaded_python; use pyo3::py_run; use pyo3::types::PyDict; @@ -86,10 +97,12 @@ mod tests { fn init_python() { prepare_freethreaded_python(); Python::with_gil(|py| { - if py.run_bound("import pyarrow", None, None).is_err() { - let locals = PyDict::new_bound(py); - py.run_bound( - "import sys; executable = sys.executable; python_path = sys.path", + if py.run(c_str!("import pyarrow"), None, None).is_err() { + let locals = PyDict::new(py); + py.run( + c_str!( + "import sys; executable = sys.executable; python_path = sys.path" + ), None, Some(&locals), ) @@ -135,17 +148,25 @@ mod tests { } #[test] - fn test_py_scalar() { + fn test_py_scalar() -> PyResult<()> { init_python(); - Python::with_gil(|py| { + Python::with_gil(|py| -> PyResult<()> { let scalar_float = ScalarValue::Float64(Some(12.34)); - let py_float = scalar_float.into_py(py).call_method0(py, "as_py").unwrap(); + let py_float = scalar_float + .into_pyobject(py)? + .call_method0("as_py") + .unwrap(); py_run!(py, py_float, "assert py_float == 12.34"); let scalar_string = ScalarValue::Utf8(Some("Hello!".to_string())); - let py_string = scalar_string.into_py(py).call_method0(py, "as_py").unwrap(); + let py_string = scalar_string + .into_pyobject(py)? + .call_method0("as_py") + .unwrap(); py_run!(py, py_string, "assert py_string == 'Hello!'"); - }); + + Ok(()) + }) } } diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 1d08de172273f..ff21d7a7dc139 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -70,9 +70,7 @@ use parquet::arrow::arrow_writer::{ compute_leaves, get_column_writers, ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn, }; -use parquet::arrow::{ - arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, -}; +use parquet::arrow::{parquet_to_arrow_schema, ArrowSchemaConverter, AsyncArrowWriter}; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::file::properties::WriterProperties; use parquet::file::writer::SerializedFileWriter; @@ -934,7 +932,7 @@ fn spawn_column_parallel_row_group_writer( max_buffer_size: usize, pool: &Arc, ) -> Result<(Vec, Vec)> { - let schema_desc = arrow_to_parquet_schema(&schema)?; + let schema_desc = ArrowSchemaConverter::new().convert(&schema)?; let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; let num_columns = col_writers.len(); @@ -1137,7 +1135,7 @@ async fn concatenate_parallel_row_groups( let mut file_reservation = MemoryConsumer::new("ParquetSink(SerializedFileWriter)").register(&pool); - let schema_desc = arrow_to_parquet_schema(schema.as_ref())?; + let schema_desc = ArrowSchemaConverter::new().convert(&schema)?; let mut parquet_writer = SerializedFileWriter::new( merged_buff.clone(), schema_desc.root_schema_ptr(), diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs index 516310dc81ae6..7bcba89f7c68b 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs @@ -431,8 +431,8 @@ mod tests { use datafusion_expr::{cast, col, lit, Expr}; use datafusion_physical_expr::planner::logical2physical; - use parquet::arrow::arrow_to_parquet_schema; use parquet::arrow::async_reader::ParquetObjectReader; + use parquet::arrow::ArrowSchemaConverter; use parquet::basic::LogicalType; use parquet::data_type::{ByteArray, FixedLenByteArray}; use parquet::file::metadata::ColumnChunkMetaData; @@ -719,7 +719,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Boolean, false), ])); - let schema_descr = arrow_to_parquet_schema(&schema).unwrap(); + let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap(); let expr = col("c1").gt(lit(15)).and(col("c2").is_null()); let expr = logical2physical(&expr, &schema); let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); @@ -748,7 +748,7 @@ mod tests { Field::new("c1", DataType::Int32, false), Field::new("c2", DataType::Boolean, false), ])); - let schema_descr = arrow_to_parquet_schema(&schema).unwrap(); + let schema_descr = ArrowSchemaConverter::new().convert(&schema).unwrap(); let expr = col("c1") .gt(lit(15)) .and(col("c2").eq(lit(ScalarValue::Boolean(None)))); diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index feca5eb1db381..40b145a5be1ff 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -2436,25 +2436,21 @@ mod tests { "labels".to_string(), DataType::Struct( vec![ - Field::new_dict( + Field::new( "a".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, ), - Field::new_dict( + Field::new( "b".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, ), ] .into(), @@ -2466,15 +2462,13 @@ mod tests { vec![ Arc::new(StructArray::from(vec![ ( - Arc::new(Field::new_dict( + Arc::new(Field::new( "a".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, )), Arc::new( vec![Some("a"), None, Some("a")] @@ -2483,15 +2477,13 @@ mod tests { ) as ArrayRef, ), ( - Arc::new(Field::new_dict( + Arc::new(Field::new( "b".to_string(), DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), ), true, - 0, - false, )), Arc::new( vec![Some("b"), Some("c"), Some("b")] diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 2da8b6066742a..d3c71cf628d32 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -472,10 +472,6 @@ message ParquetColumnOptions { oneof bloom_filter_ndv_opt { uint64 bloom_filter_ndv = 7; } - - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } } message ParquetOptions { @@ -513,10 +509,6 @@ message ParquetOptions { string statistics_enabled = 13; } - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } - oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; } diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 14375c0590a4f..658f8c90ccd77 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -321,6 +321,8 @@ impl TryFrom<&protobuf::Field> for Field { fn try_from(field: &protobuf::Field) -> Result { let datatype = field.arrow_type.as_deref().required("arrow_type")?; let field = if field.dict_id != 0 { + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] Self::new_dict( field.name.as_str(), datatype, @@ -365,6 +367,8 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { .as_ref() .ok_or_else(|| Error::required("value"))?; + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] Ok(match value { Value::BoolValue(v) => Self::Boolean(Some(*v)), Value::Utf8Value(v) => Self::Utf8(Some(v.to_owned())), @@ -920,12 +924,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -980,12 +978,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value .encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 6a75b14d35a85..6a75056ba4f76 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -4448,9 +4448,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4503,13 +4500,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4532,8 +4522,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4545,7 +4533,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4574,7 +4561,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4601,7 +4587,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::BloomFilterEnabled => { @@ -4646,12 +4631,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4662,7 +4641,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -4946,9 +4924,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5063,15 +5038,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5158,8 +5124,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "encoding", @@ -5194,7 +5158,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, Encoding, BloomFilterFpp, @@ -5243,7 +5206,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "encoding" => Ok(GeneratedField::Encoding), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), @@ -5290,7 +5252,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut encoding_opt__ = None; let mut bloom_filter_fpp_opt__ = None; @@ -5449,12 +5410,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5505,7 +5460,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, encoding_opt: encoding_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 50a3cff5f5685..d3f2bb5e250da 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -638,27 +638,29 @@ pub struct ParquetColumnSpecificOptions { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetColumnOptions { #[prost(oneof = "parquet_column_options::BloomFilterEnabledOpt", tags = "1")] - pub bloom_filter_enabled_opt: - ::core::option::Option, + pub bloom_filter_enabled_opt: ::core::option::Option< + parquet_column_options::BloomFilterEnabledOpt, + >, #[prost(oneof = "parquet_column_options::EncodingOpt", tags = "2")] pub encoding_opt: ::core::option::Option, #[prost(oneof = "parquet_column_options::DictionaryEnabledOpt", tags = "3")] - pub dictionary_enabled_opt: - ::core::option::Option, + pub dictionary_enabled_opt: ::core::option::Option< + parquet_column_options::DictionaryEnabledOpt, + >, #[prost(oneof = "parquet_column_options::CompressionOpt", tags = "4")] pub compression_opt: ::core::option::Option, #[prost(oneof = "parquet_column_options::StatisticsEnabledOpt", tags = "5")] - pub statistics_enabled_opt: - ::core::option::Option, + pub statistics_enabled_opt: ::core::option::Option< + parquet_column_options::StatisticsEnabledOpt, + >, #[prost(oneof = "parquet_column_options::BloomFilterFppOpt", tags = "6")] - pub bloom_filter_fpp_opt: - ::core::option::Option, + pub bloom_filter_fpp_opt: ::core::option::Option< + parquet_column_options::BloomFilterFppOpt, + >, #[prost(oneof = "parquet_column_options::BloomFilterNdvOpt", tags = "7")] - pub bloom_filter_ndv_opt: - ::core::option::Option, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: - ::core::option::Option, + pub bloom_filter_ndv_opt: ::core::option::Option< + parquet_column_options::BloomFilterNdvOpt, + >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -697,11 +699,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -763,22 +760,23 @@ pub struct ParquetOptions { #[prost(string, tag = "16")] pub created_by: ::prost::alloc::string::String, #[prost(oneof = "parquet_options::MetadataSizeHintOpt", tags = "4")] - pub metadata_size_hint_opt: - ::core::option::Option, + pub metadata_size_hint_opt: ::core::option::Option< + parquet_options::MetadataSizeHintOpt, + >, #[prost(oneof = "parquet_options::CompressionOpt", tags = "10")] pub compression_opt: ::core::option::Option, #[prost(oneof = "parquet_options::DictionaryEnabledOpt", tags = "11")] - pub dictionary_enabled_opt: - ::core::option::Option, + pub dictionary_enabled_opt: ::core::option::Option< + parquet_options::DictionaryEnabledOpt, + >, #[prost(oneof = "parquet_options::StatisticsEnabledOpt", tags = "13")] - pub statistics_enabled_opt: - ::core::option::Option, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: - ::core::option::Option, + pub statistics_enabled_opt: ::core::option::Option< + parquet_options::StatisticsEnabledOpt, + >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] - pub column_index_truncate_length_opt: - ::core::option::Option, + pub column_index_truncate_length_opt: ::core::option::Option< + parquet_options::ColumnIndexTruncateLengthOpt, + >, #[prost(oneof = "parquet_options::EncodingOpt", tags = "19")] pub encoding_opt: ::core::option::Option, #[prost(oneof = "parquet_options::BloomFilterFppOpt", tags = "21")] @@ -809,11 +807,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), @@ -861,9 +854,7 @@ pub struct ColumnStats { #[prost(message, optional, tag = "4")] pub distinct_count: ::core::option::Option, } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinType { Inner = 0, @@ -910,9 +901,7 @@ impl JoinType { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinConstraint { On = 0, @@ -938,9 +927,7 @@ impl JoinConstraint { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum TimeUnit { Second = 0, @@ -972,9 +959,7 @@ impl TimeUnit { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum IntervalUnit { YearMonth = 0, @@ -1003,9 +988,7 @@ impl IntervalUnit { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum UnionMode { Sparse = 0, @@ -1031,9 +1014,7 @@ impl UnionMode { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum CompressionTypeVariant { Gzip = 0, @@ -1068,9 +1049,7 @@ impl CompressionTypeVariant { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinSide { LeftSide = 0, @@ -1099,9 +1078,7 @@ impl JoinSide { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum PrecisionInfo { Exact = 0, diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 1b9583516ced3..2cb8e0eb07f5d 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -97,6 +97,8 @@ impl TryFrom<&Field> for protobuf::Field { nullable: field.is_nullable(), children: Vec::new(), metadata: field.metadata().clone(), + // TODO file a ticket about handling deprecated dict_id attributes + #[allow(deprecated)] dict_id: field.dict_id().unwrap_or(0), dict_ordered: field.dict_is_ordered().unwrap_or(false), }) @@ -818,7 +820,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -855,11 +856,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index fa77d23a6ae69..d3f2bb5e250da 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -661,10 +661,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -703,11 +699,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -782,10 +773,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -820,11 +807,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 1e2b12dacc290..715c649e97c85 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -379,9 +379,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -430,9 +427,6 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), - max_statistics_size_opt: options.max_statistics_size.map(|size| { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) - }), }) } }).collect(), @@ -470,9 +464,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), - max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, - }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -523,11 +514,6 @@ impl From for ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), - max_statistics_size: proto.max_statistics_size_opt.map( - |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { - v as usize - }, - ), } } } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index f793e96f612b7..94478f3592af0 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1777,6 +1777,8 @@ fn round_trip_datatype() { } } +// TODO file a ticket about handling deprecated dict_id attributes +#[allow(deprecated)] #[test] fn roundtrip_dict_id() -> Result<()> { let dict_id = 42;