From 578fda714b819c05b10914537b934b1998befedf Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Fri, 26 Feb 2021 20:53:03 +0200 Subject: [PATCH 01/10] move schema to v1::schema --- rust/datafusion/src/physical_plan/parquet.rs | 4 +- rust/parquet/src/arrow/array_reader.rs | 8 +-- rust/parquet/src/arrow/arrow_reader.rs | 4 +- rust/parquet/src/arrow/record_reader.rs | 6 +- rust/parquet/src/arrow/schema.rs | 4 +- rust/parquet/src/column/mod.rs | 2 +- rust/parquet/src/column/page.rs | 2 +- rust/parquet/src/column/reader.rs | 4 +- rust/parquet/src/column/writer.rs | 4 +- rust/parquet/src/encodings/decoding.rs | 4 +- rust/parquet/src/encodings/encoding.rs | 4 +- rust/parquet/src/file/footer.rs | 4 +- rust/parquet/src/file/metadata.rs | 2 +- rust/parquet/src/file/mod.rs | 2 +- rust/parquet/src/file/properties.rs | 4 +- rust/parquet/src/file/reader.rs | 2 +- rust/parquet/src/file/serialized_reader.rs | 4 +- rust/parquet/src/file/writer.rs | 2 +- rust/parquet/src/record/api.rs | 4 +- rust/parquet/src/record/reader.rs | 4 +- rust/parquet/src/record/triplet.rs | 4 +- rust/parquet/src/schema/mod.rs | 52 +------------- rust/parquet/src/schema/v1/mod.rs | 67 +++++++++++++++++++ rust/parquet/src/schema/{ => v1}/parser.rs | 8 +-- rust/parquet/src/schema/{ => v1}/printer.rs | 8 +-- rust/parquet/src/schema/{ => v1}/types.rs | 16 ++--- rust/parquet/src/schema/{ => v1}/visitor.rs | 6 +- rust/parquet/src/schema/v2/mod.rs | 0 .../parquet/src/util/test_common/page_util.rs | 2 +- rust/parquet/tests/custom_writer.rs | 2 +- rust/parquet_derive_test/src/lib.rs | 2 +- 31 files changed, 130 insertions(+), 111 deletions(-) create mode 100644 rust/parquet/src/schema/v1/mod.rs rename rust/parquet/src/schema/{ => v1}/parser.rs (99%) rename rust/parquet/src/schema/{ => v1}/printer.rs (98%) rename rust/parquet/src/schema/{ => v1}/types.rs (99%) rename rust/parquet/src/schema/{ => v1}/visitor.rs (98%) create mode 100644 rust/parquet/src/schema/v2/mod.rs diff --git 
a/rust/datafusion/src/physical_plan/parquet.rs b/rust/datafusion/src/physical_plan/parquet.rs index 348a924040a..3d64f1aaac2 100644 --- a/rust/datafusion/src/physical_plan/parquet.rs +++ b/rust/datafusion/src/physical_plan/parquet.rs @@ -900,7 +900,7 @@ mod tests { use arrow::array::{Int32Array, StringArray}; use futures::StreamExt; use parquet::basic::Type as PhysicalType; - use parquet::schema::types::SchemaDescPtr; + use parquet::schema::v1::types::SchemaDescPtr; #[test] fn test_split_files() { @@ -1429,7 +1429,7 @@ mod tests { } fn get_test_schema_descr(fields: Vec<(&str, PhysicalType)>) -> SchemaDescPtr { - use parquet::schema::types::{SchemaDescriptor, Type as SchemaType}; + use parquet::schema::v1::types::{SchemaDescriptor, Type as SchemaType}; let mut schema_fields = fields .iter() .map(|(n, t)| { diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index dcdfbcbe7b0..804eff33fea 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -73,10 +73,10 @@ use crate::data_type::{ }; use crate::errors::{ParquetError, ParquetError::ArrowError, Result}; use crate::file::reader::{FilePageIterator, FileReader}; -use crate::schema::types::{ +use crate::schema::v1::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, Type, TypePtr, }; -use crate::schema::visitor::TypeVisitor; +use crate::schema::v1::visitor::TypeVisitor; use std::any::Any; /// Array reader reads parquet data into arrow array. 
@@ -1712,8 +1712,8 @@ mod tests { use crate::data_type::{ByteArray, DataType, Int32Type, Int64Type}; use crate::errors::Result; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::schema::parser::parse_message_type; - use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; + use crate::schema::v1::parser::parse_message_type; + use crate::schema::v1::types::{ColumnDescPtr, SchemaDescriptor}; use crate::util::test_common::page_util::{ DataPageBuilder, DataPageBuilderImpl, InMemoryPageIterator, }; diff --git a/rust/parquet/src/arrow/arrow_reader.rs b/rust/parquet/src/arrow/arrow_reader.rs index 7bbe8de1d64..fbe4ed83e0f 100644 --- a/rust/parquet/src/arrow/arrow_reader.rs +++ b/rust/parquet/src/arrow/arrow_reader.rs @@ -247,8 +247,8 @@ mod tests { use crate::file::properties::WriterProperties; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::file::writer::{FileWriter, SerializedFileWriter}; - use crate::schema::parser::parse_message_type; - use crate::schema::types::TypePtr; + use crate::schema::v1::parser::parse_message_type; + use crate::schema::v1::types::TypePtr; use crate::util::test_common::{get_temp_filename, RandGen}; use arrow::array::*; use arrow::record_batch::RecordBatchReader; diff --git a/rust/parquet/src/arrow/record_reader.rs b/rust/parquet/src/arrow/record_reader.rs index d58d563621f..4bcaaefb1a2 100644 --- a/rust/parquet/src/arrow/record_reader.rs +++ b/rust/parquet/src/arrow/record_reader.rs @@ -21,7 +21,7 @@ use std::mem::{replace, size_of}; use crate::column::{page::PageReader, reader::ColumnReaderImpl}; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; use arrow::array::BooleanBufferBuilder; use arrow::bitmap::Bitmap; use arrow::buffer::{Buffer, MutableBuffer}; @@ -439,8 +439,8 @@ mod tests { use crate::column::page::PageReader; use crate::data_type::Int32Type; use crate::errors::Result; 
- use crate::schema::parser::parse_message_type; - use crate::schema::types::SchemaDescriptor; + use crate::schema::v1::parser::parse_message_type; + use crate::schema::v1::types::SchemaDescriptor; use crate::util::test_common::page_util::{DataPageBuilder, DataPageBuilderImpl}; use arrow::array::{BooleanBufferBuilder, Int16BufferBuilder, Int32BufferBuilder}; use arrow::bitmap::Bitmap; diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index fa973b5cc0e..72a5b29582b 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -31,7 +31,7 @@ use arrow::ipc::writer; use crate::errors::{ParquetError::ArrowError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; -use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; +use crate::schema::v1::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; use crate::{ basic::{LogicalType, Repetition, Type as PhysicalType}, errors::ParquetError, @@ -808,7 +808,7 @@ mod tests { use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; use crate::{ arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}, - schema::{parser::parse_message_type, types::SchemaDescriptor}, + schema::v1::{parser::parse_message_type, types::SchemaDescriptor}, util::test_common::get_temp_file, }; diff --git a/rust/parquet/src/column/mod.rs b/rust/parquet/src/column/mod.rs index 7ed7bfc256e..dc573f68c8d 100644 --- a/rust/parquet/src/column/mod.rs +++ b/rust/parquet/src/column/mod.rs @@ -45,7 +45,7 @@ //! reader::{FileReader, SerializedFileReader}, //! writer::{FileWriter, SerializedFileWriter}, //! }, -//! schema::parser::parse_message_type, +//! schema::v1::parser::parse_message_type, //! }; //! //! 
let path = Path::new("/path/to/column_sample.parquet"); diff --git a/rust/parquet/src/column/page.rs b/rust/parquet/src/column/page.rs index 0573616fa8d..76ac1708112 100644 --- a/rust/parquet/src/column/page.rs +++ b/rust/parquet/src/column/page.rs @@ -20,7 +20,7 @@ use crate::basic::{Encoding, PageType}; use crate::errors::Result; use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; /// Parquet Page definition. diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 91f199bae37..483ed6f9c3c 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -30,7 +30,7 @@ use crate::encodings::{ levels::LevelDecoder, }; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; /// Column reader for a Parquet type. 
@@ -509,7 +509,7 @@ mod tests { use crate::basic::Type as PhysicalType; use crate::column::page::Page; - use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; use crate::util::test_common::make_pages; const NUM_LEVELS: usize = 128; diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 533a8e69a51..18fc774c4ab 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -33,7 +33,7 @@ use crate::file::{ metadata::ColumnChunkMetaData, properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, }; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; use crate::util::bit_util::FromBytes; use crate::util::memory::{ByteBufferPtr, MemTracker}; @@ -999,7 +999,7 @@ mod tests { properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, }; - use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; use crate::util::{ io::{FileSink, FileSource}, test_common::{get_temp_file, random_numbers_range}, diff --git a/rust/parquet/src/encodings/decoding.rs b/rust/parquet/src/encodings/decoding.rs index ee7ad5ae95c..b512e55b073 100644 --- a/rust/parquet/src/encodings/decoding.rs +++ b/rust/parquet/src/encodings/decoding.rs @@ -25,7 +25,7 @@ use crate::basic::*; use crate::data_type::private::*; use crate::data_type::*; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; use crate::util::{ bit_util::{self, BitReader, FromBytes}, memory::{ByteBuffer, ByteBufferPtr}, @@ -806,7 +806,7 @@ mod tests { use std::sync::Arc; - use crate::schema::types::{ + use crate::schema::v1::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; use crate::util::{ 
diff --git a/rust/parquet/src/encodings/encoding.rs b/rust/parquet/src/encodings/encoding.rs index fdd616e9e27..88840d17d20 100644 --- a/rust/parquet/src/encodings/encoding.rs +++ b/rust/parquet/src/encodings/encoding.rs @@ -24,7 +24,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; use crate::util::{ bit_util::{self, log2, num_required_bits, BitWriter}, hash_util, @@ -923,7 +923,7 @@ mod tests { use std::sync::Arc; use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; - use crate::schema::types::{ + use crate::schema::v1::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; use crate::util::{ diff --git a/rust/parquet/src/file/footer.rs b/rust/parquet/src/file/footer.rs index e83d8b2e036..bffe5797c53 100644 --- a/rust/parquet/src/file/footer.rs +++ b/rust/parquet/src/file/footer.rs @@ -33,7 +33,7 @@ use crate::file::{ PARQUET_MAGIC, }; -use crate::schema::types::{self, SchemaDescriptor}; +use crate::schema::v1::types::{self, SchemaDescriptor}; /// Layout of Parquet file /// +---------------------------+-----+---+ @@ -158,7 +158,7 @@ mod tests { use crate::basic::SortOrder; use crate::basic::Type; - use crate::schema::types::Type as SchemaType; + use crate::schema::v1::types::Type as SchemaType; use crate::util::test_common::get_temp_file; use parquet_format::TypeDefinedOrder; diff --git a/rust/parquet/src/file/metadata.rs b/rust/parquet/src/file/metadata.rs index 150c42c578a..84e8f0cd485 100644 --- a/rust/parquet/src/file/metadata.rs +++ b/rust/parquet/src/file/metadata.rs @@ -40,7 +40,7 @@ use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::errors::{ParquetError, Result}; use crate::file::statistics::{self, Statistics}; -use 
crate::schema::types::{ +use crate::schema::v1::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; diff --git a/rust/parquet/src/file/mod.rs b/rust/parquet/src/file/mod.rs index f85de98ccab..9e418978c79 100644 --- a/rust/parquet/src/file/mod.rs +++ b/rust/parquet/src/file/mod.rs @@ -34,7 +34,7 @@ //! properties::WriterProperties, //! writer::{FileWriter, SerializedFileWriter}, //! }, -//! schema::parser::parse_message_type, +//! schema::v1::parser::parse_message_type, //! }; //! //! let path = Path::new("/path/to/sample.parquet"); diff --git a/rust/parquet/src/file/properties.rs b/rust/parquet/src/file/properties.rs index b0b25f9b952..4a59eab6c71 100644 --- a/rust/parquet/src/file/properties.rs +++ b/rust/parquet/src/file/properties.rs @@ -23,7 +23,7 @@ //! use parquet::{ //! basic::{Compression, Encoding}, //! file::properties::*, -//! schema::types::ColumnPath, +//! schema::v1::types::ColumnPath, //! }; //! //! // Create properties with default configuration. 
@@ -52,7 +52,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; use crate::file::metadata::KeyValue; -use crate::schema::types::ColumnPath; +use crate::schema::v1::types::ColumnPath; const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; diff --git a/rust/parquet/src/file/reader.rs b/rust/parquet/src/file/reader.rs index 7fb8ee211cd..1f8f14b2c47 100644 --- a/rust/parquet/src/file/reader.rs +++ b/rust/parquet/src/file/reader.rs @@ -26,7 +26,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::metadata::*; pub use crate::file::serialized_reader::{SerializedFileReader, SerializedPageReader}; use crate::record::reader::RowIter; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType}; +use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType}; use crate::basic::Type; diff --git a/rust/parquet/src/file/serialized_reader.rs b/rust/parquet/src/file/serialized_reader.rs index b0d1d0c7b31..1642e3c5611 100644 --- a/rust/parquet/src/file/serialized_reader.rs +++ b/rust/parquet/src/file/serialized_reader.rs @@ -30,7 +30,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::{footer, metadata::*, reader::*, statistics}; use crate::record::reader::RowIter; use crate::record::Row; -use crate::schema::types::Type as SchemaType; +use crate::schema::v1::types::Type as SchemaType; use crate::util::{io::TryClone, memory::ByteBufferPtr}; // export `SliceableCursor` and `FileSource` publically so clients can @@ -394,7 +394,7 @@ mod tests { use super::*; use crate::basic::ColumnOrder; use crate::record::RowAccessor; - use crate::schema::parser::parse_message_type; + use crate::schema::v1::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; use std::sync::Arc; diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index 23e39fd855e..47cafc3c6a0 100644 --- 
a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -37,7 +37,7 @@ use crate::file::{ metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, FOOTER_SIZE, PARQUET_MAGIC, }; -use crate::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::schema::v1::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::util::io::{FileSink, Position}; // Exposed publically so client code can implement [`ParquetWriter`] diff --git a/rust/parquet/src/record/api.rs b/rust/parquet/src/record/api.rs index 9e131b4415c..a9a96dd850a 100644 --- a/rust/parquet/src/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -25,7 +25,7 @@ use num_bigint::{BigInt, Sign}; use crate::basic::{LogicalType, Type as PhysicalType}; use crate::data_type::{ByteArray, Decimal, Int96}; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; #[cfg(feature = "cli")] use serde_json::Value; @@ -833,7 +833,7 @@ mod tests { use std::sync::Arc; - use crate::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; + use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; /// Creates test column descriptor based on provided type parameters. macro_rules! 
make_column_descr { diff --git a/rust/parquet/src/record/reader.rs b/rust/parquet/src/record/reader.rs index 5f42d37bac0..eefcae20786 100644 --- a/rust/parquet/src/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -27,7 +27,7 @@ use crate::record::{ api::{make_list, make_map, make_row, Field, Row}, triplet::TripletIter, }; -use crate::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; +use crate::schema::v1::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; /// Default batch size for a reader const DEFAULT_BATCH_SIZE: usize = 1024; @@ -821,7 +821,7 @@ mod tests { use crate::errors::{ParquetError, Result}; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::record::api::{Field, Row, RowAccessor, RowFormatter}; - use crate::schema::parser::parse_message_type; + use crate::schema::v1::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; use std::convert::TryFrom; diff --git a/rust/parquet/src/record/triplet.rs b/rust/parquet/src/record/triplet.rs index bb4f942fd18..de623970704 100644 --- a/rust/parquet/src/record/triplet.rs +++ b/rust/parquet/src/record/triplet.rs @@ -20,7 +20,7 @@ use crate::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderI use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::record::api::Field; -use crate::schema::types::ColumnDescPtr; +use crate::schema::v1::types::ColumnDescPtr; /// Macro to generate simple functions that cover all types of triplet iterator. 
/// $func is a function of a typed triplet iterator and $token is a either {`ref`} or @@ -359,7 +359,7 @@ mod tests { use super::*; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::schema::types::ColumnPath; + use crate::schema::v1::types::ColumnPath; use crate::util::test_common::get_test_file; #[test] diff --git a/rust/parquet/src/schema/mod.rs b/rust/parquet/src/schema/mod.rs index 47ba3ca8b78..e48acb3c26a 100644 --- a/rust/parquet/src/schema/mod.rs +++ b/rust/parquet/src/schema/mod.rs @@ -15,53 +15,5 @@ // specific language governing permissions and limitations // under the License. -//! Parquet schema definitions and methods to print and parse schema. -//! -//! # Example -//! -//! ```rust -//! use parquet::{ -//! basic::{LogicalType, Repetition, Type as PhysicalType}, -//! schema::{parser, printer, types::Type}, -//! }; -//! use std::sync::Arc; -//! -//! // Create the following schema: -//! // -//! // message schema { -//! // OPTIONAL BYTE_ARRAY a (UTF8); -//! // REQUIRED INT32 b; -//! // } -//! -//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) -//! .with_logical_type(LogicalType::UTF8) -//! .with_repetition(Repetition::OPTIONAL) -//! .build() -//! .unwrap(); -//! -//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) -//! .with_repetition(Repetition::REQUIRED) -//! .build() -//! .unwrap(); -//! -//! let schema = Type::group_type_builder("schema") -//! .with_fields(&mut vec![Arc::new(field_a), Arc::new(field_b)]) -//! .build() -//! .unwrap(); -//! -//! let mut buf = Vec::new(); -//! -//! // Print schema into buffer -//! printer::print_schema(&mut buf, &schema); -//! -//! // Parse schema from the string -//! let string_schema = String::from_utf8(buf).unwrap(); -//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); -//! -//! assert_eq!(schema, parsed_schema); -//! 
``` - -pub mod parser; -pub mod printer; -pub mod types; -pub mod visitor; +pub mod v1; +pub mod v2; \ No newline at end of file diff --git a/rust/parquet/src/schema/v1/mod.rs b/rust/parquet/src/schema/v1/mod.rs new file mode 100644 index 00000000000..ce153ec10c3 --- /dev/null +++ b/rust/parquet/src/schema/v1/mod.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet schema definitions and methods to print and parse schema. +//! +//! # Example +//! +//! ```rust +//! use parquet::{ +//! basic::{LogicalType, Repetition, Type as PhysicalType}, +//! schema::v1::{parser, printer, types::Type}, +//! }; +//! use std::sync::Arc; +//! +//! // Create the following schema: +//! // +//! // message schema { +//! // OPTIONAL BYTE_ARRAY a (UTF8); +//! // REQUIRED INT32 b; +//! // } +//! +//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) +//! .with_logical_type(LogicalType::UTF8) +//! .with_repetition(Repetition::OPTIONAL) +//! .build() +//! .unwrap(); +//! +//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) +//! .with_repetition(Repetition::REQUIRED) +//! .build() +//! .unwrap(); +//! +//! let schema = Type::group_type_builder("schema") +//! 
.with_fields(&mut vec![Arc::new(field_a), Arc::new(field_b)]) +//! .build() +//! .unwrap(); +//! +//! let mut buf = Vec::new(); +//! +//! // Print schema into buffer +//! printer::print_schema(&mut buf, &schema); +//! +//! // Parse schema from the string +//! let string_schema = String::from_utf8(buf).unwrap(); +//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); +//! +//! assert_eq!(schema, parsed_schema); +//! ``` + +pub mod parser; +pub mod printer; +pub mod types; +pub mod visitor; \ No newline at end of file diff --git a/rust/parquet/src/schema/parser.rs b/rust/parquet/src/schema/v1/parser.rs similarity index 99% rename from rust/parquet/src/schema/parser.rs rename to rust/parquet/src/schema/v1/parser.rs index 9f14a550241..a5f9a468417 100644 --- a/rust/parquet/src/schema/parser.rs +++ b/rust/parquet/src/schema/v1/parser.rs @@ -17,12 +17,12 @@ //! Parquet schema parser. //! Provides methods to parse and validate string message type into Parquet -//! [`Type`](crate::schema::types::Type). +//! [`Type`](crate::schema::v1::types::Type). //! //! # Example //! //! ```rust -//! use parquet::schema::parser::parse_message_type; +//! use parquet::schema::v1::parser::parse_message_type; //! //! let message_type = " //! message spark_schema { @@ -46,9 +46,9 @@ use std::sync::Arc; use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; -use crate::schema::types::{Type, TypePtr}; +use crate::schema::v1::types::{Type, TypePtr}; -/// Parses message type as string into a Parquet [`Type`](crate::schema::types::Type) +/// Parses message type as string into a Parquet [`Type`](crate::schema::v1::types::Type) /// which, for example, could be used to extract individual columns. Returns Parquet /// general error when parsing or validation fails. 
pub fn parse_message_type(message_type: &str) -> Result { diff --git a/rust/parquet/src/schema/printer.rs b/rust/parquet/src/schema/v1/printer.rs similarity index 98% rename from rust/parquet/src/schema/printer.rs rename to rust/parquet/src/schema/v1/printer.rs index 235dd564016..56165478c81 100644 --- a/rust/parquet/src/schema/printer.rs +++ b/rust/parquet/src/schema/v1/printer.rs @@ -23,7 +23,7 @@ //! ```rust //! use parquet::{ //! file::reader::{FileReader, SerializedFileReader}, -//! schema::printer::{print_file_metadata, print_parquet_metadata, print_schema}, +//! schema::v1::printer::{print_file_metadata, print_parquet_metadata, print_schema}, //! }; //! use std::{fs::File, path::Path}; //! @@ -49,7 +49,7 @@ use crate::basic::{LogicalType, Type as PhysicalType}; use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, }; -use crate::schema::types::Type; +use crate::schema::v1::types::Type; /// Prints Parquet metadata [`ParquetMetaData`](crate::file::metadata::ParquetMetaData) /// information. @@ -92,7 +92,7 @@ pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { print_schema(out, schema); } -/// Prints Parquet [`Type`](crate::schema::types::Type) information. +/// Prints Parquet [`Type`](crate::schema::v1::types::Type) information. #[allow(unused_must_use)] pub fn print_schema(out: &mut io::Write, tp: &Type) { // TODO: better if we can pass fmt::Write to Printer. 
@@ -274,7 +274,7 @@ mod tests { use std::sync::Arc; use crate::basic::{Repetition, Type as PhysicalType}; - use crate::schema::{parser::parse_message_type, types::Type}; + use crate::schema::v1::{parser::parse_message_type, types::Type}; fn assert_print_parse_message(message: Type) { let mut s = String::new(); diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/v1/types.rs similarity index 99% rename from rust/parquet/src/schema/types.rs rename to rust/parquet/src/schema/v1/types.rs index 5c35e1cde2c..00389b7cee2 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/v1/types.rs @@ -538,7 +538,7 @@ impl ColumnPath { /// Returns string representation of this column path. /// ```rust - /// use parquet::schema::types::ColumnPath; + /// use parquet::schema::v1::types::ColumnPath; /// /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]); /// assert_eq!(&path.string(), "a.b.c"); @@ -549,7 +549,7 @@ impl ColumnPath { /// Appends more components to end of column path. /// ```rust - /// use parquet::schema::types::ColumnPath; + /// use parquet::schema::v1::types::ColumnPath; /// /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c" /// .to_string()]); @@ -649,12 +649,12 @@ impl ColumnDescriptor { &self.path } - /// Returns self type [`Type`](crate::schema::types::Type) for this leaf column. + /// Returns self type [`Type`](crate::schema::v1::types::Type) for this leaf column. pub fn self_type(&self) -> &Type { self.primitive_type.as_ref() } - /// Returns self type [`TypePtr`](crate::schema::types::TypePtr) for this leaf + /// Returns self type [`TypePtr`](crate::schema::v1::types::TypePtr) for this leaf /// column. pub fn self_type_ptr(&self) -> TypePtr { self.primitive_type.clone() @@ -775,13 +775,13 @@ impl SchemaDescriptor { self.leaves.len() } - /// Returns column root [`Type`](crate::schema::types::Type) for a field position. 
+ /// Returns column root [`Type`](crate::schema::v1::types::Type) for a field position. pub fn get_column_root(&self, i: usize) -> &Type { let result = self.column_root_of(i); result.as_ref() } - /// Returns column root [`Type`](crate::schema::types::Type) pointer for a field + /// Returns column root [`Type`](crate::schema::v1::types::Type) pointer for a field /// position. pub fn get_column_root_ptr(&self, i: usize) -> TypePtr { let result = self.column_root_of(i); @@ -801,7 +801,7 @@ impl SchemaDescriptor { .unwrap_or_else(|| panic!("Expected a value for index {} but found None", i)) } - /// Returns schema as [`Type`](crate::schema::types::Type). + /// Returns schema as [`Type`](crate::schema::v1::types::Type). pub fn root_schema(&self) -> &Type { self.schema.as_ref() } @@ -1060,7 +1060,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { mod tests { use super::*; - use crate::schema::parser::parse_message_type; + use crate::schema::v1::parser::parse_message_type; #[test] fn test_primitive_type() { diff --git a/rust/parquet/src/schema/visitor.rs b/rust/parquet/src/schema/v1/visitor.rs similarity index 98% rename from rust/parquet/src/schema/visitor.rs rename to rust/parquet/src/schema/v1/visitor.rs index 04d77599c91..85c7843058f 100644 --- a/rust/parquet/src/schema/visitor.rs +++ b/rust/parquet/src/schema/v1/visitor.rs @@ -18,7 +18,7 @@ use crate::basic::{LogicalType, Repetition}; use crate::errors::ParquetError::General; use crate::errors::Result; -use crate::schema::types::{Type, TypePtr}; +use crate::schema::v1::types::{Type, TypePtr}; /// A utility trait to help user to traverse against parquet type. 
pub trait TypeVisitor { @@ -124,8 +124,8 @@ mod tests { use super::TypeVisitor; use crate::basic::Type as PhysicalType; use crate::errors::Result; - use crate::schema::parser::parse_message_type; - use crate::schema::types::TypePtr; + use crate::schema::v1::parser::parse_message_type; + use crate::schema::v1::types::TypePtr; use std::sync::Arc; struct TestVisitorContext {} diff --git a/rust/parquet/src/schema/v2/mod.rs b/rust/parquet/src/schema/v2/mod.rs new file mode 100644 index 00000000000..e69de29bb2d diff --git a/rust/parquet/src/util/test_common/page_util.rs b/rust/parquet/src/util/test_common/page_util.rs index e360f3da52a..8488bbe53d3 100644 --- a/rust/parquet/src/util/test_common/page_util.rs +++ b/rust/parquet/src/util/test_common/page_util.rs @@ -23,7 +23,7 @@ use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::encodings::levels::max_buffer_size; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; -use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; use crate::util::memory::MemTracker; use crate::util::memory::MemTrackerPtr; diff --git a/rust/parquet/tests/custom_writer.rs b/rust/parquet/tests/custom_writer.rs index 0a57e79d955..f153857ee24 100644 --- a/rust/parquet/tests/custom_writer.rs +++ b/rust/parquet/tests/custom_writer.rs @@ -25,7 +25,7 @@ use std::{ use parquet::file::writer::TryClone; use parquet::{ basic::Repetition, basic::Type, file::properties::WriterProperties, - file::writer::SerializedFileWriter, schema::types, + file::writer::SerializedFileWriter, schema::v1::types, }; use std::env; diff --git a/rust/parquet_derive_test/src/lib.rs b/rust/parquet_derive_test/src/lib.rs index b4bfc42cab2..505c94843f4 100644 --- a/rust/parquet_derive_test/src/lib.rs +++ b/rust/parquet_derive_test/src/lib.rs @@ -50,7 +50,7 @@ mod tests { properties::WriterProperties, writer::{FileWriter, 
SerializedFileWriter}, }, - schema::parser::parse_message_type, + schema::v1::parser::parse_message_type, }; use std::{env, fs, io::Write, sync::Arc}; From 8255004f144eb01fe388d9caea9f23501c6f1658 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Fri, 26 Feb 2021 22:57:50 +0200 Subject: [PATCH 02/10] rename LogicalType to ConvertedType V2 of the format has a LogicalType that is different to what we were using as a LogicalType. By renaming our one, this allows us to implement the v2 one. --- rust/parquet/src/arrow/array_reader.rs | 13 +- rust/parquet/src/arrow/schema.rs | 96 ++-- rust/parquet/src/basic.rs | 593 +++++++++++++------------ rust/parquet/src/column/reader.rs | 4 +- rust/parquet/src/file/footer.rs | 2 +- rust/parquet/src/record/api.rs | 116 ++--- rust/parquet/src/record/reader.rs | 14 +- rust/parquet/src/schema/mod.rs | 2 +- rust/parquet/src/schema/v1/mod.rs | 6 +- rust/parquet/src/schema/v1/parser.rs | 39 +- rust/parquet/src/schema/v1/printer.rs | 42 +- rust/parquet/src/schema/v1/types.rs | 158 +++---- rust/parquet/src/schema/v1/visitor.rs | 8 +- 13 files changed, 561 insertions(+), 532 deletions(-) diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index 804eff33fea..8219f7567d1 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -64,7 +64,7 @@ use crate::arrow::converter::{ }; use crate::arrow::record_reader::RecordReader; use crate::arrow::schema::parquet_to_arrow_field; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::column::page::PageIterator; use crate::column::reader::ColumnReaderImpl; use crate::data_type::{ @@ -1463,7 +1463,7 @@ impl<'a> ArrayReaderBuilder { )?)) } PhysicalType::BYTE_ARRAY => { - if cur_type.get_basic_info().logical_type() == LogicalType::UTF8 { + if cur_type.get_basic_info().converted_type() == ConvertedType::UTF8 { if let 
Some(ArrowType::LargeUtf8) = arrow_type { let converter = LargeUtf8Converter::new(LargeUtf8ArrayConverter {}); @@ -1514,7 +1514,8 @@ impl<'a> ArrayReaderBuilder { } } PhysicalType::FIXED_LEN_BYTE_ARRAY - if cur_type.get_basic_info().logical_type() == LogicalType::DECIMAL => + if cur_type.get_basic_info().converted_type() + == ConvertedType::DECIMAL => { let converter = DecimalConverter::new(DecimalArrayConverter::new( cur_type.get_precision(), @@ -1531,7 +1532,7 @@ impl<'a> ArrayReaderBuilder { )?)) } PhysicalType::FIXED_LEN_BYTE_ARRAY => { - if cur_type.get_basic_info().logical_type() == LogicalType::INTERVAL { + if cur_type.get_basic_info().converted_type() == ConvertedType::INTERVAL { let byte_width = match *cur_type { Type::PrimitiveType { ref type_length, .. @@ -1888,14 +1889,14 @@ mod tests { } macro_rules! test_primitive_array_reader_one_type { - ($arrow_parquet_type:ty, $physical_type:expr, $logical_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{ + ($arrow_parquet_type:ty, $physical_type:expr, $converted_type_str:expr, $result_arrow_type:ty, $result_arrow_cast_type:ty, $result_primitive_type:ty) => {{ let message_type = format!( " message test_schema {{ REQUIRED {:?} leaf ({}); }} ", - $physical_type, $logical_type_str + $physical_type, $converted_type_str ); let schema = parse_message_type(&message_type) .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 72a5b29582b..91d8c3fd330 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -33,7 +33,7 @@ use crate::errors::{ParquetError::ArrowError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; use crate::schema::v1::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; use crate::{ - basic::{LogicalType, Repetition, Type as PhysicalType}, + basic::{ConvertedType, Repetition, Type as PhysicalType}, 
errors::ParquetError, }; @@ -321,18 +321,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result { // create type from field match field.data_type() { DataType::Null => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::NONE) + .with_converted_type(ConvertedType::NONE) .with_repetition(repetition) .build(), DataType::Boolean => Type::primitive_type_builder(name, PhysicalType::BOOLEAN) .with_repetition(repetition) .build(), DataType::Int8 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::INT_8) + .with_converted_type(ConvertedType::INT_8) .with_repetition(repetition) .build(), DataType::Int16 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::INT_16) + .with_converted_type(ConvertedType::INT_16) .with_repetition(repetition) .build(), DataType::Int32 => Type::primitive_type_builder(name, PhysicalType::INT32) @@ -342,19 +342,19 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .build(), DataType::UInt8 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .with_repetition(repetition) .build(), DataType::UInt16 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_16) + .with_converted_type(ConvertedType::UINT_16) .with_repetition(repetition) .build(), DataType::UInt32 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_32) + .with_converted_type(ConvertedType::UINT_32) .with_repetition(repetition) .build(), DataType::UInt64 => Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(LogicalType::UINT_64) + .with_converted_type(ConvertedType::UINT_64) .with_repetition(repetition) .build(), DataType::Float16 => Err(ArrowError("Float16 arrays not supported".to_string())), @@ -366,30 +366,30 @@ fn 
arrow_to_parquet_type(field: &Field) -> Result { .build(), DataType::Timestamp(time_unit, _) => { Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(match time_unit { - TimeUnit::Second => LogicalType::TIMESTAMP_MILLIS, - TimeUnit::Millisecond => LogicalType::TIMESTAMP_MILLIS, - TimeUnit::Microsecond => LogicalType::TIMESTAMP_MICROS, - TimeUnit::Nanosecond => LogicalType::TIMESTAMP_MICROS, + .with_converted_type(match time_unit { + TimeUnit::Second => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::Millisecond => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::Microsecond => ConvertedType::TIMESTAMP_MICROS, + TimeUnit::Nanosecond => ConvertedType::TIMESTAMP_MICROS, }) .with_repetition(repetition) .build() } DataType::Date32 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .with_repetition(repetition) .build(), // date64 is cast to date32 DataType::Date64 => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .with_repetition(repetition) .build(), DataType::Time32(_) => Type::primitive_type_builder(name, PhysicalType::INT32) - .with_logical_type(LogicalType::TIME_MILLIS) + .with_converted_type(ConvertedType::TIME_MILLIS) .with_repetition(repetition) .build(), DataType::Time64(_) => Type::primitive_type_builder(name, PhysicalType::INT64) - .with_logical_type(LogicalType::TIME_MICROS) + .with_converted_type(ConvertedType::TIME_MICROS) .with_repetition(repetition) .build(), DataType::Duration(_) => Err(ArrowError( @@ -397,7 +397,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { )), DataType::Interval(_) => { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_repetition(repetition) .with_length(12) .build() @@ -417,14 +417,14 @@ fn 
arrow_to_parquet_type(field: &Field) -> Result { Type::primitive_type_builder(name, PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(repetition) .with_length(decimal_length_from_precision(*precision) as i32) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(*precision as i32) .with_scale(*scale as i32) .build() } DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_repetition(repetition) .build() } @@ -436,7 +436,7 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(Repetition::REPEATED) .build()?, )]) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_repetition(repetition) .build() } @@ -583,17 +583,17 @@ impl ParquetTypeConverter<'_> { } fn from_int32(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Int32), - LogicalType::UINT_8 => Ok(DataType::UInt8), - LogicalType::UINT_16 => Ok(DataType::UInt16), - LogicalType::UINT_32 => Ok(DataType::UInt32), - LogicalType::INT_8 => Ok(DataType::Int8), - LogicalType::INT_16 => Ok(DataType::Int16), - LogicalType::INT_32 => Ok(DataType::Int32), - LogicalType::DATE => Ok(DataType::Date32), - LogicalType::TIME_MILLIS => Ok(DataType::Time32(TimeUnit::Millisecond)), - LogicalType::DECIMAL => Ok(self.to_decimal()), + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Int32), + ConvertedType::UINT_8 => Ok(DataType::UInt8), + ConvertedType::UINT_16 => Ok(DataType::UInt16), + ConvertedType::UINT_32 => Ok(DataType::UInt32), + ConvertedType::INT_8 => Ok(DataType::Int8), + ConvertedType::INT_16 => Ok(DataType::Int16), + ConvertedType::INT_32 => Ok(DataType::Int32), + ConvertedType::DATE => Ok(DataType::Date32), + ConvertedType::TIME_MILLIS => 
Ok(DataType::Time32(TimeUnit::Millisecond)), + ConvertedType::DECIMAL => Ok(self.to_decimal()), other => Err(ArrowError(format!( "Unable to convert parquet INT32 logical type {}", other @@ -602,18 +602,18 @@ impl ParquetTypeConverter<'_> { } fn from_int64(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Int64), - LogicalType::INT_64 => Ok(DataType::Int64), - LogicalType::UINT_64 => Ok(DataType::UInt64), - LogicalType::TIME_MICROS => Ok(DataType::Time64(TimeUnit::Microsecond)), - LogicalType::TIMESTAMP_MILLIS => { + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Int64), + ConvertedType::INT_64 => Ok(DataType::Int64), + ConvertedType::UINT_64 => Ok(DataType::UInt64), + ConvertedType::TIME_MICROS => Ok(DataType::Time64(TimeUnit::Microsecond)), + ConvertedType::TIMESTAMP_MILLIS => { Ok(DataType::Timestamp(TimeUnit::Millisecond, None)) } - LogicalType::TIMESTAMP_MICROS => { + ConvertedType::TIMESTAMP_MICROS => { Ok(DataType::Timestamp(TimeUnit::Microsecond, None)) } - LogicalType::DECIMAL => Ok(self.to_decimal()), + ConvertedType::DECIMAL => Ok(self.to_decimal()), other => Err(ArrowError(format!( "Unable to convert parquet INT64 logical type {}", other @@ -622,9 +622,9 @@ impl ParquetTypeConverter<'_> { } fn from_fixed_len_byte_array(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::DECIMAL => Ok(self.to_decimal()), - LogicalType::INTERVAL => { + match self.schema.get_basic_info().converted_type() { + ConvertedType::DECIMAL => Ok(self.to_decimal()), + ConvertedType::INTERVAL => { // There is currently no reliable way of determining which IntervalUnit // to return. 
Thus without the original Arrow schema, the results // would be incorrect if all 12 bytes of the interval are populated @@ -656,9 +656,9 @@ impl ParquetTypeConverter<'_> { } fn from_byte_array(&self) -> Result { - match self.schema.get_basic_info().logical_type() { - LogicalType::NONE => Ok(DataType::Binary), - LogicalType::UTF8 => Ok(DataType::Utf8), + match self.schema.get_basic_info().converted_type() { + ConvertedType::NONE => Ok(DataType::Binary), + ConvertedType::UTF8 => Ok(DataType::Utf8), other => Err(ArrowError(format!( "Unable to convert parquet BYTE_ARRAY logical type {}", other @@ -683,8 +683,8 @@ impl ParquetTypeConverter<'_> { }) }) } else { - match self.schema.get_basic_info().logical_type() { - LogicalType::LIST => self.to_list(), + match self.schema.get_basic_info().converted_type() { + ConvertedType::LIST => self.to_list(), _ => self.to_struct(), } } diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index bf41d43da90..26d4225ab0c 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -50,11 +50,11 @@ pub enum Type { // ---------------------------------------------------------------------- // Mirrors `parquet::ConvertedType` -/// Common types (logical types) used by frameworks when using Parquet. +/// Common types (converted types) used by frameworks when using Parquet. /// This helps map between types in those frameworks to the base types in Parquet. /// This is only metadata and not needed to read or write the data. #[derive(Debug, Clone, Copy, PartialEq)] -pub enum LogicalType { +pub enum ConvertedType { NONE, /// A BYTE_ARRAY actually contains UTF8 encoded chars. UTF8, @@ -284,42 +284,42 @@ pub enum ColumnOrder { impl ColumnOrder { /// Returns sort order for a physical/logical type. - pub fn get_sort_order(logical_type: LogicalType, physical_type: Type) -> SortOrder { + pub fn get_sort_order(logical_type: ConvertedType, physical_type: Type) -> SortOrder { match logical_type { // Unsigned byte-wise comparison. 
- LogicalType::UTF8 - | LogicalType::JSON - | LogicalType::BSON - | LogicalType::ENUM => SortOrder::UNSIGNED, + ConvertedType::UTF8 + | ConvertedType::JSON + | ConvertedType::BSON + | ConvertedType::ENUM => SortOrder::UNSIGNED, - LogicalType::INT_8 - | LogicalType::INT_16 - | LogicalType::INT_32 - | LogicalType::INT_64 => SortOrder::SIGNED, + ConvertedType::INT_8 + | ConvertedType::INT_16 + | ConvertedType::INT_32 + | ConvertedType::INT_64 => SortOrder::SIGNED, - LogicalType::UINT_8 - | LogicalType::UINT_16 - | LogicalType::UINT_32 - | LogicalType::UINT_64 => SortOrder::UNSIGNED, + ConvertedType::UINT_8 + | ConvertedType::UINT_16 + | ConvertedType::UINT_32 + | ConvertedType::UINT_64 => SortOrder::UNSIGNED, // Signed comparison of the represented value. - LogicalType::DECIMAL => SortOrder::SIGNED, + ConvertedType::DECIMAL => SortOrder::SIGNED, - LogicalType::DATE => SortOrder::SIGNED, + ConvertedType::DATE => SortOrder::SIGNED, - LogicalType::TIME_MILLIS - | LogicalType::TIME_MICROS - | LogicalType::TIMESTAMP_MILLIS - | LogicalType::TIMESTAMP_MICROS => SortOrder::SIGNED, + ConvertedType::TIME_MILLIS + | ConvertedType::TIME_MICROS + | ConvertedType::TIMESTAMP_MILLIS + | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED, - LogicalType::INTERVAL => SortOrder::UNSIGNED, + ConvertedType::INTERVAL => SortOrder::UNSIGNED, - LogicalType::LIST | LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { SortOrder::UNDEFINED } // Fall back to physical type. 
- LogicalType::NONE => Self::get_default_sort_order(physical_type), + ConvertedType::NONE => Self::get_default_sort_order(physical_type), } } @@ -357,7 +357,7 @@ impl fmt::Display for Type { } } -impl fmt::Display for LogicalType { +impl fmt::Display for ConvertedType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{:?}", self) } @@ -433,70 +433,74 @@ impl convert::From for parquet::Type { } // ---------------------------------------------------------------------- -// parquet::ConvertedType <=> LogicalType conversion +// parquet::ConvertedType <=> ConvertedType conversion -impl convert::From> for LogicalType { +impl convert::From> for ConvertedType { fn from(option: Option) -> Self { match option { - None => LogicalType::NONE, + None => ConvertedType::NONE, Some(value) => match value { - parquet::ConvertedType::Utf8 => LogicalType::UTF8, - parquet::ConvertedType::Map => LogicalType::MAP, - parquet::ConvertedType::MapKeyValue => LogicalType::MAP_KEY_VALUE, - parquet::ConvertedType::List => LogicalType::LIST, - parquet::ConvertedType::Enum => LogicalType::ENUM, - parquet::ConvertedType::Decimal => LogicalType::DECIMAL, - parquet::ConvertedType::Date => LogicalType::DATE, - parquet::ConvertedType::TimeMillis => LogicalType::TIME_MILLIS, - parquet::ConvertedType::TimeMicros => LogicalType::TIME_MICROS, - parquet::ConvertedType::TimestampMillis => LogicalType::TIMESTAMP_MILLIS, - parquet::ConvertedType::TimestampMicros => LogicalType::TIMESTAMP_MICROS, - parquet::ConvertedType::Uint8 => LogicalType::UINT_8, - parquet::ConvertedType::Uint16 => LogicalType::UINT_16, - parquet::ConvertedType::Uint32 => LogicalType::UINT_32, - parquet::ConvertedType::Uint64 => LogicalType::UINT_64, - parquet::ConvertedType::Int8 => LogicalType::INT_8, - parquet::ConvertedType::Int16 => LogicalType::INT_16, - parquet::ConvertedType::Int32 => LogicalType::INT_32, - parquet::ConvertedType::Int64 => LogicalType::INT_64, - parquet::ConvertedType::Json => LogicalType::JSON, - 
parquet::ConvertedType::Bson => LogicalType::BSON, - parquet::ConvertedType::Interval => LogicalType::INTERVAL, + parquet::ConvertedType::Utf8 => ConvertedType::UTF8, + parquet::ConvertedType::Map => ConvertedType::MAP, + parquet::ConvertedType::MapKeyValue => ConvertedType::MAP_KEY_VALUE, + parquet::ConvertedType::List => ConvertedType::LIST, + parquet::ConvertedType::Enum => ConvertedType::ENUM, + parquet::ConvertedType::Decimal => ConvertedType::DECIMAL, + parquet::ConvertedType::Date => ConvertedType::DATE, + parquet::ConvertedType::TimeMillis => ConvertedType::TIME_MILLIS, + parquet::ConvertedType::TimeMicros => ConvertedType::TIME_MICROS, + parquet::ConvertedType::TimestampMillis => { + ConvertedType::TIMESTAMP_MILLIS + } + parquet::ConvertedType::TimestampMicros => { + ConvertedType::TIMESTAMP_MICROS + } + parquet::ConvertedType::Uint8 => ConvertedType::UINT_8, + parquet::ConvertedType::Uint16 => ConvertedType::UINT_16, + parquet::ConvertedType::Uint32 => ConvertedType::UINT_32, + parquet::ConvertedType::Uint64 => ConvertedType::UINT_64, + parquet::ConvertedType::Int8 => ConvertedType::INT_8, + parquet::ConvertedType::Int16 => ConvertedType::INT_16, + parquet::ConvertedType::Int32 => ConvertedType::INT_32, + parquet::ConvertedType::Int64 => ConvertedType::INT_64, + parquet::ConvertedType::Json => ConvertedType::JSON, + parquet::ConvertedType::Bson => ConvertedType::BSON, + parquet::ConvertedType::Interval => ConvertedType::INTERVAL, }, } } } -impl convert::From for Option { - fn from(value: LogicalType) -> Self { +impl convert::From for Option { + fn from(value: ConvertedType) -> Self { match value { - LogicalType::NONE => None, - LogicalType::UTF8 => Some(parquet::ConvertedType::Utf8), - LogicalType::MAP => Some(parquet::ConvertedType::Map), - LogicalType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), - LogicalType::LIST => Some(parquet::ConvertedType::List), - LogicalType::ENUM => Some(parquet::ConvertedType::Enum), - LogicalType::DECIMAL => 
Some(parquet::ConvertedType::Decimal), - LogicalType::DATE => Some(parquet::ConvertedType::Date), - LogicalType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), - LogicalType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), - LogicalType::TIMESTAMP_MILLIS => { + ConvertedType::NONE => None, + ConvertedType::UTF8 => Some(parquet::ConvertedType::Utf8), + ConvertedType::MAP => Some(parquet::ConvertedType::Map), + ConvertedType::MAP_KEY_VALUE => Some(parquet::ConvertedType::MapKeyValue), + ConvertedType::LIST => Some(parquet::ConvertedType::List), + ConvertedType::ENUM => Some(parquet::ConvertedType::Enum), + ConvertedType::DECIMAL => Some(parquet::ConvertedType::Decimal), + ConvertedType::DATE => Some(parquet::ConvertedType::Date), + ConvertedType::TIME_MILLIS => Some(parquet::ConvertedType::TimeMillis), + ConvertedType::TIME_MICROS => Some(parquet::ConvertedType::TimeMicros), + ConvertedType::TIMESTAMP_MILLIS => { Some(parquet::ConvertedType::TimestampMillis) } - LogicalType::TIMESTAMP_MICROS => { + ConvertedType::TIMESTAMP_MICROS => { Some(parquet::ConvertedType::TimestampMicros) } - LogicalType::UINT_8 => Some(parquet::ConvertedType::Uint8), - LogicalType::UINT_16 => Some(parquet::ConvertedType::Uint16), - LogicalType::UINT_32 => Some(parquet::ConvertedType::Uint32), - LogicalType::UINT_64 => Some(parquet::ConvertedType::Uint64), - LogicalType::INT_8 => Some(parquet::ConvertedType::Int8), - LogicalType::INT_16 => Some(parquet::ConvertedType::Int16), - LogicalType::INT_32 => Some(parquet::ConvertedType::Int32), - LogicalType::INT_64 => Some(parquet::ConvertedType::Int64), - LogicalType::JSON => Some(parquet::ConvertedType::Json), - LogicalType::BSON => Some(parquet::ConvertedType::Bson), - LogicalType::INTERVAL => Some(parquet::ConvertedType::Interval), + ConvertedType::UINT_8 => Some(parquet::ConvertedType::Uint8), + ConvertedType::UINT_16 => Some(parquet::ConvertedType::Uint16), + ConvertedType::UINT_32 => Some(parquet::ConvertedType::Uint32), + 
ConvertedType::UINT_64 => Some(parquet::ConvertedType::Uint64), + ConvertedType::INT_8 => Some(parquet::ConvertedType::Int8), + ConvertedType::INT_16 => Some(parquet::ConvertedType::Int16), + ConvertedType::INT_32 => Some(parquet::ConvertedType::Int32), + ConvertedType::INT_64 => Some(parquet::ConvertedType::Int64), + ConvertedType::JSON => Some(parquet::ConvertedType::Json), + ConvertedType::BSON => Some(parquet::ConvertedType::Bson), + ConvertedType::INTERVAL => Some(parquet::ConvertedType::Interval), } } } @@ -647,34 +651,34 @@ impl str::FromStr for Type { } } -impl str::FromStr for LogicalType { +impl str::FromStr for ConvertedType { type Err = ParquetError; fn from_str(s: &str) -> result::Result { match s { - "NONE" => Ok(LogicalType::NONE), - "UTF8" => Ok(LogicalType::UTF8), - "MAP" => Ok(LogicalType::MAP), - "MAP_KEY_VALUE" => Ok(LogicalType::MAP_KEY_VALUE), - "LIST" => Ok(LogicalType::LIST), - "ENUM" => Ok(LogicalType::ENUM), - "DECIMAL" => Ok(LogicalType::DECIMAL), - "DATE" => Ok(LogicalType::DATE), - "TIME_MILLIS" => Ok(LogicalType::TIME_MILLIS), - "TIME_MICROS" => Ok(LogicalType::TIME_MICROS), - "TIMESTAMP_MILLIS" => Ok(LogicalType::TIMESTAMP_MILLIS), - "TIMESTAMP_MICROS" => Ok(LogicalType::TIMESTAMP_MICROS), - "UINT_8" => Ok(LogicalType::UINT_8), - "UINT_16" => Ok(LogicalType::UINT_16), - "UINT_32" => Ok(LogicalType::UINT_32), - "UINT_64" => Ok(LogicalType::UINT_64), - "INT_8" => Ok(LogicalType::INT_8), - "INT_16" => Ok(LogicalType::INT_16), - "INT_32" => Ok(LogicalType::INT_32), - "INT_64" => Ok(LogicalType::INT_64), - "JSON" => Ok(LogicalType::JSON), - "BSON" => Ok(LogicalType::BSON), - "INTERVAL" => Ok(LogicalType::INTERVAL), + "NONE" => Ok(ConvertedType::NONE), + "UTF8" => Ok(ConvertedType::UTF8), + "MAP" => Ok(ConvertedType::MAP), + "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), + "LIST" => Ok(ConvertedType::LIST), + "ENUM" => Ok(ConvertedType::ENUM), + "DECIMAL" => Ok(ConvertedType::DECIMAL), + "DATE" => Ok(ConvertedType::DATE), + 
"TIME_MILLIS" => Ok(ConvertedType::TIME_MILLIS), + "TIME_MICROS" => Ok(ConvertedType::TIME_MICROS), + "TIMESTAMP_MILLIS" => Ok(ConvertedType::TIMESTAMP_MILLIS), + "TIMESTAMP_MICROS" => Ok(ConvertedType::TIMESTAMP_MICROS), + "UINT_8" => Ok(ConvertedType::UINT_8), + "UINT_16" => Ok(ConvertedType::UINT_16), + "UINT_32" => Ok(ConvertedType::UINT_32), + "UINT_64" => Ok(ConvertedType::UINT_64), + "INT_8" => Ok(ConvertedType::INT_8), + "INT_16" => Ok(ConvertedType::INT_16), + "INT_32" => Ok(ConvertedType::INT_32), + "INT_64" => Ok(ConvertedType::INT_64), + "JSON" => Ok(ConvertedType::JSON), + "BSON" => Ok(ConvertedType::BSON), + "INTERVAL" => Ok(ConvertedType::INTERVAL), other => Err(general_err!("Invalid logical type {}", other)), } } @@ -771,363 +775,384 @@ mod tests { #[test] fn test_display_logical_type() { - assert_eq!(LogicalType::NONE.to_string(), "NONE"); - assert_eq!(LogicalType::UTF8.to_string(), "UTF8"); - assert_eq!(LogicalType::MAP.to_string(), "MAP"); - assert_eq!(LogicalType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); - assert_eq!(LogicalType::LIST.to_string(), "LIST"); - assert_eq!(LogicalType::ENUM.to_string(), "ENUM"); - assert_eq!(LogicalType::DECIMAL.to_string(), "DECIMAL"); - assert_eq!(LogicalType::DATE.to_string(), "DATE"); - assert_eq!(LogicalType::TIME_MILLIS.to_string(), "TIME_MILLIS"); - assert_eq!(LogicalType::DATE.to_string(), "DATE"); - assert_eq!(LogicalType::TIME_MICROS.to_string(), "TIME_MICROS"); - assert_eq!( - LogicalType::TIMESTAMP_MILLIS.to_string(), + assert_eq!(ConvertedType::NONE.to_string(), "NONE"); + assert_eq!(ConvertedType::UTF8.to_string(), "UTF8"); + assert_eq!(ConvertedType::MAP.to_string(), "MAP"); + assert_eq!(ConvertedType::MAP_KEY_VALUE.to_string(), "MAP_KEY_VALUE"); + assert_eq!(ConvertedType::LIST.to_string(), "LIST"); + assert_eq!(ConvertedType::ENUM.to_string(), "ENUM"); + assert_eq!(ConvertedType::DECIMAL.to_string(), "DECIMAL"); + assert_eq!(ConvertedType::DATE.to_string(), "DATE"); + 
assert_eq!(ConvertedType::TIME_MILLIS.to_string(), "TIME_MILLIS"); + assert_eq!(ConvertedType::DATE.to_string(), "DATE"); + assert_eq!(ConvertedType::TIME_MICROS.to_string(), "TIME_MICROS"); + assert_eq!( + ConvertedType::TIMESTAMP_MILLIS.to_string(), "TIMESTAMP_MILLIS" ); assert_eq!( - LogicalType::TIMESTAMP_MICROS.to_string(), + ConvertedType::TIMESTAMP_MICROS.to_string(), "TIMESTAMP_MICROS" ); - assert_eq!(LogicalType::UINT_8.to_string(), "UINT_8"); - assert_eq!(LogicalType::UINT_16.to_string(), "UINT_16"); - assert_eq!(LogicalType::UINT_32.to_string(), "UINT_32"); - assert_eq!(LogicalType::UINT_64.to_string(), "UINT_64"); - assert_eq!(LogicalType::INT_8.to_string(), "INT_8"); - assert_eq!(LogicalType::INT_16.to_string(), "INT_16"); - assert_eq!(LogicalType::INT_32.to_string(), "INT_32"); - assert_eq!(LogicalType::INT_64.to_string(), "INT_64"); - assert_eq!(LogicalType::JSON.to_string(), "JSON"); - assert_eq!(LogicalType::BSON.to_string(), "BSON"); - assert_eq!(LogicalType::INTERVAL.to_string(), "INTERVAL"); + assert_eq!(ConvertedType::UINT_8.to_string(), "UINT_8"); + assert_eq!(ConvertedType::UINT_16.to_string(), "UINT_16"); + assert_eq!(ConvertedType::UINT_32.to_string(), "UINT_32"); + assert_eq!(ConvertedType::UINT_64.to_string(), "UINT_64"); + assert_eq!(ConvertedType::INT_8.to_string(), "INT_8"); + assert_eq!(ConvertedType::INT_16.to_string(), "INT_16"); + assert_eq!(ConvertedType::INT_32.to_string(), "INT_32"); + assert_eq!(ConvertedType::INT_64.to_string(), "INT_64"); + assert_eq!(ConvertedType::JSON.to_string(), "JSON"); + assert_eq!(ConvertedType::BSON.to_string(), "BSON"); + assert_eq!(ConvertedType::INTERVAL.to_string(), "INTERVAL"); } #[test] fn test_from_logical_type() { - assert_eq!(LogicalType::from(None), LogicalType::NONE); + assert_eq!(ConvertedType::from(None), ConvertedType::NONE); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Utf8)), - LogicalType::UTF8 + ConvertedType::from(Some(parquet::ConvertedType::Utf8)), + 
ConvertedType::UTF8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Map)), - LogicalType::MAP + ConvertedType::from(Some(parquet::ConvertedType::Map)), + ConvertedType::MAP ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::MapKeyValue)), - LogicalType::MAP_KEY_VALUE + ConvertedType::from(Some(parquet::ConvertedType::MapKeyValue)), + ConvertedType::MAP_KEY_VALUE ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::List)), - LogicalType::LIST + ConvertedType::from(Some(parquet::ConvertedType::List)), + ConvertedType::LIST ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Enum)), - LogicalType::ENUM + ConvertedType::from(Some(parquet::ConvertedType::Enum)), + ConvertedType::ENUM ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Decimal)), - LogicalType::DECIMAL + ConvertedType::from(Some(parquet::ConvertedType::Decimal)), + ConvertedType::DECIMAL ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Date)), - LogicalType::DATE + ConvertedType::from(Some(parquet::ConvertedType::Date)), + ConvertedType::DATE ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimeMillis)), - LogicalType::TIME_MILLIS + ConvertedType::from(Some(parquet::ConvertedType::TimeMillis)), + ConvertedType::TIME_MILLIS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimeMicros)), - LogicalType::TIME_MICROS + ConvertedType::from(Some(parquet::ConvertedType::TimeMicros)), + ConvertedType::TIME_MICROS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimestampMillis)), - LogicalType::TIMESTAMP_MILLIS + ConvertedType::from(Some(parquet::ConvertedType::TimestampMillis)), + ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::TimestampMicros)), - LogicalType::TIMESTAMP_MICROS + ConvertedType::from(Some(parquet::ConvertedType::TimestampMicros)), + ConvertedType::TIMESTAMP_MICROS ); assert_eq!( - 
LogicalType::from(Some(parquet::ConvertedType::Uint8)), - LogicalType::UINT_8 + ConvertedType::from(Some(parquet::ConvertedType::Uint8)), + ConvertedType::UINT_8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint16)), - LogicalType::UINT_16 + ConvertedType::from(Some(parquet::ConvertedType::Uint16)), + ConvertedType::UINT_16 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint32)), - LogicalType::UINT_32 + ConvertedType::from(Some(parquet::ConvertedType::Uint32)), + ConvertedType::UINT_32 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Uint64)), - LogicalType::UINT_64 + ConvertedType::from(Some(parquet::ConvertedType::Uint64)), + ConvertedType::UINT_64 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int8)), - LogicalType::INT_8 + ConvertedType::from(Some(parquet::ConvertedType::Int8)), + ConvertedType::INT_8 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int16)), - LogicalType::INT_16 + ConvertedType::from(Some(parquet::ConvertedType::Int16)), + ConvertedType::INT_16 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int32)), - LogicalType::INT_32 + ConvertedType::from(Some(parquet::ConvertedType::Int32)), + ConvertedType::INT_32 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Int64)), - LogicalType::INT_64 + ConvertedType::from(Some(parquet::ConvertedType::Int64)), + ConvertedType::INT_64 ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Json)), - LogicalType::JSON + ConvertedType::from(Some(parquet::ConvertedType::Json)), + ConvertedType::JSON ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Bson)), - LogicalType::BSON + ConvertedType::from(Some(parquet::ConvertedType::Bson)), + ConvertedType::BSON ); assert_eq!( - LogicalType::from(Some(parquet::ConvertedType::Interval)), - LogicalType::INTERVAL + ConvertedType::from(Some(parquet::ConvertedType::Interval)), + ConvertedType::INTERVAL ); } #[test] fn test_into_logical_type() 
{ let converted_type: Option = None; - assert_eq!(converted_type, LogicalType::NONE.into()); - assert_eq!(Some(parquet::ConvertedType::Utf8), LogicalType::UTF8.into()); - assert_eq!(Some(parquet::ConvertedType::Map), LogicalType::MAP.into()); + assert_eq!(converted_type, ConvertedType::NONE.into()); + assert_eq!( + Some(parquet::ConvertedType::Utf8), + ConvertedType::UTF8.into() + ); + assert_eq!(Some(parquet::ConvertedType::Map), ConvertedType::MAP.into()); assert_eq!( Some(parquet::ConvertedType::MapKeyValue), - LogicalType::MAP_KEY_VALUE.into() + ConvertedType::MAP_KEY_VALUE.into() + ); + assert_eq!( + Some(parquet::ConvertedType::List), + ConvertedType::LIST.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Enum), + ConvertedType::ENUM.into() ); - assert_eq!(Some(parquet::ConvertedType::List), LogicalType::LIST.into()); - assert_eq!(Some(parquet::ConvertedType::Enum), LogicalType::ENUM.into()); assert_eq!( Some(parquet::ConvertedType::Decimal), - LogicalType::DECIMAL.into() + ConvertedType::DECIMAL.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Date), + ConvertedType::DATE.into() ); - assert_eq!(Some(parquet::ConvertedType::Date), LogicalType::DATE.into()); assert_eq!( Some(parquet::ConvertedType::TimeMillis), - LogicalType::TIME_MILLIS.into() + ConvertedType::TIME_MILLIS.into() ); assert_eq!( Some(parquet::ConvertedType::TimeMicros), - LogicalType::TIME_MICROS.into() + ConvertedType::TIME_MICROS.into() ); assert_eq!( Some(parquet::ConvertedType::TimestampMillis), - LogicalType::TIMESTAMP_MILLIS.into() + ConvertedType::TIMESTAMP_MILLIS.into() ); assert_eq!( Some(parquet::ConvertedType::TimestampMicros), - LogicalType::TIMESTAMP_MICROS.into() + ConvertedType::TIMESTAMP_MICROS.into() ); assert_eq!( Some(parquet::ConvertedType::Uint8), - LogicalType::UINT_8.into() + ConvertedType::UINT_8.into() ); assert_eq!( Some(parquet::ConvertedType::Uint16), - LogicalType::UINT_16.into() + ConvertedType::UINT_16.into() ); assert_eq!( 
Some(parquet::ConvertedType::Uint32), - LogicalType::UINT_32.into() + ConvertedType::UINT_32.into() ); assert_eq!( Some(parquet::ConvertedType::Uint64), - LogicalType::UINT_64.into() + ConvertedType::UINT_64.into() ); assert_eq!( Some(parquet::ConvertedType::Int8), - LogicalType::INT_8.into() + ConvertedType::INT_8.into() ); assert_eq!( Some(parquet::ConvertedType::Int16), - LogicalType::INT_16.into() + ConvertedType::INT_16.into() ); assert_eq!( Some(parquet::ConvertedType::Int32), - LogicalType::INT_32.into() + ConvertedType::INT_32.into() ); assert_eq!( Some(parquet::ConvertedType::Int64), - LogicalType::INT_64.into() + ConvertedType::INT_64.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Json), + ConvertedType::JSON.into() + ); + assert_eq!( + Some(parquet::ConvertedType::Bson), + ConvertedType::BSON.into() ); - assert_eq!(Some(parquet::ConvertedType::Json), LogicalType::JSON.into()); - assert_eq!(Some(parquet::ConvertedType::Bson), LogicalType::BSON.into()); assert_eq!( Some(parquet::ConvertedType::Interval), - LogicalType::INTERVAL.into() + ConvertedType::INTERVAL.into() ); } #[test] fn test_from_string_into_logical_type() { assert_eq!( - LogicalType::NONE + ConvertedType::NONE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::NONE + ConvertedType::NONE ); assert_eq!( - LogicalType::UTF8 + ConvertedType::UTF8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UTF8 + ConvertedType::UTF8 ); assert_eq!( - LogicalType::MAP.to_string().parse::().unwrap(), - LogicalType::MAP + ConvertedType::MAP + .to_string() + .parse::() + .unwrap(), + ConvertedType::MAP ); assert_eq!( - LogicalType::MAP_KEY_VALUE + ConvertedType::MAP_KEY_VALUE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::MAP_KEY_VALUE + ConvertedType::MAP_KEY_VALUE ); assert_eq!( - LogicalType::LIST + ConvertedType::LIST .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::LIST + ConvertedType::LIST ); assert_eq!( - LogicalType::ENUM + 
ConvertedType::ENUM .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::ENUM + ConvertedType::ENUM ); assert_eq!( - LogicalType::DECIMAL + ConvertedType::DECIMAL .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::DECIMAL + ConvertedType::DECIMAL ); assert_eq!( - LogicalType::DATE + ConvertedType::DATE .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::DATE + ConvertedType::DATE ); assert_eq!( - LogicalType::TIME_MILLIS + ConvertedType::TIME_MILLIS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIME_MILLIS + ConvertedType::TIME_MILLIS ); assert_eq!( - LogicalType::TIME_MICROS + ConvertedType::TIME_MICROS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIME_MICROS + ConvertedType::TIME_MICROS ); assert_eq!( - LogicalType::TIMESTAMP_MILLIS + ConvertedType::TIMESTAMP_MILLIS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIMESTAMP_MILLIS + ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( - LogicalType::TIMESTAMP_MICROS + ConvertedType::TIMESTAMP_MICROS .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::TIMESTAMP_MICROS + ConvertedType::TIMESTAMP_MICROS ); assert_eq!( - LogicalType::UINT_8 + ConvertedType::UINT_8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_8 + ConvertedType::UINT_8 ); assert_eq!( - LogicalType::UINT_16 + ConvertedType::UINT_16 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_16 + ConvertedType::UINT_16 ); assert_eq!( - LogicalType::UINT_32 + ConvertedType::UINT_32 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_32 + ConvertedType::UINT_32 ); assert_eq!( - LogicalType::UINT_64 + ConvertedType::UINT_64 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::UINT_64 + ConvertedType::UINT_64 ); assert_eq!( - LogicalType::INT_8 + ConvertedType::INT_8 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_8 + ConvertedType::INT_8 ); assert_eq!( - LogicalType::INT_16 
+ ConvertedType::INT_16 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_16 + ConvertedType::INT_16 ); assert_eq!( - LogicalType::INT_32 + ConvertedType::INT_32 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_32 + ConvertedType::INT_32 ); assert_eq!( - LogicalType::INT_64 + ConvertedType::INT_64 .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INT_64 + ConvertedType::INT_64 ); assert_eq!( - LogicalType::JSON + ConvertedType::JSON .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::JSON + ConvertedType::JSON ); assert_eq!( - LogicalType::BSON + ConvertedType::BSON .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::BSON + ConvertedType::BSON ); assert_eq!( - LogicalType::INTERVAL + ConvertedType::INTERVAL .to_string() - .parse::() + .parse::() .unwrap(), - LogicalType::INTERVAL + ConvertedType::INTERVAL ); } @@ -1392,7 +1417,7 @@ mod tests { fn test_column_order_get_sort_order() { // Helper to check the order in a list of values. // Only logical type is checked. 
- fn check_sort_order(types: Vec, expected_order: SortOrder) { + fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( ColumnOrder::get_sort_order(tpe, Type::BYTE_ARRAY), @@ -1403,44 +1428,44 @@ mod tests { // Unsigned comparison (physical type does not matter) let unsigned = vec![ - LogicalType::UTF8, - LogicalType::JSON, - LogicalType::BSON, - LogicalType::ENUM, - LogicalType::UINT_8, - LogicalType::UINT_16, - LogicalType::UINT_32, - LogicalType::UINT_64, - LogicalType::INTERVAL, + ConvertedType::UTF8, + ConvertedType::JSON, + ConvertedType::BSON, + ConvertedType::ENUM, + ConvertedType::UINT_8, + ConvertedType::UINT_16, + ConvertedType::UINT_32, + ConvertedType::UINT_64, + ConvertedType::INTERVAL, ]; check_sort_order(unsigned, SortOrder::UNSIGNED); // Signed comparison (physical type does not matter) let signed = vec![ - LogicalType::INT_8, - LogicalType::INT_16, - LogicalType::INT_32, - LogicalType::INT_64, - LogicalType::DECIMAL, - LogicalType::DATE, - LogicalType::TIME_MILLIS, - LogicalType::TIME_MICROS, - LogicalType::TIMESTAMP_MILLIS, - LogicalType::TIMESTAMP_MICROS, + ConvertedType::INT_8, + ConvertedType::INT_16, + ConvertedType::INT_32, + ConvertedType::INT_64, + ConvertedType::DECIMAL, + ConvertedType::DATE, + ConvertedType::TIME_MILLIS, + ConvertedType::TIME_MICROS, + ConvertedType::TIMESTAMP_MILLIS, + ConvertedType::TIMESTAMP_MICROS, ]; check_sort_order(signed, SortOrder::SIGNED); // Undefined comparison let undefined = vec![ - LogicalType::LIST, - LogicalType::MAP, - LogicalType::MAP_KEY_VALUE, + ConvertedType::LIST, + ConvertedType::MAP, + ConvertedType::MAP_KEY_VALUE, ]; check_sort_order(undefined, SortOrder::UNDEFINED); // Check None logical type // This should return a sort order for byte array type. 
- check_sort_order(vec![LogicalType::NONE], SortOrder::UNSIGNED); + check_sort_order(vec![ConvertedType::NONE], SortOrder::UNSIGNED); } #[test] diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 483ed6f9c3c..1704c8ef610 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -1024,7 +1024,7 @@ mod tests { fn get_test_int32_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_length(-1) .build() .expect("build() should be OK") @@ -1034,7 +1034,7 @@ mod tests { fn get_test_int64_type() -> SchemaType { SchemaType::primitive_type_builder("a", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .with_length(-1) .build() .expect("build() should be OK") diff --git a/rust/parquet/src/file/footer.rs b/rust/parquet/src/file/footer.rs index bffe5797c53..ca896a8ad13 100644 --- a/rust/parquet/src/file/footer.rs +++ b/rust/parquet/src/file/footer.rs @@ -139,7 +139,7 @@ fn parse_column_orders( match orders[i] { TColumnOrder::TYPEORDER(_) => { let sort_order = ColumnOrder::get_sort_order( - column.logical_type(), + column.converted_type(), column.physical_type(), ); res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order)); diff --git a/rust/parquet/src/record/api.rs b/rust/parquet/src/record/api.rs index a9a96dd850a..28f8dc949e6 100644 --- a/rust/parquet/src/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -22,7 +22,7 @@ use std::fmt; use chrono::{TimeZone, Utc}; use num_bigint::{BigInt, Sign}; -use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::data_type::{ByteArray, Decimal, Int96}; use crate::errors::{ParquetError, Result}; use crate::schema::v1::types::ColumnDescPtr; @@ -34,9 
+34,9 @@ use serde_json::Value; macro_rules! nyi { ($column_descr:ident, $value:ident) => {{ unimplemented!( - "Conversion for physical type {}, logical type {}, value {:?}", + "Conversion for physical type {}, converted type {}, value {:?}", $column_descr.physical_type(), - $column_descr.logical_type(), + $column_descr.converted_type(), $value ); }}; @@ -562,18 +562,18 @@ impl Field { Field::Bool(value) } - /// Converts Parquet INT32 type with logical type into `i32` value. + /// Converts Parquet INT32 type with converted type into `i32` value. #[inline] pub fn convert_int32(descr: &ColumnDescPtr, value: i32) -> Self { - match descr.logical_type() { - LogicalType::INT_8 => Field::Byte(value as i8), - LogicalType::INT_16 => Field::Short(value as i16), - LogicalType::INT_32 | LogicalType::NONE => Field::Int(value), - LogicalType::UINT_8 => Field::UByte(value as u8), - LogicalType::UINT_16 => Field::UShort(value as u16), - LogicalType::UINT_32 => Field::UInt(value as u32), - LogicalType::DATE => Field::Date(value as u32), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_i32( + match descr.converted_type() { + ConvertedType::INT_8 => Field::Byte(value as i8), + ConvertedType::INT_16 => Field::Short(value as i16), + ConvertedType::INT_32 | ConvertedType::NONE => Field::Int(value), + ConvertedType::UINT_8 => Field::UByte(value as u8), + ConvertedType::UINT_16 => Field::UShort(value as u16), + ConvertedType::UINT_32 => Field::UInt(value as u32), + ConvertedType::DATE => Field::Date(value as u32), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i32( value, descr.type_precision(), descr.type_scale(), @@ -582,15 +582,15 @@ impl Field { } } - /// Converts Parquet INT64 type with logical type into `i64` value. + /// Converts Parquet INT64 type with converted type into `i64` value. 
#[inline] pub fn convert_int64(descr: &ColumnDescPtr, value: i64) -> Self { - match descr.logical_type() { - LogicalType::INT_64 | LogicalType::NONE => Field::Long(value), - LogicalType::UINT_64 => Field::ULong(value as u64), - LogicalType::TIMESTAMP_MILLIS => Field::TimestampMillis(value as u64), - LogicalType::TIMESTAMP_MICROS => Field::TimestampMicros(value as u64), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_i64( + match descr.converted_type() { + ConvertedType::INT_64 | ConvertedType::NONE => Field::Long(value), + ConvertedType::UINT_64 => Field::ULong(value as u64), + ConvertedType::TIMESTAMP_MILLIS => Field::TimestampMillis(value as u64), + ConvertedType::TIMESTAMP_MICROS => Field::TimestampMicros(value as u64), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_i64( value, descr.type_precision(), descr.type_scale(), @@ -612,37 +612,37 @@ impl Field { Field::Float(value) } - /// Converts Parquet DOUBLE type with logical type into `f64` value. + /// Converts Parquet DOUBLE type with converted type into `f64` value. #[inline] pub fn convert_double(_descr: &ColumnDescPtr, value: f64) -> Self { Field::Double(value) } - /// Converts Parquet BYTE_ARRAY type with logical type into either UTF8 string or + /// Converts Parquet BYTE_ARRAY type with converted type into either UTF8 string or /// array of bytes. 
#[inline] pub fn convert_byte_array(descr: &ColumnDescPtr, value: ByteArray) -> Self { match descr.physical_type() { - PhysicalType::BYTE_ARRAY => match descr.logical_type() { - LogicalType::UTF8 | LogicalType::ENUM | LogicalType::JSON => { + PhysicalType::BYTE_ARRAY => match descr.converted_type() { + ConvertedType::UTF8 | ConvertedType::ENUM | ConvertedType::JSON => { let value = String::from_utf8(value.data().to_vec()).unwrap(); Field::Str(value) } - LogicalType::BSON | LogicalType::NONE => Field::Bytes(value), - LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + ConvertedType::BSON | ConvertedType::NONE => Field::Bytes(value), + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_bytes( value, descr.type_precision(), descr.type_scale(), )), _ => nyi!(descr, value), }, - PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.logical_type() { - LogicalType::DECIMAL => Field::Decimal(Decimal::from_bytes( + PhysicalType::FIXED_LEN_BYTE_ARRAY => match descr.converted_type() { + ConvertedType::DECIMAL => Field::Decimal(Decimal::from_bytes( value, descr.type_precision(), descr.type_scale(), )), - LogicalType::NONE => Field::Bytes(value), + ConvertedType::NONE => Field::Bytes(value), _ => nyi!(descr, value), }, _ => nyi!(descr, value), @@ -839,7 +839,7 @@ mod tests { macro_rules! 
make_column_descr { ($physical_type:expr, $logical_type:expr) => {{ let tpe = PrimitiveTypeBuilder::new("col", $physical_type) - .with_logical_type($logical_type) + .with_converted_type($logical_type) .build() .unwrap(); Arc::new(ColumnDescriptor::new( @@ -851,7 +851,7 @@ mod tests { }}; ($physical_type:expr, $logical_type:expr, $len:expr, $prec:expr, $scale:expr) => {{ let tpe = PrimitiveTypeBuilder::new("col", $physical_type) - .with_logical_type($logical_type) + .with_converted_type($logical_type) .with_length($len) .with_precision($prec) .with_scale($scale) @@ -869,7 +869,7 @@ mod tests { #[test] fn test_row_convert_bool() { // BOOLEAN value does not depend on logical type - let descr = make_column_descr![PhysicalType::BOOLEAN, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::BOOLEAN, ConvertedType::NONE]; let row = Field::convert_bool(&descr, true); assert_eq!(row, Field::Bool(true)); @@ -880,70 +880,70 @@ mod tests { #[test] fn test_row_convert_int32() { - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_8]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_8]; let row = Field::convert_int32(&descr, 111); assert_eq!(row, Field::Byte(111)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_16]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_16]; let row = Field::convert_int32(&descr, 222); assert_eq!(row, Field::Short(222)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::INT_32]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::INT_32]; let row = Field::convert_int32(&descr, 333); assert_eq!(row, Field::Int(333)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_8]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_8]; let row = Field::convert_int32(&descr, -1); assert_eq!(row, Field::UByte(255)); - let descr = make_column_descr![PhysicalType::INT32, 
LogicalType::UINT_16]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_16]; let row = Field::convert_int32(&descr, 256); assert_eq!(row, Field::UShort(256)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::UINT_32]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::UINT_32]; let row = Field::convert_int32(&descr, 1234); assert_eq!(row, Field::UInt(1234)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::NONE]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Int(444)); - let descr = make_column_descr![PhysicalType::INT32, LogicalType::DATE]; + let descr = make_column_descr![PhysicalType::INT32, ConvertedType::DATE]; let row = Field::convert_int32(&descr, 14611); assert_eq!(row, Field::Date(14611)); let descr = - make_column_descr![PhysicalType::INT32, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::INT32, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int32(&descr, 444); assert_eq!(row, Field::Decimal(Decimal::from_i32(444, 8, 2))); } #[test] fn test_row_convert_int64() { - let descr = make_column_descr![PhysicalType::INT64, LogicalType::INT_64]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::INT_64]; let row = Field::convert_int64(&descr, 1111); assert_eq!(row, Field::Long(1111)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::UINT_64]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::UINT_64]; let row = Field::convert_int64(&descr, 78239823); assert_eq!(row, Field::ULong(78239823)); let descr = - make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MILLIS]; + make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MILLIS]; let row = Field::convert_int64(&descr, 1541186529153); assert_eq!(row, Field::TimestampMillis(1541186529153)); let descr = - 
make_column_descr![PhysicalType::INT64, LogicalType::TIMESTAMP_MICROS]; + make_column_descr![PhysicalType::INT64, ConvertedType::TIMESTAMP_MICROS]; let row = Field::convert_int64(&descr, 1541186529153123); assert_eq!(row, Field::TimestampMicros(1541186529153123)); - let descr = make_column_descr![PhysicalType::INT64, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT64, ConvertedType::NONE]; let row = Field::convert_int64(&descr, 2222); assert_eq!(row, Field::Long(2222)); let descr = - make_column_descr![PhysicalType::INT64, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::INT64, ConvertedType::DECIMAL, 0, 8, 2]; let row = Field::convert_int64(&descr, 3333); assert_eq!(row, Field::Decimal(Decimal::from_i64(3333, 8, 2))); } @@ -951,7 +951,7 @@ mod tests { #[test] fn test_row_convert_int96() { // INT96 value does not depend on logical type - let descr = make_column_descr![PhysicalType::INT96, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::INT96, ConvertedType::NONE]; let value = Int96::from(vec![0, 0, 2454923]); let row = Field::convert_int96(&descr, value); @@ -965,7 +965,7 @@ mod tests { #[test] fn test_row_convert_float() { // FLOAT value does not depend on logical type - let descr = make_column_descr![PhysicalType::FLOAT, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::FLOAT, ConvertedType::NONE]; let row = Field::convert_float(&descr, 2.31); assert_eq!(row, Field::Float(2.31)); } @@ -973,7 +973,7 @@ mod tests { #[test] fn test_row_convert_double() { // DOUBLE value does not depend on logical type - let descr = make_column_descr![PhysicalType::DOUBLE, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::DOUBLE, ConvertedType::NONE]; let row = Field::convert_double(&descr, 1.56); assert_eq!(row, Field::Double(1.56)); } @@ -981,38 +981,38 @@ mod tests { #[test] fn test_row_convert_byte_array() { // UTF8 - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, 
LogicalType::UTF8]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::UTF8]; let value = ByteArray::from(vec![b'A', b'B', b'C', b'D']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("ABCD".to_string())); // ENUM - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::ENUM]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::ENUM]; let value = ByteArray::from(vec![b'1', b'2', b'3']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("123".to_string())); // JSON - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::JSON]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::JSON]; let value = ByteArray::from(vec![b'{', b'"', b'a', b'"', b':', b'1', b'}']); let row = Field::convert_byte_array(&descr, value); assert_eq!(row, Field::Str("{\"a\":1}".to_string())); // NONE - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::NONE]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::NONE]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Bytes(value)); // BSON - let descr = make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::BSON]; + let descr = make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::BSON]; let value = ByteArray::from(vec![1, 2, 3, 4, 5]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Bytes(value)); // DECIMAL let descr = - make_column_descr![PhysicalType::BYTE_ARRAY, LogicalType::DECIMAL, 0, 8, 2]; + make_column_descr![PhysicalType::BYTE_ARRAY, ConvertedType::DECIMAL, 0, 8, 2]; let value = ByteArray::from(vec![207, 200]); let row = Field::convert_byte_array(&descr, value.clone()); assert_eq!(row, Field::Decimal(Decimal::from_bytes(value, 8, 2))); @@ -1020,7 +1020,7 @@ mod tests { // DECIMAL (FIXED_LEN_BYTE_ARRAY) let descr = 
make_column_descr![ PhysicalType::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, + ConvertedType::DECIMAL, 8, 17, 5 @@ -1032,7 +1032,7 @@ mod tests { // NONE (FIXED_LEN_BYTE_ARRAY) let descr = make_column_descr![ PhysicalType::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, + ConvertedType::NONE, 6, 0, 0 diff --git a/rust/parquet/src/record/reader.rs b/rust/parquet/src/record/reader.rs index eefcae20786..d8902480cfa 100644 --- a/rust/parquet/src/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -20,14 +20,16 @@ use std::{collections::HashMap, fmt, sync::Arc}; -use crate::basic::{LogicalType, Repetition}; +use crate::basic::{ConvertedType, Repetition}; use crate::errors::{ParquetError, Result}; use crate::file::reader::{FileReader, RowGroupReader}; use crate::record::{ api::{make_list, make_map, make_row, Field, Row}, triplet::TripletIter, }; -use crate::schema::v1::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; +use crate::schema::v1::types::{ + ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr, +}; /// Default batch size for a reader const DEFAULT_BATCH_SIZE: usize = 1024; @@ -138,9 +140,9 @@ impl TreeBuilder { let column = TripletIter::new(col_descr, col_reader, self.batch_size); Reader::PrimitiveReader(field, column) } else { - match field.get_basic_info().logical_type() { + match field.get_basic_info().converted_type() { // List types - LogicalType::LIST => { + ConvertedType::LIST => { assert_eq!( field.get_fields().len(), 1, @@ -198,7 +200,7 @@ impl TreeBuilder { } } // Map types (key-value pairs) - LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { assert_eq!( field.get_fields().len(), 1, @@ -269,7 +271,7 @@ impl TreeBuilder { _ if repetition == Repetition::REPEATED => { let required_field = Type::group_type_builder(field.name()) .with_repetition(Repetition::REQUIRED) - .with_logical_type(field.get_basic_info().logical_type()) + 
.with_converted_type(field.get_basic_info().converted_type()) .with_fields(&mut Vec::from(field.get_fields())) .build() .unwrap(); diff --git a/rust/parquet/src/schema/mod.rs b/rust/parquet/src/schema/mod.rs index e48acb3c26a..749ab1fe783 100644 --- a/rust/parquet/src/schema/mod.rs +++ b/rust/parquet/src/schema/mod.rs @@ -16,4 +16,4 @@ // under the License. pub mod v1; -pub mod v2; \ No newline at end of file +pub mod v2; diff --git a/rust/parquet/src/schema/v1/mod.rs b/rust/parquet/src/schema/v1/mod.rs index ce153ec10c3..8cb7f23b57a 100644 --- a/rust/parquet/src/schema/v1/mod.rs +++ b/rust/parquet/src/schema/v1/mod.rs @@ -21,7 +21,7 @@ //! //! ```rust //! use parquet::{ -//! basic::{LogicalType, Repetition, Type as PhysicalType}, +//! basic::{ConvertedType, Repetition, Type as PhysicalType}, //! schema::v1::{parser, printer, types::Type}, //! }; //! use std::sync::Arc; @@ -34,7 +34,7 @@ //! // } //! //! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) -//! .with_logical_type(LogicalType::UTF8) +//! .with_converted_type(ConvertedType::UTF8) //! .with_repetition(Repetition::OPTIONAL) //! .build() //! 
.unwrap(); @@ -64,4 +64,4 @@ pub mod parser; pub mod printer; pub mod types; -pub mod visitor; \ No newline at end of file +pub mod visitor; diff --git a/rust/parquet/src/schema/v1/parser.rs b/rust/parquet/src/schema/v1/parser.rs index a5f9a468417..b9b8c77aeb1 100644 --- a/rust/parquet/src/schema/v1/parser.rs +++ b/rust/parquet/src/schema/v1/parser.rs @@ -44,7 +44,7 @@ use std::sync::Arc; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; use crate::schema::v1::types::{Type, TypePtr}; @@ -223,17 +223,17 @@ impl<'a> Parser<'a> { .ok_or_else(|| general_err!("Expected name, found None"))?; // Parse logical type if exists - let logical_type = if let Some("(") = self.tokenizer.next() { + let converted_type = if let Some("(") = self.tokenizer.next() { let tpe = self .tokenizer .next() .ok_or_else(|| general_err!("Expected logical type, found None")) - .and_then(|v| v.to_uppercase().parse::())?; + .and_then(|v| v.to_uppercase().parse::())?; assert_token(self.tokenizer.next(), ")")?; tpe } else { self.tokenizer.backtrack(); - LogicalType::NONE + ConvertedType::NONE }; // Parse optional id @@ -246,7 +246,7 @@ impl<'a> Parser<'a> { let mut fields = self.parse_child_types()?; let mut builder = Type::group_type_builder(name) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_fields(&mut fields); if let Some(rep) = repetition { builder = builder.with_repetition(rep); @@ -281,18 +281,19 @@ impl<'a> Parser<'a> { .ok_or_else(|| general_err!("Expected name, found None"))?; // Parse logical type - let (logical_type, precision, scale) = if let Some("(") = self.tokenizer.next() { + let (converted_type, precision, scale) = if let Some("(") = self.tokenizer.next() + { let tpe = self .tokenizer .next() .ok_or_else(|| general_err!("Expected logical type, found None")) - .and_then(|v| v.to_uppercase().parse::())?; + .and_then(|v| 
v.to_uppercase().parse::())?; // Parse precision and scale for decimals let mut precision: i32 = -1; let mut scale: i32 = -1; - if tpe == LogicalType::DECIMAL { + if tpe == ConvertedType::DECIMAL { if let Some("(") = self.tokenizer.next() { // Parse precision precision = parse_i32( @@ -324,7 +325,7 @@ impl<'a> Parser<'a> { (tpe, precision, scale) } else { self.tokenizer.backtrack(); - (LogicalType::NONE, -1, -1) + (ConvertedType::NONE, -1, -1) }; // Parse optional id @@ -338,7 +339,7 @@ impl<'a> Parser<'a> { let mut builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_length(length) .with_precision(precision) .with_scale(scale); @@ -597,7 +598,7 @@ mod tests { "f1", PhysicalType::FIXED_LEN_BYTE_ARRAY, ) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(5) .with_precision(9) .with_scale(3) @@ -609,7 +610,7 @@ mod tests { "f2", PhysicalType::FIXED_LEN_BYTE_ARRAY, ) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(16) .with_precision(38) .with_scale(18) @@ -656,14 +657,14 @@ mod tests { Arc::new( Type::group_type_builder("a1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new( Type::primitive_type_builder( "a2", PhysicalType::BYTE_ARRAY, ) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(), )]) @@ -673,7 +674,7 @@ mod tests { Arc::new( Type::group_type_builder("b1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new( Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) @@ -734,14 +735,14 @@ mod tests { Arc::new( 
Type::primitive_type_builder("_1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_8) + .with_converted_type(ConvertedType::INT_8) .build() .unwrap(), ), Arc::new( Type::primitive_type_builder("_2", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_16) + .with_converted_type(ConvertedType::INT_16) .build() .unwrap(), ), @@ -759,13 +760,13 @@ mod tests { ), Arc::new( Type::primitive_type_builder("_5", PhysicalType::INT32) - .with_logical_type(LogicalType::DATE) + .with_converted_type(ConvertedType::DATE) .build() .unwrap(), ), Arc::new( Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(), ), diff --git a/rust/parquet/src/schema/v1/printer.rs b/rust/parquet/src/schema/v1/printer.rs index 56165478c81..75b33ef14d9 100644 --- a/rust/parquet/src/schema/v1/printer.rs +++ b/rust/parquet/src/schema/v1/printer.rs @@ -45,7 +45,7 @@ use std::{fmt, io}; -use crate::basic::{LogicalType, Type as PhysicalType}; +use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, }; @@ -215,9 +215,9 @@ impl<'a> Printer<'a> { _ => format!("{}", physical_type), }; // Also print logical type if it is available - let logical_type_str = match basic_info.logical_type() { - LogicalType::NONE => format!(""), - decimal @ LogicalType::DECIMAL => { + let converted_type_str = match basic_info.converted_type() { + ConvertedType::NONE => format!(""), + decimal @ ConvertedType::DECIMAL => { // For decimal type we should print precision and scale if they // are > 0, e.g. 
DECIMAL(9, 2) - // DECIMAL(9) - DECIMAL @@ -228,7 +228,7 @@ impl<'a> Printer<'a> { }; format!(" ({}{})", decimal, precision_scale) } - other_logical_type => format!(" ({})", other_logical_type), + other_converted_type => format!(" ({})", other_converted_type), }; write!( self.output, @@ -236,7 +236,7 @@ impl<'a> Printer<'a> { basic_info.repetition(), phys_type_str, basic_info.name(), - logical_type_str + converted_type_str ); } Type::GroupType { @@ -246,8 +246,8 @@ impl<'a> Printer<'a> { if basic_info.has_repetition() { let r = basic_info.repetition(); write!(self.output, "{} group {} ", r, basic_info.name()); - if basic_info.logical_type() != LogicalType::NONE { - write!(self.output, "({}) ", basic_info.logical_type()); + if basic_info.converted_type() != ConvertedType::NONE { + write!(self.output, "({}) ", basic_info.converted_type()); } writeln!(self.output, "{{"); } else { @@ -293,7 +293,7 @@ mod tests { let mut p = Printer::new(&mut s); let field = Type::primitive_type_builder("field", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build() .unwrap(); p.print(&field); @@ -322,17 +322,17 @@ mod tests { let mut p = Printer::new(&mut s); let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_id(1) .build(); let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(12) .with_id(2) .build(); @@ -369,13 +369,13 @@ mod tests { fn test_print_and_parse_primitive() { let a2 = 
Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(); let a1 = Type::group_type_builder("a1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(a2)]) .build() .unwrap(); @@ -392,14 +392,14 @@ mod tests { let b2 = Type::group_type_builder("b2") .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::NONE) + .with_converted_type(ConvertedType::NONE) .with_fields(&mut vec![Arc::new(b3), Arc::new(b4)]) .build() .unwrap(); let b1 = Type::group_type_builder("b1") .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(b2)]) .build() .unwrap(); @@ -422,13 +422,13 @@ mod tests { fn test_print_and_parse_nested() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build() .unwrap(); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build() .unwrap(); @@ -440,7 +440,7 @@ mod tests { let f3 = Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(12) .build() .unwrap(); @@ -457,7 +457,7 @@ mod tests { fn test_print_and_parse_decimal() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(9) .with_scale(2) .build() @@ 
-465,7 +465,7 @@ mod tests { let f2 = Type::primitive_type_builder("f2", PhysicalType::INT32) .with_repetition(Repetition::OPTIONAL) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(9) .with_scale(0) .build() diff --git a/rust/parquet/src/schema/v1/types.rs b/rust/parquet/src/schema/v1/types.rs index 00389b7cee2..4e555beb669 100644 --- a/rust/parquet/src/schema/v1/types.rs +++ b/rust/parquet/src/schema/v1/types.rs @@ -21,7 +21,7 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; use parquet_format::SchemaElement; -use crate::basic::{LogicalType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; // ---------------------------------------------------------------------- @@ -192,7 +192,7 @@ pub struct PrimitiveTypeBuilder<'a> { name: &'a str, repetition: Repetition, physical_type: PhysicalType, - logical_type: LogicalType, + converted_type: ConvertedType, length: i32, precision: i32, scale: i32, @@ -206,7 +206,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { name, repetition: Repetition::OPTIONAL, physical_type, - logical_type: LogicalType::NONE, + converted_type: ConvertedType::NONE, length: -1, precision: -1, scale: -1, @@ -220,9 +220,9 @@ impl<'a> PrimitiveTypeBuilder<'a> { self } - /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. - pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { - self.logical_type = logical_type; + /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. 
+ pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { + self.converted_type = converted_type; self } @@ -261,7 +261,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { let basic_info = BasicTypeInfo { name: String::from(self.name), repetition: Some(self.repetition), - logical_type: self.logical_type, + converted_type: self.converted_type, id: self.id, }; @@ -273,17 +273,17 @@ impl<'a> PrimitiveTypeBuilder<'a> { )); } - match self.logical_type { - LogicalType::NONE => {} - LogicalType::UTF8 | LogicalType::BSON | LogicalType::JSON => { + match self.converted_type { + ConvertedType::NONE => {} + ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => { if self.physical_type != PhysicalType::BYTE_ARRAY { return Err(general_err!( "{} can only annotate BYTE_ARRAY fields", - self.logical_type + self.converted_type )); } } - LogicalType::DECIMAL => { + ConvertedType::DECIMAL => { match self.physical_type { PhysicalType::INT32 | PhysicalType::INT64 @@ -353,34 +353,34 @@ impl<'a> PrimitiveTypeBuilder<'a> { _ => (), // For BYTE_ARRAY precision is not limited } } - LogicalType::DATE - | LogicalType::TIME_MILLIS - | LogicalType::UINT_8 - | LogicalType::UINT_16 - | LogicalType::UINT_32 - | LogicalType::INT_8 - | LogicalType::INT_16 - | LogicalType::INT_32 => { + ConvertedType::DATE + | ConvertedType::TIME_MILLIS + | ConvertedType::UINT_8 + | ConvertedType::UINT_16 + | ConvertedType::UINT_32 + | ConvertedType::INT_8 + | ConvertedType::INT_16 + | ConvertedType::INT_32 => { if self.physical_type != PhysicalType::INT32 { return Err(general_err!( "{} can only annotate INT32", - self.logical_type + self.converted_type )); } } - LogicalType::TIME_MICROS - | LogicalType::TIMESTAMP_MILLIS - | LogicalType::TIMESTAMP_MICROS - | LogicalType::UINT_64 - | LogicalType::INT_64 => { + ConvertedType::TIME_MICROS + | ConvertedType::TIMESTAMP_MILLIS + | ConvertedType::TIMESTAMP_MICROS + | ConvertedType::UINT_64 + | ConvertedType::INT_64 => { if self.physical_type != 
PhysicalType::INT64 { return Err(general_err!( "{} can only annotate INT64", - self.logical_type + self.converted_type )); } } - LogicalType::INTERVAL => { + ConvertedType::INTERVAL => { if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 { @@ -389,7 +389,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { )); } } - LogicalType::ENUM => { + ConvertedType::ENUM => { if self.physical_type != PhysicalType::BYTE_ARRAY { return Err(general_err!("ENUM can only annotate BYTE_ARRAY fields")); } @@ -397,7 +397,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { _ => { return Err(general_err!( "{} cannot be applied to a primitive type", - self.logical_type + self.converted_type )); } } @@ -418,7 +418,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { pub struct GroupTypeBuilder<'a> { name: &'a str, repetition: Option, - logical_type: LogicalType, + converted_type: ConvertedType, fields: Vec, id: Option, } @@ -429,7 +429,7 @@ impl<'a> GroupTypeBuilder<'a> { Self { name, repetition: None, - logical_type: LogicalType::NONE, + converted_type: ConvertedType::NONE, fields: Vec::new(), id: None, } @@ -441,9 +441,9 @@ impl<'a> GroupTypeBuilder<'a> { self } - /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. - pub fn with_logical_type(mut self, logical_type: LogicalType) -> Self { - self.logical_type = logical_type; + /// Sets [`ConvertedType`](crate::basic::ConvertedType) for this field and returns itself. 
+ pub fn with_converted_type(mut self, converted_type: ConvertedType) -> Self { + self.converted_type = converted_type; self } @@ -465,7 +465,7 @@ impl<'a> GroupTypeBuilder<'a> { let basic_info = BasicTypeInfo { name: String::from(self.name), repetition: self.repetition, - logical_type: self.logical_type, + converted_type: self.converted_type, id: self.id, }; Ok(Type::GroupType { @@ -481,7 +481,7 @@ impl<'a> GroupTypeBuilder<'a> { pub struct BasicTypeInfo { name: String, repetition: Option, - logical_type: LogicalType, + converted_type: ConvertedType, id: Option, } @@ -504,9 +504,9 @@ impl BasicTypeInfo { self.repetition.unwrap() } - /// Returns [`LogicalType`](crate::basic::LogicalType) value for the type. - pub fn logical_type(&self) -> LogicalType { - self.logical_type + /// Returns [`ConvertedType`](crate::basic::ConvertedType) value for the type. + pub fn converted_type(&self) -> ConvertedType { + self.converted_type } /// Returns `true` if id is set, `false` otherwise. @@ -665,9 +665,9 @@ impl ColumnDescriptor { self.primitive_type.name() } - /// Returns [`LogicalType`](crate::basic::LogicalType) for this column. - pub fn logical_type(&self) -> LogicalType { - self.primitive_type.get_basic_info().logical_type() + /// Returns [`ConvertedType`](crate::basic::ConvertedType) for this column. + pub fn converted_type(&self) -> ConvertedType { + self.primitive_type.get_basic_info().converted_type() } /// Returns physical type for this column. 
@@ -906,7 +906,7 @@ fn from_thrift_helper( elements.len() )); } - let logical_type = LogicalType::from(elements[index].converted_type); + let converted_type = ConvertedType::from(elements[index].converted_type); let field_id = elements[index].field_id; match elements[index].num_children { // From parquet-format: @@ -929,7 +929,7 @@ fn from_thrift_helper( let name = &elements[index].name; let mut builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_length(length) .with_precision(precision) .with_scale(scale); @@ -949,7 +949,7 @@ fn from_thrift_helper( } let mut builder = Type::group_type_builder(&elements[index].name) - .with_logical_type(logical_type) + .with_converted_type(converted_type) .with_fields(&mut fields); if let Some(rep) = repetition { // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or @@ -1002,7 +1002,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { repetition_type: Some(basic_info.repetition().into()), name: basic_info.name().to_owned(), num_children: None, - converted_type: basic_info.logical_type().into(), + converted_type: basic_info.converted_type().into(), scale: if scale >= 0 { Some(scale) } else { None }, precision: if precision >= 0 { Some(precision) @@ -1035,7 +1035,7 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { repetition_type: repetition, name: basic_info.name().to_owned(), num_children: Some(fields.len() as i32), - converted_type: basic_info.logical_type().into(), + converted_type: basic_info.converted_type().into(), scale: None, precision: None, field_id: if basic_info.has_id() { @@ -1065,7 +1065,7 @@ mod tests { #[test] fn test_primitive_type() { let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); assert!(result.is_ok()); @@ -1075,7 +1075,7 @@ 
mod tests { assert!(!tp.is_group()); let basic_info = tp.get_basic_info(); assert_eq!(basic_info.repetition(), Repetition::OPTIONAL); - assert_eq!(basic_info.logical_type(), LogicalType::INT_32); + assert_eq!(basic_info.converted_type(), ConvertedType::INT_32); assert_eq!(basic_info.id(), 0); match tp { Type::PrimitiveType { physical_type, .. } => { @@ -1088,7 +1088,7 @@ mod tests { // Test illegal inputs result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::BSON) + .with_converted_type(ConvertedType::BSON) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1100,7 +1100,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT96) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(-1) .with_scale(-1) .build(); @@ -1114,7 +1114,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(-1) .with_scale(-1) .build(); @@ -1128,7 +1128,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(0) .with_scale(-1) .build(); @@ -1142,7 +1142,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(1) .with_scale(-1) .build(); @@ -1153,7 +1153,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + 
.with_converted_type(ConvertedType::DECIMAL) .with_precision(1) .with_scale(2) .build(); @@ -1167,7 +1167,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(18) .with_scale(2) .build(); @@ -1181,7 +1181,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_precision(32) .with_scale(2) .build(); @@ -1195,7 +1195,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(5) .with_precision(12) .with_scale(2) @@ -1210,7 +1210,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1222,7 +1222,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::TIME_MICROS) + .with_converted_type(ConvertedType::TIME_MICROS) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1234,7 +1234,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1246,7 +1246,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - 
.with_logical_type(LogicalType::INTERVAL) + .with_converted_type(ConvertedType::INTERVAL) .with_length(1) .build(); assert!(result.is_err()); @@ -1259,7 +1259,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::ENUM) + .with_converted_type(ConvertedType::ENUM) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1271,7 +1271,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::MAP) + .with_converted_type(ConvertedType::MAP) .build(); assert!(result.is_err()); if let Err(e) = result { @@ -1283,7 +1283,7 @@ mod tests { result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::DECIMAL) + .with_converted_type(ConvertedType::DECIMAL) .with_length(-1) .build(); assert!(result.is_err()); @@ -1298,12 +1298,12 @@ mod tests { #[test] fn test_group_type() { let f1 = Type::primitive_type_builder("f1", PhysicalType::INT32) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .with_id(0) .build(); assert!(f1.is_ok()); let f2 = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .with_id(1) .build(); assert!(f2.is_ok()); @@ -1324,7 +1324,7 @@ mod tests { assert!(tp.is_group()); assert!(!tp.is_primitive()); assert_eq!(basic_info.repetition(), Repetition::REPEATED); - assert_eq!(basic_info.logical_type(), LogicalType::NONE); + assert_eq!(basic_info.converted_type(), ConvertedType::NONE); assert_eq!(basic_info.id(), 1); assert_eq!(tp.get_fields().len(), 2); assert_eq!(tp.get_fields()[0].name(), "f1"); @@ -1343,13 +1343,13 @@ mod tests { fn test_column_descriptor_helper() -> Result<()> { let tp = Type::primitive_type_builder("name", 
PhysicalType::BYTE_ARRAY) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build()?; let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name")); assert_eq!(descr.path(), &ColumnPath::from("name")); - assert_eq!(descr.logical_type(), LogicalType::UTF8); + assert_eq!(descr.converted_type(), ConvertedType::UTF8); assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY); assert_eq!(descr.max_def_level(), 4); assert_eq!(descr.max_rep_level(), 1); @@ -1377,33 +1377,33 @@ mod tests { let inta = Type::primitive_type_builder("a", PhysicalType::INT32) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build()?; fields.push(Arc::new(inta)); let intb = Type::primitive_type_builder("b", PhysicalType::INT64) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .build()?; fields.push(Arc::new(intb)); let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::UTF8) + .with_converted_type(ConvertedType::UTF8) .build()?; fields.push(Arc::new(intc)); // 3-level list encoding let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64) .with_repetition(Repetition::REQUIRED) - .with_logical_type(LogicalType::INT_64) + .with_converted_type(ConvertedType::INT_64) .build()?; let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?; let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32) .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::INT_32) + .with_converted_type(ConvertedType::INT_32) .build()?; let list = Type::group_type_builder("records") .with_repetition(Repetition::REPEATED) - .with_logical_type(LogicalType::LIST) + .with_converted_type(ConvertedType::LIST) .with_fields(&mut vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)]) .build()?; let bag = 
Type::group_type_builder("bag") @@ -1522,11 +1522,11 @@ mod tests { // OK: different logical type does not affect check_contains let f1 = Type::primitive_type_builder("f", PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_8) + .with_converted_type(ConvertedType::UINT_8) .build() .unwrap(); let f2 = Type::primitive_type_builder("f", PhysicalType::INT32) - .with_logical_type(LogicalType::UINT_16) + .with_converted_type(ConvertedType::UINT_16) .build() .unwrap(); assert!(f1.check_contains(&f2)); diff --git a/rust/parquet/src/schema/v1/visitor.rs b/rust/parquet/src/schema/v1/visitor.rs index 85c7843058f..c9574fb9bdf 100644 --- a/rust/parquet/src/schema/v1/visitor.rs +++ b/rust/parquet/src/schema/v1/visitor.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::basic::{LogicalType, Repetition}; +use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError::General; use crate::errors::Result; use crate::schema::v1::types::{Type, TypePtr}; @@ -100,9 +100,9 @@ pub trait TypeVisitor { if cur_type.is_primitive() { self.visit_primitive(cur_type, context) } else { - match cur_type.get_basic_info().logical_type() { - LogicalType::LIST => self.visit_list(cur_type, context), - LogicalType::MAP | LogicalType::MAP_KEY_VALUE => { + match cur_type.get_basic_info().converted_type() { + ConvertedType::LIST => self.visit_list(cur_type, context), + ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { self.visit_map(cur_type, context) } _ => self.visit_struct(cur_type, context), From b1e351f2cd303937aadfe01126b72a99932259b5 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 27 Feb 2021 00:17:32 +0200 Subject: [PATCH 03/10] add v2 logical type --- rust/parquet/src/basic.rs | 164 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index 26d4225ab0c..519a6b415a4 100644 --- a/rust/parquet/src/basic.rs +++ 
b/rust/parquet/src/basic.rs @@ -24,6 +24,12 @@ use parquet_format as parquet; use crate::errors::ParquetError; +// Re-export parquet_format types used in this module +pub use parquet_format::{ + BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, + NullType, StringType, TimeType, TimestampType, UUIDType, +}; + // ---------------------------------------------------------------------- // Types from the Thrift definition @@ -146,6 +152,27 @@ pub enum ConvertedType { INTERVAL, } +// ---------------------------------------------------------------------- +// Mirrors `parquet::LogicalType` + +/// Logical types used by version 2 of the Parquet format. +#[derive(Debug, Clone, PartialEq)] +pub enum LogicalType { + STRING(StringType), + MAP(MapType), + LIST(ListType), + ENUM(EnumType), + DECIMAL(DecimalType), + DATE(DateType), + TIME(TimeType), + TIMESTAMP(TimestampType), + INTEGER(IntType), + UNKNOWN(NullType), + JSON(JsonType), + BSON(BsonType), + UUID(UUIDType), +} + // ---------------------------------------------------------------------- // Mirrors `parquet::FieldRepetitionType` @@ -505,6 +532,49 @@ impl convert::From for Option { } } +// ---------------------------------------------------------------------- +// parquet::LogicalType <=> LogicalType conversion + +impl convert::From for LogicalType { + fn from(value: parquet::LogicalType) -> Self { + match value { + parquet::LogicalType::STRING(t) => LogicalType::STRING(t), + parquet::LogicalType::MAP(t) => LogicalType::MAP(t), + parquet::LogicalType::LIST(t) => LogicalType::LIST(t), + parquet::LogicalType::ENUM(t) => LogicalType::ENUM(t), + parquet::LogicalType::DECIMAL(t) => LogicalType::DECIMAL(t), + parquet::LogicalType::DATE(t) => LogicalType::DATE(t), + parquet::LogicalType::TIME(t) => LogicalType::TIME(t), + parquet::LogicalType::TIMESTAMP(t) => LogicalType::TIMESTAMP(t), + parquet::LogicalType::INTEGER(t) => LogicalType::INTEGER(t), + parquet::LogicalType::UNKNOWN(t) => 
LogicalType::UNKNOWN(t), + parquet::LogicalType::JSON(t) => LogicalType::JSON(t), + parquet::LogicalType::BSON(t) => LogicalType::BSON(t), + parquet::LogicalType::UUID(t) => LogicalType::UUID(t), + } + } +} + +impl convert::From<LogicalType> for parquet::LogicalType { + fn from(value: LogicalType) -> Self { + match value { + LogicalType::STRING(t) => parquet::LogicalType::STRING(t), + LogicalType::MAP(t) => parquet::LogicalType::MAP(t), + LogicalType::LIST(t) => parquet::LogicalType::LIST(t), + LogicalType::ENUM(t) => parquet::LogicalType::ENUM(t), + LogicalType::DECIMAL(t) => parquet::LogicalType::DECIMAL(t), + LogicalType::DATE(t) => parquet::LogicalType::DATE(t), + LogicalType::TIME(t) => parquet::LogicalType::TIME(t), + LogicalType::TIMESTAMP(t) => parquet::LogicalType::TIMESTAMP(t), + LogicalType::INTEGER(t) => parquet::LogicalType::INTEGER(t), + LogicalType::UNKNOWN(t) => parquet::LogicalType::UNKNOWN(t), + LogicalType::JSON(t) => parquet::LogicalType::JSON(t), + LogicalType::BSON(t) => parquet::LogicalType::BSON(t), + LogicalType::UUID(t) => parquet::LogicalType::UUID(t), + } + } +} + // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion @@ -684,6 +754,100 @@ impl str::FromStr for ConvertedType { } } +impl str::FromStr for LogicalType { + type Err = ParquetError; + + fn from_str(s: &str) -> result::Result<Self, Self::Err> { + match s { + "INTEGER(8,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + })), + "INTEGER(16,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: true, + })), + "INTEGER(32,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: true, + })), + "INTEGER(64,true)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: true, + })), + "INTEGER(8,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + })), + "INTEGER(16,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 16, +
is_signed: false, + })), + "INTEGER(32,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false, + })), + "INTEGER(64,false)" => Ok(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false, + })), + "MAP" => Ok(LogicalType::MAP(MapType {})), + // "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), + "LIST" => Ok(LogicalType::LIST(ListType {})), + "ENUM" => Ok(LogicalType::ENUM(EnumType {})), + // "DECIMAL" => Ok(ConvertedType::DECIMAL), + "DATE" => Ok(LogicalType::DATE(DateType {})), + "TIME(MILLIS,true)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIME(MILLIS,false)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIME(MICROS,true)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIME(MICROS,false)" => Ok(LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(MILLIS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIMESTAMP(MILLIS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + })), + "TIMESTAMP(MICROS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(MICROS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + })), + "TIMESTAMP(NANOS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: true, + unit: parquet::TimeUnit::NANOS(parquet::NanoSeconds {}), +
})), + "TIMESTAMP(NANOS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: parquet::TimeUnit::NANOS(parquet::NanoSeconds {}), + })), + "STRING" => Ok(LogicalType::STRING(StringType {})), + "JSON" => Ok(LogicalType::JSON(JsonType {})), + "BSON" => Ok(LogicalType::BSON(BsonType {})), + "UUID" => Ok(LogicalType::UUID(UUIDType {})), + "UNKNOWN" => Ok(LogicalType::UNKNOWN(NullType {})), + "INTERVAL" => Err(general_err!("Interval logical type not yet supported")), + other => Err(general_err!("Invalid logical type {}", other)), + } + } +} + #[cfg(test)] mod tests { use super::*; From 3075d7c91ce49e88ad702ee921868c5c242c2360 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 27 Feb 2021 01:06:04 +0200 Subject: [PATCH 04/10] add logical type to types, write it with v2 writer This adds LogicalType to the internal types and builders. It also populates the thrift type with logical types if v2 of the writer is used. Added a TODO for tests that should be added. --- rust/parquet/src/file/writer.rs | 5 +- rust/parquet/src/schema/v1/types.rs | 74 ++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 15 deletions(-) diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index 47cafc3c6a0..422646b9ae4 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -175,9 +175,10 @@ impl SerializedFileWriter { /// Assembles and writes metadata at the end of the file.
fn write_metadata(&mut self) -> Result<()> { + let writer_version = self.props.writer_version().as_num(); let file_metadata = parquet::FileMetaData { - version: self.props.writer_version().as_num(), - schema: types::to_thrift(self.schema.as_ref())?, + version: writer_version, + schema: types::to_thrift(self.schema.as_ref(), writer_version)?, num_rows: self.total_num_rows as i64, row_groups: self .row_groups diff --git a/rust/parquet/src/schema/v1/types.rs b/rust/parquet/src/schema/v1/types.rs index 4e555beb669..b2219dedc79 100644 --- a/rust/parquet/src/schema/v1/types.rs +++ b/rust/parquet/src/schema/v1/types.rs @@ -21,7 +21,7 @@ use std::{collections::HashMap, convert::From, fmt, sync::Arc}; use parquet_format::SchemaElement; -use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; +use crate::basic::{ConvertedType, LogicalType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; // ---------------------------------------------------------------------- @@ -193,6 +193,7 @@ pub struct PrimitiveTypeBuilder<'a> { repetition: Repetition, physical_type: PhysicalType, converted_type: ConvertedType, + logical_type: Option, length: i32, precision: i32, scale: i32, @@ -207,6 +208,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { repetition: Repetition::OPTIONAL, physical_type, converted_type: ConvertedType::NONE, + logical_type: None, length: -1, precision: -1, scale: -1, @@ -226,6 +228,12 @@ impl<'a> PrimitiveTypeBuilder<'a> { self } + /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: Option) -> Self { + self.logical_type = logical_type; + self + } + /// Sets type length and returns itself. /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because /// they maintain fixed size underlying byte array. 
@@ -262,6 +270,7 @@ impl<'a> PrimitiveTypeBuilder<'a> { name: String::from(self.name), repetition: Some(self.repetition), converted_type: self.converted_type, + logical_type: self.logical_type, id: self.id, }; @@ -419,6 +428,7 @@ pub struct GroupTypeBuilder<'a> { name: &'a str, repetition: Option, converted_type: ConvertedType, + logical_type: Option, fields: Vec, id: Option, } @@ -430,6 +440,7 @@ impl<'a> GroupTypeBuilder<'a> { name, repetition: None, converted_type: ConvertedType::NONE, + logical_type: None, fields: Vec::new(), id: None, } @@ -447,6 +458,12 @@ impl<'a> GroupTypeBuilder<'a> { self } + /// Sets [`LogicalType`](crate::basic::LogicalType) for this field and returns itself. + pub fn with_logical_type(mut self, logical_type: Option) -> Self { + self.logical_type = logical_type; + self + } + /// Sets a list of fields that should be child nodes of this field. /// Returns updated self. pub fn with_fields(mut self, fields: &mut Vec) -> Self { @@ -466,6 +483,7 @@ impl<'a> GroupTypeBuilder<'a> { name: String::from(self.name), repetition: self.repetition, converted_type: self.converted_type, + logical_type: self.logical_type, id: self.id, }; Ok(Type::GroupType { @@ -482,6 +500,7 @@ pub struct BasicTypeInfo { name: String, repetition: Option, converted_type: ConvertedType, + logical_type: Option, id: Option, } @@ -509,6 +528,12 @@ impl BasicTypeInfo { self.converted_type } + /// Returns [`LogicalType`](crate::basic::LogicalType) value for the type. + pub fn logical_type(&self) -> Option { + // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it + self.logical_type.clone() + } + /// Returns `true` if id is set, `false` otherwise. 
pub fn has_id(&self) -> bool { self.id.is_some() @@ -906,7 +931,14 @@ fn from_thrift_helper( elements.len() )); } - let converted_type = ConvertedType::from(elements[index].converted_type); + let element = &elements[index]; + let converted_type = ConvertedType::from(element.converted_type); + // LogicalType is only present in v2 Parquet files. ConvertedType is always + // populated, regardless of the version of the file (v1 or v2). + let logical_type = element + .logical_type + .as_ref() + .map(|value| LogicalType::from(value.clone())); let field_id = elements[index].field_id; match elements[index].num_children { // From parquet-format: @@ -930,6 +962,7 @@ fn from_thrift_helper( let mut builder = Type::primitive_type_builder(name, physical_type) .with_repetition(repetition) .with_converted_type(converted_type) + .with_logical_type(logical_type) .with_length(length) .with_precision(precision) .with_scale(scale); @@ -950,6 +983,7 @@ fn from_thrift_helper( let mut builder = Type::group_type_builder(&elements[index].name) .with_converted_type(converted_type) + .with_logical_type(logical_type) .with_fields(&mut fields); if let Some(rep) = repetition { // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or @@ -972,18 +1006,22 @@ fn from_thrift_helper( } /// Method to convert to Thrift. -pub fn to_thrift(schema: &Type) -> Result> { +pub fn to_thrift(schema: &Type, writer_version: i32) -> Result> { if !schema.is_group() { return Err(general_err!("Root schema must be Group type")); } let mut elements: Vec = Vec::new(); - to_thrift_helper(schema, &mut elements); + to_thrift_helper(schema, &mut elements, writer_version); Ok(elements) } /// Constructs list of `SchemaElement` from the schema using depth-first traversal. /// Here we assume that schema is always valid and starts with group type. 
-fn to_thrift_helper(schema: &Type, elements: &mut Vec) { +fn to_thrift_helper( + schema: &Type, + elements: &mut Vec, + writer_version: i32, +) { match *schema { Type::PrimitiveType { ref basic_info, @@ -1014,7 +1052,11 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { } else { None }, - logical_type: None, + logical_type: if writer_version > 1 { + basic_info.logical_type().map(|value| value.into()) + } else { + None + }, }; elements.push(element); @@ -1043,14 +1085,18 @@ fn to_thrift_helper(schema: &Type, elements: &mut Vec) { } else { None }, - logical_type: None, + logical_type: if writer_version > 1 { + basic_info.logical_type().map(|value| value.into()) + } else { + None + }, }; elements.push(element); // Add child elements for a group for field in fields { - to_thrift_helper(field, elements); + to_thrift_helper(field, elements, writer_version); } } } @@ -1062,6 +1108,8 @@ mod tests { use crate::schema::v1::parser::parse_message_type; + // TODO: add tests for v2 types + #[test] fn test_primitive_type() { let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32) @@ -1760,7 +1808,7 @@ mod tests { let schema = Type::primitive_type_builder("col", PhysicalType::INT32) .build() .unwrap(); - let thrift_schema = to_thrift(&schema); + let thrift_schema = to_thrift(&schema, 1); assert!(thrift_schema.is_err()); if let Err(e) = thrift_schema { assert_eq!( @@ -1819,7 +1867,7 @@ mod tests { } "; let expected_schema = parse_message_type(message_type).unwrap(); - let thrift_schema = to_thrift(&expected_schema).unwrap(); + let thrift_schema = to_thrift(&expected_schema, 1).unwrap(); let result_schema = from_thrift(&thrift_schema).unwrap(); assert_eq!(result_schema, Arc::new(expected_schema)); } @@ -1835,7 +1883,7 @@ mod tests { } "; let expected_schema = parse_message_type(message_type).unwrap(); - let thrift_schema = to_thrift(&expected_schema).unwrap(); + let thrift_schema = to_thrift(&expected_schema, 1).unwrap(); let result_schema = 
from_thrift(&thrift_schema).unwrap(); assert_eq!(result_schema, Arc::new(expected_schema)); } @@ -1857,7 +1905,7 @@ mod tests { "; let expected_schema = parse_message_type(message_type).unwrap(); - let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema, 1).unwrap(); // Change all of None to Some(0) for mut elem in &mut thrift_schema[..] { if elem.num_children == None { @@ -1882,7 +1930,7 @@ mod tests { "; let expected_schema = parse_message_type(message_type).unwrap(); - let mut thrift_schema = to_thrift(&expected_schema).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema, 1).unwrap(); thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); let result_schema = from_thrift(&thrift_schema).unwrap(); From 2a678d2a1ce9c25d24432a2004279f2608f9d2b8 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 27 Feb 2021 10:11:54 +0200 Subject: [PATCH 05/10] Implement conversion from logical to converted type --- rust/parquet/src/basic.rs | 208 +++++++++++++++++++++++++++++++++++++- 1 file changed, 203 insertions(+), 5 deletions(-) diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index 519a6b415a4..9a0f331f3fe 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -575,6 +575,56 @@ impl convert::From for parquet::LogicalType { } } +// ---------------------------------------------------------------------- +// LogicalType <=> ConvertedType conversion + +// Note: To prevent type loss when converting from ConvertedType to LogicalType, +// the conversion from ConvertedType -> LogicalType is not implemented. +// Such type loss includes: +// - Not knowing the decimal scale and precision of ConvertedType +// - Time and timestamp nanosecond precision, that is not supported in ConvertedType. 
+ +impl From> for ConvertedType { + fn from(value: Option) -> Self { + match value { + Some(value) => match value { + LogicalType::STRING(_) => ConvertedType::UTF8, + LogicalType::MAP(_) => ConvertedType::MAP, + LogicalType::LIST(_) => ConvertedType::LIST, + LogicalType::ENUM(_) => ConvertedType::ENUM, + LogicalType::DECIMAL(_) => ConvertedType::DECIMAL, + LogicalType::DATE(_) => ConvertedType::DATE, + LogicalType::TIME(t) => match t.unit { + parquet::TimeUnit::MILLIS(_) => ConvertedType::TIME_MILLIS, + parquet::TimeUnit::MICROS(_) => ConvertedType::TIME_MICROS, + parquet::TimeUnit::NANOS(_) => ConvertedType::NONE, + }, + LogicalType::TIMESTAMP(t) => match t.unit { + parquet::TimeUnit::MILLIS(_) => ConvertedType::TIMESTAMP_MILLIS, + parquet::TimeUnit::MICROS(_) => ConvertedType::TIMESTAMP_MICROS, + parquet::TimeUnit::NANOS(_) => ConvertedType::NONE, + }, + LogicalType::INTEGER(t) => match (t.bit_width, t.is_signed) { + (8, true) => ConvertedType::INT_8, + (16, true) => ConvertedType::INT_16, + (32, true) => ConvertedType::INT_32, + (64, true) => ConvertedType::INT_64, + (8, false) => ConvertedType::UINT_8, + (16, false) => ConvertedType::UINT_16, + (32, false) => ConvertedType::UINT_32, + (64, false) => ConvertedType::UINT_64, + t => panic!("Integer type {:?} is not supported", t), + }, + LogicalType::UNKNOWN(_) => ConvertedType::NONE, + LogicalType::JSON(_) => ConvertedType::JSON, + LogicalType::BSON(_) => ConvertedType::BSON, + LogicalType::UUID(_) => ConvertedType::NONE, + }, + None => ConvertedType::NONE, + } + } +} + // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion @@ -938,7 +988,7 @@ mod tests { } #[test] - fn test_display_logical_type() { + fn test_display_converted_type() { assert_eq!(ConvertedType::NONE.to_string(), "NONE"); assert_eq!(ConvertedType::UTF8.to_string(), "UTF8"); assert_eq!(ConvertedType::MAP.to_string(), "MAP"); @@ -972,8 +1022,9 @@ mod tests { } #[test] - fn 
test_from_logical_type() { - assert_eq!(ConvertedType::from(None), ConvertedType::NONE); + fn test_from_converted_type() { + let parquet_conv_none: Option = None; + assert_eq!(ConvertedType::from(parquet_conv_none), ConvertedType::NONE); assert_eq!( ConvertedType::from(Some(parquet::ConvertedType::Utf8)), ConvertedType::UTF8 @@ -1065,7 +1116,7 @@ mod tests { } #[test] - fn test_into_logical_type() { + fn test_into_converted_type() { let converted_type: Option = None; assert_eq!(converted_type, ConvertedType::NONE.into()); assert_eq!( @@ -1156,7 +1207,7 @@ mod tests { } #[test] - fn test_from_string_into_logical_type() { + fn test_from_string_into_converted_type() { assert_eq!( ConvertedType::NONE .to_string() @@ -1320,6 +1371,153 @@ mod tests { ); } + #[test] + fn test_logical_to_converted_type() { + let logical_none: Option = None; + assert_eq!(ConvertedType::from(logical_none), ConvertedType::NONE); + assert_eq!( + ConvertedType::from(Some(LogicalType::DECIMAL(DecimalType { + precision: 20, + scale: 5 + }))), + ConvertedType::DECIMAL + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::BSON(Default::default()))), + ConvertedType::BSON + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::JSON(Default::default()))), + ConvertedType::JSON + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::STRING(Default::default()))), + ConvertedType::UTF8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::DATE(Default::default()))), + ConvertedType::DATE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: parquet::TimeUnit::MILLIS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIME_MILLIS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: parquet::TimeUnit::MICROS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIME_MICROS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIME(TimeType { + unit: 
parquet::TimeUnit::NANOS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: parquet::TimeUnit::MILLIS(Default::default()), + is_adjusted_to_u_t_c: true, + }))), + ConvertedType::TIMESTAMP_MILLIS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: parquet::TimeUnit::MICROS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::TIMESTAMP_MICROS + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { + unit: parquet::TimeUnit::NANOS(Default::default()), + is_adjusted_to_u_t_c: false, + }))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false + }))), + ConvertedType::UINT_8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true + }))), + ConvertedType::INT_8 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false + }))), + ConvertedType::UINT_16 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: true + }))), + ConvertedType::INT_16 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false + }))), + ConvertedType::UINT_32 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: true + }))), + ConvertedType::INT_32 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false + }))), + ConvertedType::UINT_64 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: true + }))), + ConvertedType::INT_64 + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::LIST(Default::default()))), + ConvertedType::LIST + 
); + assert_eq!( + ConvertedType::from(Some(LogicalType::MAP(Default::default()))), + ConvertedType::MAP + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::UUID(Default::default()))), + ConvertedType::NONE + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::ENUM(Default::default()))), + ConvertedType::ENUM + ); + assert_eq!( + ConvertedType::from(Some(LogicalType::UNKNOWN(Default::default()))), + ConvertedType::NONE + ); + } + #[test] fn test_display_repetition() { assert_eq!(Repetition::REQUIRED.to_string(), "REQUIRED"); From 2757b6d00c44a62f53dd3009647a029e09948166 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 27 Feb 2021 10:27:31 +0200 Subject: [PATCH 06/10] Use logicaltype for sort order Also addresses some deviations with the spec on sorting intervals --- rust/parquet/src/basic.rs | 49 ++++++++++++++++++++++++----- rust/parquet/src/file/footer.rs | 1 + rust/parquet/src/schema/v1/types.rs | 5 +++ rust/parquet/src/schema/v2/mod.rs | 1 + 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index 9a0f331f3fe..1f061bb837b 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -311,8 +311,40 @@ pub enum ColumnOrder { impl ColumnOrder { /// Returns sort order for a physical/logical type. - pub fn get_sort_order(logical_type: ConvertedType, physical_type: Type) -> SortOrder { + pub fn get_sort_order( + logical_type: Option, + converted_type: ConvertedType, + physical_type: Type, + ) -> SortOrder { + // TODO: Should this take converted and logical type, for compatibility? 
match logical_type { + Some(logical) => match logical { + LogicalType::STRING(_) + | LogicalType::ENUM(_) + | LogicalType::JSON(_) + | LogicalType::BSON(_) => SortOrder::UNSIGNED, + LogicalType::INTEGER(t) => match t.is_signed { + true => SortOrder::SIGNED, + false => SortOrder::UNSIGNED, + }, + LogicalType::MAP(_) | LogicalType::LIST(_) => SortOrder::UNDEFINED, + LogicalType::DECIMAL(_) => SortOrder::SIGNED, + LogicalType::DATE(_) => SortOrder::SIGNED, + LogicalType::TIME(_) => SortOrder::SIGNED, + LogicalType::TIMESTAMP(_) => SortOrder::SIGNED, + LogicalType::UNKNOWN(_) => SortOrder::UNDEFINED, + LogicalType::UUID(_) => SortOrder::UNSIGNED, + }, + // Fall back to converted type + None => Self::get_converted_sort_order(converted_type, physical_type), + } + } + + fn get_converted_sort_order( + converted_type: ConvertedType, + physical_type: Type, + ) -> SortOrder { + match converted_type { // Unsigned byte-wise comparison. ConvertedType::UTF8 | ConvertedType::JSON @@ -339,7 +371,7 @@ impl ColumnOrder { | ConvertedType::TIMESTAMP_MILLIS | ConvertedType::TIMESTAMP_MICROS => SortOrder::SIGNED, - ConvertedType::INTERVAL => SortOrder::UNSIGNED, + ConvertedType::INTERVAL => SortOrder::UNDEFINED, ConvertedType::LIST | ConvertedType::MAP | ConvertedType::MAP_KEY_VALUE => { SortOrder::UNDEFINED @@ -364,8 +396,11 @@ impl ColumnOrder { // If the max is -0, the row group may contain +0 values as well. // When looking for NaN values, min and max should be ignored. Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, - // unsigned byte-wise comparison - Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, + // Unsigned byte-wise comparison + Type::BYTE_ARRAY => SortOrder::UNSIGNED, + // Only unsigned if there was a logical type that supports unsigned sort. + // Interval has no defined sort order, and should not use UNSIGNED. 
+ Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNDEFINED, } } @@ -1782,7 +1817,7 @@ mod tests { fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( - ColumnOrder::get_sort_order(tpe, Type::BYTE_ARRAY), + ColumnOrder::get_sort_order(None, tpe, Type::BYTE_ARRAY), expected_order ); } @@ -1798,7 +1833,6 @@ mod tests { ConvertedType::UINT_16, ConvertedType::UINT_32, ConvertedType::UINT_64, - ConvertedType::INTERVAL, ]; check_sort_order(unsigned, SortOrder::UNSIGNED); @@ -1822,6 +1856,7 @@ mod tests { ConvertedType::LIST, ConvertedType::MAP, ConvertedType::MAP_KEY_VALUE, + ConvertedType::INTERVAL, ]; check_sort_order(undefined, SortOrder::UNDEFINED); @@ -1863,7 +1898,7 @@ mod tests { ); assert_eq!( ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY), - SortOrder::UNSIGNED + SortOrder::UNDEFINED ); } diff --git a/rust/parquet/src/file/footer.rs b/rust/parquet/src/file/footer.rs index ca896a8ad13..f2ed5c16304 100644 --- a/rust/parquet/src/file/footer.rs +++ b/rust/parquet/src/file/footer.rs @@ -139,6 +139,7 @@ fn parse_column_orders( match orders[i] { TColumnOrder::TYPEORDER(_) => { let sort_order = ColumnOrder::get_sort_order( + column.logical_type(), column.converted_type(), column.physical_type(), ); diff --git a/rust/parquet/src/schema/v1/types.rs b/rust/parquet/src/schema/v1/types.rs index b2219dedc79..56c2d4444f4 100644 --- a/rust/parquet/src/schema/v1/types.rs +++ b/rust/parquet/src/schema/v1/types.rs @@ -695,6 +695,11 @@ impl ColumnDescriptor { self.primitive_type.get_basic_info().converted_type() } + /// Returns [`LogicalType`](crate::basic::LogicalType) for this column. + pub fn logical_type(&self) -> Option { + self.primitive_type.get_basic_info().logical_type() + } + /// Returns physical type for this column. /// Note that it will panic if called on a non-primitive type. 
pub fn physical_type(&self) -> PhysicalType { diff --git a/rust/parquet/src/schema/v2/mod.rs b/rust/parquet/src/schema/v2/mod.rs index e69de29bb2d..8b137891791 100644 --- a/rust/parquet/src/schema/v2/mod.rs +++ b/rust/parquet/src/schema/v2/mod.rs @@ -0,0 +1 @@ + From 107a6387099e8b6ed163580c0797269f16294bbd Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 27 Feb 2021 10:49:02 +0200 Subject: [PATCH 07/10] revert schema version split It might be premature to do this now. Can be done as part of ARROW-11365 if necessary. --- rust/datafusion/src/physical_plan/parquet.rs | 4 +- rust/parquet/src/arrow/array_reader.rs | 8 +-- rust/parquet/src/arrow/arrow_reader.rs | 4 +- rust/parquet/src/arrow/record_reader.rs | 6 +- rust/parquet/src/arrow/schema.rs | 4 +- rust/parquet/src/column/mod.rs | 2 +- rust/parquet/src/column/page.rs | 2 +- rust/parquet/src/column/reader.rs | 4 +- rust/parquet/src/column/writer.rs | 4 +- rust/parquet/src/encodings/decoding.rs | 4 +- rust/parquet/src/encodings/encoding.rs | 4 +- rust/parquet/src/file/footer.rs | 4 +- rust/parquet/src/file/metadata.rs | 2 +- rust/parquet/src/file/mod.rs | 2 +- rust/parquet/src/file/properties.rs | 4 +- rust/parquet/src/file/reader.rs | 2 +- rust/parquet/src/file/serialized_reader.rs | 4 +- rust/parquet/src/file/writer.rs | 2 +- rust/parquet/src/record/api.rs | 4 +- rust/parquet/src/record/reader.rs | 6 +- rust/parquet/src/record/triplet.rs | 4 +- rust/parquet/src/schema/mod.rs | 52 +++++++++++++- rust/parquet/src/schema/{v1 => }/parser.rs | 8 +-- rust/parquet/src/schema/{v1 => }/printer.rs | 8 +-- rust/parquet/src/schema/{v1 => }/types.rs | 16 ++--- rust/parquet/src/schema/v1/mod.rs | 67 ------------------- rust/parquet/src/schema/v2/mod.rs | 1 - rust/parquet/src/schema/{v1 => }/visitor.rs | 6 +- .../parquet/src/util/test_common/page_util.rs | 2 +- rust/parquet/tests/custom_writer.rs | 2 +- rust/parquet_derive_test/src/lib.rs | 2 +- 31 files changed, 111 insertions(+), 133 deletions(-) rename 
rust/parquet/src/schema/{v1 => }/parser.rs (99%) rename rust/parquet/src/schema/{v1 => }/printer.rs (98%) rename rust/parquet/src/schema/{v1 => }/types.rs (99%) delete mode 100644 rust/parquet/src/schema/v1/mod.rs delete mode 100644 rust/parquet/src/schema/v2/mod.rs rename rust/parquet/src/schema/{v1 => }/visitor.rs (98%) diff --git a/rust/datafusion/src/physical_plan/parquet.rs b/rust/datafusion/src/physical_plan/parquet.rs index 3d64f1aaac2..348a924040a 100644 --- a/rust/datafusion/src/physical_plan/parquet.rs +++ b/rust/datafusion/src/physical_plan/parquet.rs @@ -900,7 +900,7 @@ mod tests { use arrow::array::{Int32Array, StringArray}; use futures::StreamExt; use parquet::basic::Type as PhysicalType; - use parquet::schema::v1::types::SchemaDescPtr; + use parquet::schema::types::SchemaDescPtr; #[test] fn test_split_files() { @@ -1429,7 +1429,7 @@ mod tests { } fn get_test_schema_descr(fields: Vec<(&str, PhysicalType)>) -> SchemaDescPtr { - use parquet::schema::v1::types::{SchemaDescriptor, Type as SchemaType}; + use parquet::schema::types::{SchemaDescriptor, Type as SchemaType}; let mut schema_fields = fields .iter() .map(|(n, t)| { diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs index 8219f7567d1..83ae04215b5 100644 --- a/rust/parquet/src/arrow/array_reader.rs +++ b/rust/parquet/src/arrow/array_reader.rs @@ -73,10 +73,10 @@ use crate::data_type::{ }; use crate::errors::{ParquetError, ParquetError::ArrowError, Result}; use crate::file::reader::{FilePageIterator, FileReader}; -use crate::schema::v1::types::{ +use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, Type, TypePtr, }; -use crate::schema::v1::visitor::TypeVisitor; +use crate::schema::visitor::TypeVisitor; use std::any::Any; /// Array reader reads parquet data into arrow array. 
@@ -1713,8 +1713,8 @@ mod tests { use crate::data_type::{ByteArray, DataType, Int32Type, Int64Type}; use crate::errors::Result; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::schema::v1::parser::parse_message_type; - use crate::schema::v1::types::{ColumnDescPtr, SchemaDescriptor}; + use crate::schema::parser::parse_message_type; + use crate::schema::types::{ColumnDescPtr, SchemaDescriptor}; use crate::util::test_common::page_util::{ DataPageBuilder, DataPageBuilderImpl, InMemoryPageIterator, }; diff --git a/rust/parquet/src/arrow/arrow_reader.rs b/rust/parquet/src/arrow/arrow_reader.rs index fbe4ed83e0f..7bbe8de1d64 100644 --- a/rust/parquet/src/arrow/arrow_reader.rs +++ b/rust/parquet/src/arrow/arrow_reader.rs @@ -247,8 +247,8 @@ mod tests { use crate::file::properties::WriterProperties; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::file::writer::{FileWriter, SerializedFileWriter}; - use crate::schema::v1::parser::parse_message_type; - use crate::schema::v1::types::TypePtr; + use crate::schema::parser::parse_message_type; + use crate::schema::types::TypePtr; use crate::util::test_common::{get_temp_filename, RandGen}; use arrow::array::*; use arrow::record_batch::RecordBatchReader; diff --git a/rust/parquet/src/arrow/record_reader.rs b/rust/parquet/src/arrow/record_reader.rs index 4bcaaefb1a2..d58d563621f 100644 --- a/rust/parquet/src/arrow/record_reader.rs +++ b/rust/parquet/src/arrow/record_reader.rs @@ -21,7 +21,7 @@ use std::mem::{replace, size_of}; use crate::column::{page::PageReader, reader::ColumnReaderImpl}; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; use arrow::array::BooleanBufferBuilder; use arrow::bitmap::Bitmap; use arrow::buffer::{Buffer, MutableBuffer}; @@ -439,8 +439,8 @@ mod tests { use crate::column::page::PageReader; use crate::data_type::Int32Type; use crate::errors::Result; 
- use crate::schema::v1::parser::parse_message_type; - use crate::schema::v1::types::SchemaDescriptor; + use crate::schema::parser::parse_message_type; + use crate::schema::types::SchemaDescriptor; use crate::util::test_common::page_util::{DataPageBuilder, DataPageBuilderImpl}; use arrow::array::{BooleanBufferBuilder, Int16BufferBuilder, Int32BufferBuilder}; use arrow::bitmap::Bitmap; diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs index 91d8c3fd330..84b7a4daf66 100644 --- a/rust/parquet/src/arrow/schema.rs +++ b/rust/parquet/src/arrow/schema.rs @@ -31,7 +31,7 @@ use arrow::ipc::writer; use crate::errors::{ParquetError::ArrowError, Result}; use crate::file::{metadata::KeyValue, properties::WriterProperties}; -use crate::schema::v1::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; +use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr}; use crate::{ basic::{ConvertedType, Repetition, Type as PhysicalType}, errors::ParquetError, @@ -808,7 +808,7 @@ mod tests { use crate::file::{metadata::KeyValue, reader::SerializedFileReader}; use crate::{ arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader}, - schema::v1::{parser::parse_message_type, types::SchemaDescriptor}, + schema::{parser::parse_message_type, types::SchemaDescriptor}, util::test_common::get_temp_file, }; diff --git a/rust/parquet/src/column/mod.rs b/rust/parquet/src/column/mod.rs index dc573f68c8d..7ed7bfc256e 100644 --- a/rust/parquet/src/column/mod.rs +++ b/rust/parquet/src/column/mod.rs @@ -45,7 +45,7 @@ //! reader::{FileReader, SerializedFileReader}, //! writer::{FileWriter, SerializedFileWriter}, //! }, -//! schema::v1::parser::parse_message_type, +//! schema::parser::parse_message_type, //! }; //! //! 
let path = Path::new("/path/to/column_sample.parquet"); diff --git a/rust/parquet/src/column/page.rs b/rust/parquet/src/column/page.rs index 76ac1708112..0573616fa8d 100644 --- a/rust/parquet/src/column/page.rs +++ b/rust/parquet/src/column/page.rs @@ -20,7 +20,7 @@ use crate::basic::{Encoding, PageType}; use crate::errors::Result; use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics}; -use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; /// Parquet Page definition. diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 1704c8ef610..d8c2e7a8ebd 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -30,7 +30,7 @@ use crate::encodings::{ levels::LevelDecoder, }; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; use crate::util::memory::ByteBufferPtr; /// Column reader for a Parquet type. 
@@ -509,7 +509,7 @@ mod tests { use crate::basic::Type as PhysicalType; use crate::column::page::Page; - use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; use crate::util::test_common::make_pages; const NUM_LEVELS: usize = 128; diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 18fc774c4ab..533a8e69a51 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -33,7 +33,7 @@ use crate::file::{ metadata::ColumnChunkMetaData, properties::{WriterProperties, WriterPropertiesPtr, WriterVersion}, }; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::FromBytes; use crate::util::memory::{ByteBufferPtr, MemTracker}; @@ -999,7 +999,7 @@ mod tests { properties::WriterProperties, reader::SerializedPageReader, writer::SerializedPageWriter, }; - use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; use crate::util::{ io::{FileSink, FileSource}, test_common::{get_temp_file, random_numbers_range}, diff --git a/rust/parquet/src/encodings/decoding.rs b/rust/parquet/src/encodings/decoding.rs index b512e55b073..ee7ad5ae95c 100644 --- a/rust/parquet/src/encodings/decoding.rs +++ b/rust/parquet/src/encodings/decoding.rs @@ -25,7 +25,7 @@ use crate::basic::*; use crate::data_type::private::*; use crate::data_type::*; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; use crate::util::{ bit_util::{self, BitReader, FromBytes}, memory::{ByteBuffer, ByteBufferPtr}, @@ -806,7 +806,7 @@ mod tests { use std::sync::Arc; - use crate::schema::v1::types::{ + use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; use crate::util::{ 
diff --git a/rust/parquet/src/encodings/encoding.rs b/rust/parquet/src/encodings/encoding.rs index 88840d17d20..fdd616e9e27 100644 --- a/rust/parquet/src/encodings/encoding.rs +++ b/rust/parquet/src/encodings/encoding.rs @@ -24,7 +24,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; use crate::util::{ bit_util::{self, log2, num_required_bits, BitWriter}, hash_util, @@ -923,7 +923,7 @@ mod tests { use std::sync::Arc; use crate::decoding::{get_decoder, Decoder, DictDecoder, PlainDecoder}; - use crate::schema::v1::types::{ + use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; use crate::util::{ diff --git a/rust/parquet/src/file/footer.rs b/rust/parquet/src/file/footer.rs index f2ed5c16304..2e572944868 100644 --- a/rust/parquet/src/file/footer.rs +++ b/rust/parquet/src/file/footer.rs @@ -33,7 +33,7 @@ use crate::file::{ PARQUET_MAGIC, }; -use crate::schema::v1::types::{self, SchemaDescriptor}; +use crate::schema::types::{self, SchemaDescriptor}; /// Layout of Parquet file /// +---------------------------+-----+---+ @@ -159,7 +159,7 @@ mod tests { use crate::basic::SortOrder; use crate::basic::Type; - use crate::schema::v1::types::Type as SchemaType; + use crate::schema::types::Type as SchemaType; use crate::util::test_common::get_temp_file; use parquet_format::TypeDefinedOrder; diff --git a/rust/parquet/src/file/metadata.rs b/rust/parquet/src/file/metadata.rs index 84e8f0cd485..150c42c578a 100644 --- a/rust/parquet/src/file/metadata.rs +++ b/rust/parquet/src/file/metadata.rs @@ -40,7 +40,7 @@ use parquet_format::{ColumnChunk, ColumnMetaData, RowGroup}; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; use crate::errors::{ParquetError, Result}; use crate::file::statistics::{self, Statistics}; -use 
crate::schema::v1::types::{ +use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, Type as SchemaType, }; diff --git a/rust/parquet/src/file/mod.rs b/rust/parquet/src/file/mod.rs index 9e418978c79..f85de98ccab 100644 --- a/rust/parquet/src/file/mod.rs +++ b/rust/parquet/src/file/mod.rs @@ -34,7 +34,7 @@ //! properties::WriterProperties, //! writer::{FileWriter, SerializedFileWriter}, //! }, -//! schema::v1::parser::parse_message_type, +//! schema::parser::parse_message_type, //! }; //! //! let path = Path::new("/path/to/sample.parquet"); diff --git a/rust/parquet/src/file/properties.rs b/rust/parquet/src/file/properties.rs index 4a59eab6c71..b0b25f9b952 100644 --- a/rust/parquet/src/file/properties.rs +++ b/rust/parquet/src/file/properties.rs @@ -23,7 +23,7 @@ //! use parquet::{ //! basic::{Compression, Encoding}, //! file::properties::*, -//! schema::v1::types::ColumnPath, +//! schema::types::ColumnPath, //! }; //! //! // Create properties with default configuration. 
@@ -52,7 +52,7 @@ use std::{collections::HashMap, sync::Arc}; use crate::basic::{Compression, Encoding}; use crate::file::metadata::KeyValue; -use crate::schema::v1::types::ColumnPath; +use crate::schema::types::ColumnPath; const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; const DEFAULT_WRITE_BATCH_SIZE: usize = 1024; diff --git a/rust/parquet/src/file/reader.rs b/rust/parquet/src/file/reader.rs index 1f8f14b2c47..7fb8ee211cd 100644 --- a/rust/parquet/src/file/reader.rs +++ b/rust/parquet/src/file/reader.rs @@ -26,7 +26,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::metadata::*; pub use crate::file::serialized_reader::{SerializedFileReader, SerializedPageReader}; use crate::record::reader::RowIter; -use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType}; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType}; use crate::basic::Type; diff --git a/rust/parquet/src/file/serialized_reader.rs b/rust/parquet/src/file/serialized_reader.rs index 1642e3c5611..b0d1d0c7b31 100644 --- a/rust/parquet/src/file/serialized_reader.rs +++ b/rust/parquet/src/file/serialized_reader.rs @@ -30,7 +30,7 @@ use crate::errors::{ParquetError, Result}; use crate::file::{footer, metadata::*, reader::*, statistics}; use crate::record::reader::RowIter; use crate::record::Row; -use crate::schema::v1::types::Type as SchemaType; +use crate::schema::types::Type as SchemaType; use crate::util::{io::TryClone, memory::ByteBufferPtr}; // export `SliceableCursor` and `FileSource` publically so clients can @@ -394,7 +394,7 @@ mod tests { use super::*; use crate::basic::ColumnOrder; use crate::record::RowAccessor; - use crate::schema::v1::parser::parse_message_type; + use crate::schema::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; use std::sync::Arc; diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index 422646b9ae4..3125c652c26 100644 --- 
a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -37,7 +37,7 @@ use crate::file::{ metadata::*, properties::WriterPropertiesPtr, statistics::to_thrift as statistics_to_thrift, FOOTER_SIZE, PARQUET_MAGIC, }; -use crate::schema::v1::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; +use crate::schema::types::{self, SchemaDescPtr, SchemaDescriptor, TypePtr}; use crate::util::io::{FileSink, Position}; // Exposed publically so client code can implement [`ParquetWriter`] diff --git a/rust/parquet/src/record/api.rs b/rust/parquet/src/record/api.rs index 28f8dc949e6..07f82160db4 100644 --- a/rust/parquet/src/record/api.rs +++ b/rust/parquet/src/record/api.rs @@ -25,7 +25,7 @@ use num_bigint::{BigInt, Sign}; use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::data_type::{ByteArray, Decimal, Int96}; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; #[cfg(feature = "cli")] use serde_json::Value; @@ -833,7 +833,7 @@ mod tests { use std::sync::Arc; - use crate::schema::v1::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; + use crate::schema::types::{ColumnDescriptor, ColumnPath, PrimitiveTypeBuilder}; /// Creates test column descriptor based on provided type parameters. macro_rules! 
make_column_descr { diff --git a/rust/parquet/src/record/reader.rs b/rust/parquet/src/record/reader.rs index d8902480cfa..2323cd17b71 100644 --- a/rust/parquet/src/record/reader.rs +++ b/rust/parquet/src/record/reader.rs @@ -27,9 +27,7 @@ use crate::record::{ api::{make_list, make_map, make_row, Field, Row}, triplet::TripletIter, }; -use crate::schema::v1::types::{ - ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr, -}; +use crate::schema::types::{ColumnPath, SchemaDescPtr, SchemaDescriptor, Type, TypePtr}; /// Default batch size for a reader const DEFAULT_BATCH_SIZE: usize = 1024; @@ -823,7 +821,7 @@ mod tests { use crate::errors::{ParquetError, Result}; use crate::file::reader::{FileReader, SerializedFileReader}; use crate::record::api::{Field, Row, RowAccessor, RowFormatter}; - use crate::schema::v1::parser::parse_message_type; + use crate::schema::parser::parse_message_type; use crate::util::test_common::{get_test_file, get_test_path}; use std::convert::TryFrom; diff --git a/rust/parquet/src/record/triplet.rs b/rust/parquet/src/record/triplet.rs index de623970704..bb4f942fd18 100644 --- a/rust/parquet/src/record/triplet.rs +++ b/rust/parquet/src/record/triplet.rs @@ -20,7 +20,7 @@ use crate::column::reader::{get_typed_column_reader, ColumnReader, ColumnReaderI use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::record::api::Field; -use crate::schema::v1::types::ColumnDescPtr; +use crate::schema::types::ColumnDescPtr; /// Macro to generate simple functions that cover all types of triplet iterator. 
/// $func is a function of a typed triplet iterator and $token is a either {`ref`} or @@ -359,7 +359,7 @@ mod tests { use super::*; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::schema::v1::types::ColumnPath; + use crate::schema::types::ColumnPath; use crate::util::test_common::get_test_file; #[test] diff --git a/rust/parquet/src/schema/mod.rs b/rust/parquet/src/schema/mod.rs index 749ab1fe783..1ebee2e06e8 100644 --- a/rust/parquet/src/schema/mod.rs +++ b/rust/parquet/src/schema/mod.rs @@ -15,5 +15,53 @@ // specific language governing permissions and limitations // under the License. -pub mod v1; -pub mod v2; +//! Parquet schema definitions and methods to print and parse schema. +//! +//! # Example +//! +//! ```rust +//! use parquet::{ +//! basic::{ConvertedType, Repetition, Type as PhysicalType}, +//! schema::{parser, printer, types::Type}, +//! }; +//! use std::sync::Arc; +//! +//! // Create the following schema: +//! // +//! // message schema { +//! // OPTIONAL BYTE_ARRAY a (UTF8); +//! // REQUIRED INT32 b; +//! // } +//! +//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) +//! .with_converted_type(ConvertedType::UTF8) +//! .with_repetition(Repetition::OPTIONAL) +//! .build() +//! .unwrap(); +//! +//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) +//! .with_repetition(Repetition::REQUIRED) +//! .build() +//! .unwrap(); +//! +//! let schema = Type::group_type_builder("schema") +//! .with_fields(&mut vec![Arc::new(field_a), Arc::new(field_b)]) +//! .build() +//! .unwrap(); +//! +//! let mut buf = Vec::new(); +//! +//! // Print schema into buffer +//! printer::print_schema(&mut buf, &schema); +//! +//! // Parse schema from the string +//! let string_schema = String::from_utf8(buf).unwrap(); +//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); +//! +//! assert_eq!(schema, parsed_schema); +//! 
``` + +pub mod parser; +pub mod printer; +pub mod types; +pub mod visitor; diff --git a/rust/parquet/src/schema/v1/parser.rs b/rust/parquet/src/schema/parser.rs similarity index 99% rename from rust/parquet/src/schema/v1/parser.rs rename to rust/parquet/src/schema/parser.rs index b9b8c77aeb1..50f00bb5534 100644 --- a/rust/parquet/src/schema/v1/parser.rs +++ b/rust/parquet/src/schema/parser.rs @@ -17,12 +17,12 @@ //! Parquet schema parser. //! Provides methods to parse and validate string message type into Parquet -//! [`Type`](crate::schema::v1::types::Type). +//! [`Type`](crate::schema::types::Type). //! //! # Example //! //! ```rust -//! use parquet::schema::v1::parser::parse_message_type; +//! use parquet::schema::parser::parse_message_type; //! //! let message_type = " //! message spark_schema { @@ -46,9 +46,9 @@ use std::sync::Arc; use crate::basic::{ConvertedType, Repetition, Type as PhysicalType}; use crate::errors::{ParquetError, Result}; -use crate::schema::v1::types::{Type, TypePtr}; +use crate::schema::types::{Type, TypePtr}; -/// Parses message type as string into a Parquet [`Type`](crate::schema::v1::types::Type) +/// Parses message type as string into a Parquet [`Type`](crate::schema::types::Type) /// which, for example, could be used to extract individual columns. Returns Parquet /// general error when parsing or validation fails. pub fn parse_message_type(message_type: &str) -> Result { diff --git a/rust/parquet/src/schema/v1/printer.rs b/rust/parquet/src/schema/printer.rs similarity index 98% rename from rust/parquet/src/schema/v1/printer.rs rename to rust/parquet/src/schema/printer.rs index 75b33ef14d9..81ada8f6f99 100644 --- a/rust/parquet/src/schema/v1/printer.rs +++ b/rust/parquet/src/schema/printer.rs @@ -23,7 +23,7 @@ //! ```rust //! use parquet::{ //! file::reader::{FileReader, SerializedFileReader}, -//! schema::v1::printer::{print_file_metadata, print_parquet_metadata, print_schema}, +//! 
schema::printer::{print_file_metadata, print_parquet_metadata, print_schema}, //! }; //! use std::{fs::File, path::Path}; //! @@ -49,7 +49,7 @@ use crate::basic::{ConvertedType, Type as PhysicalType}; use crate::file::metadata::{ ColumnChunkMetaData, FileMetaData, ParquetMetaData, RowGroupMetaData, }; -use crate::schema::v1::types::Type; +use crate::schema::types::Type; /// Prints Parquet metadata [`ParquetMetaData`](crate::file::metadata::ParquetMetaData) /// information. @@ -92,7 +92,7 @@ pub fn print_file_metadata(out: &mut io::Write, file_metadata: &FileMetaData) { print_schema(out, schema); } -/// Prints Parquet [`Type`](crate::schema::v1::types::Type) information. +/// Prints Parquet [`Type`](crate::schema::types::Type) information. #[allow(unused_must_use)] pub fn print_schema(out: &mut io::Write, tp: &Type) { // TODO: better if we can pass fmt::Write to Printer. @@ -274,7 +274,7 @@ mod tests { use std::sync::Arc; use crate::basic::{Repetition, Type as PhysicalType}; - use crate::schema::v1::{parser::parse_message_type, types::Type}; + use crate::schema::{parser::parse_message_type, types::Type}; fn assert_print_parse_message(message: Type) { let mut s = String::new(); diff --git a/rust/parquet/src/schema/v1/types.rs b/rust/parquet/src/schema/types.rs similarity index 99% rename from rust/parquet/src/schema/v1/types.rs rename to rust/parquet/src/schema/types.rs index 56c2d4444f4..5f03b0dcb3e 100644 --- a/rust/parquet/src/schema/v1/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -563,7 +563,7 @@ impl ColumnPath { /// Returns string representation of this column path. /// ```rust - /// use parquet::schema::v1::types::ColumnPath; + /// use parquet::schema::types::ColumnPath; /// /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]); /// assert_eq!(&path.string(), "a.b.c"); @@ -574,7 +574,7 @@ impl ColumnPath { /// Appends more components to end of column path. 
/// ```rust - /// use parquet::schema::v1::types::ColumnPath; + /// use parquet::schema::types::ColumnPath; /// /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c" /// .to_string()]); @@ -674,12 +674,12 @@ impl ColumnDescriptor { &self.path } - /// Returns self type [`Type`](crate::schema::v1::types::Type) for this leaf column. + /// Returns self type [`Type`](crate::schema::types::Type) for this leaf column. pub fn self_type(&self) -> &Type { self.primitive_type.as_ref() } - /// Returns self type [`TypePtr`](crate::schema::v1::types::TypePtr) for this leaf + /// Returns self type [`TypePtr`](crate::schema::types::TypePtr) for this leaf /// column. pub fn self_type_ptr(&self) -> TypePtr { self.primitive_type.clone() @@ -805,13 +805,13 @@ impl SchemaDescriptor { self.leaves.len() } - /// Returns column root [`Type`](crate::schema::v1::types::Type) for a field position. + /// Returns column root [`Type`](crate::schema::types::Type) for a field position. pub fn get_column_root(&self, i: usize) -> &Type { let result = self.column_root_of(i); result.as_ref() } - /// Returns column root [`Type`](crate::schema::v1::types::Type) pointer for a field + /// Returns column root [`Type`](crate::schema::types::Type) pointer for a field /// position. pub fn get_column_root_ptr(&self, i: usize) -> TypePtr { let result = self.column_root_of(i); @@ -831,7 +831,7 @@ impl SchemaDescriptor { .unwrap_or_else(|| panic!("Expected a value for index {} but found None", i)) } - /// Returns schema as [`Type`](crate::schema::v1::types::Type). + /// Returns schema as [`Type`](crate::schema::types::Type). 
pub fn root_schema(&self) -> &Type { self.schema.as_ref() } @@ -1111,7 +1111,7 @@ fn to_thrift_helper( mod tests { use super::*; - use crate::schema::v1::parser::parse_message_type; + use crate::schema::parser::parse_message_type; // TODO: add tests for v2 types diff --git a/rust/parquet/src/schema/v1/mod.rs b/rust/parquet/src/schema/v1/mod.rs deleted file mode 100644 index 8cb7f23b57a..00000000000 --- a/rust/parquet/src/schema/v1/mod.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Parquet schema definitions and methods to print and parse schema. -//! -//! # Example -//! -//! ```rust -//! use parquet::{ -//! basic::{ConvertedType, Repetition, Type as PhysicalType}, -//! schema::v1::{parser, printer, types::Type}, -//! }; -//! use std::sync::Arc; -//! -//! // Create the following schema: -//! // -//! // message schema { -//! // OPTIONAL BYTE_ARRAY a (UTF8); -//! // REQUIRED INT32 b; -//! // } -//! -//! let field_a = Type::primitive_type_builder("a", PhysicalType::BYTE_ARRAY) -//! .with_converted_type(ConvertedType::UTF8) -//! .with_repetition(Repetition::OPTIONAL) -//! .build() -//! .unwrap(); -//! -//! let field_b = Type::primitive_type_builder("b", PhysicalType::INT32) -//! 
.with_repetition(Repetition::REQUIRED) -//! .build() -//! .unwrap(); -//! -//! let schema = Type::group_type_builder("schema") -//! .with_fields(&mut vec![Arc::new(field_a), Arc::new(field_b)]) -//! .build() -//! .unwrap(); -//! -//! let mut buf = Vec::new(); -//! -//! // Print schema into buffer -//! printer::print_schema(&mut buf, &schema); -//! -//! // Parse schema from the string -//! let string_schema = String::from_utf8(buf).unwrap(); -//! let parsed_schema = parser::parse_message_type(&string_schema).unwrap(); -//! -//! assert_eq!(schema, parsed_schema); -//! ``` - -pub mod parser; -pub mod printer; -pub mod types; -pub mod visitor; diff --git a/rust/parquet/src/schema/v2/mod.rs b/rust/parquet/src/schema/v2/mod.rs deleted file mode 100644 index 8b137891791..00000000000 --- a/rust/parquet/src/schema/v2/mod.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/rust/parquet/src/schema/v1/visitor.rs b/rust/parquet/src/schema/visitor.rs similarity index 98% rename from rust/parquet/src/schema/v1/visitor.rs rename to rust/parquet/src/schema/visitor.rs index c9574fb9bdf..61bc3be951d 100644 --- a/rust/parquet/src/schema/v1/visitor.rs +++ b/rust/parquet/src/schema/visitor.rs @@ -18,7 +18,7 @@ use crate::basic::{ConvertedType, Repetition}; use crate::errors::ParquetError::General; use crate::errors::Result; -use crate::schema::v1::types::{Type, TypePtr}; +use crate::schema::types::{Type, TypePtr}; /// A utility trait to help user to traverse against parquet type. 
pub trait TypeVisitor { @@ -124,8 +124,8 @@ mod tests { use super::TypeVisitor; use crate::basic::Type as PhysicalType; use crate::errors::Result; - use crate::schema::v1::parser::parse_message_type; - use crate::schema::v1::types::TypePtr; + use crate::schema::parser::parse_message_type; + use crate::schema::types::TypePtr; use std::sync::Arc; struct TestVisitorContext {} diff --git a/rust/parquet/src/util/test_common/page_util.rs b/rust/parquet/src/util/test_common/page_util.rs index 8488bbe53d3..e360f3da52a 100644 --- a/rust/parquet/src/util/test_common/page_util.rs +++ b/rust/parquet/src/util/test_common/page_util.rs @@ -23,7 +23,7 @@ use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; use crate::encodings::levels::max_buffer_size; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; -use crate::schema::v1::types::{ColumnDescPtr, SchemaDescPtr}; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; use crate::util::memory::MemTracker; use crate::util::memory::MemTrackerPtr; diff --git a/rust/parquet/tests/custom_writer.rs b/rust/parquet/tests/custom_writer.rs index f153857ee24..0a57e79d955 100644 --- a/rust/parquet/tests/custom_writer.rs +++ b/rust/parquet/tests/custom_writer.rs @@ -25,7 +25,7 @@ use std::{ use parquet::file::writer::TryClone; use parquet::{ basic::Repetition, basic::Type, file::properties::WriterProperties, - file::writer::SerializedFileWriter, schema::v1::types, + file::writer::SerializedFileWriter, schema::types, }; use std::env; diff --git a/rust/parquet_derive_test/src/lib.rs b/rust/parquet_derive_test/src/lib.rs index 505c94843f4..b4bfc42cab2 100644 --- a/rust/parquet_derive_test/src/lib.rs +++ b/rust/parquet_derive_test/src/lib.rs @@ -50,7 +50,7 @@ mod tests { properties::WriterProperties, writer::{FileWriter, SerializedFileWriter}, }, - schema::v1::parser::parse_message_type, + schema::parser::parse_message_type, }; use std::{env, fs, io::Write, sync::Arc}; 
From 396b97199e4d159c01f1cc5025db49c14ab74551 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Mon, 1 Mar 2021 06:03:44 +0200 Subject: [PATCH 08/10] address review feedback, add more tests --- rust/parquet/src/basic.rs | 162 +++++++++++++++++++++++++------ rust/parquet/src/file/writer.rs | 57 ++++++++++- rust/parquet/src/schema/types.rs | 2 + 3 files changed, 187 insertions(+), 34 deletions(-) diff --git a/rust/parquet/src/basic.rs b/rust/parquet/src/basic.rs index 1f061bb837b..73142f75052 100644 --- a/rust/parquet/src/basic.rs +++ b/rust/parquet/src/basic.rs @@ -27,7 +27,7 @@ use crate::errors::ParquetError; // Re-export parquet_format types used in this module pub use parquet_format::{ BsonType, DateType, DecimalType, EnumType, IntType, JsonType, ListType, MapType, - NullType, StringType, TimeType, TimestampType, UUIDType, + NullType, StringType, TimeType, TimeUnit, TimestampType, UUIDType, }; // ---------------------------------------------------------------------- @@ -397,10 +397,7 @@ impl ColumnOrder { // When looking for NaN values, min and max should be ignored. Type::FLOAT | Type::DOUBLE => SortOrder::SIGNED, // Unsigned byte-wise comparison - Type::BYTE_ARRAY => SortOrder::UNSIGNED, - // Only unsigned if there was a logical type that supports unsigned sort. - // Interval has no defined sort order, and should not use UNSIGNED. 
- Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNDEFINED, + Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => SortOrder::UNSIGNED, } } @@ -630,14 +627,14 @@ impl From> for ConvertedType { LogicalType::DECIMAL(_) => ConvertedType::DECIMAL, LogicalType::DATE(_) => ConvertedType::DATE, LogicalType::TIME(t) => match t.unit { - parquet::TimeUnit::MILLIS(_) => ConvertedType::TIME_MILLIS, - parquet::TimeUnit::MICROS(_) => ConvertedType::TIME_MICROS, - parquet::TimeUnit::NANOS(_) => ConvertedType::NONE, + TimeUnit::MILLIS(_) => ConvertedType::TIME_MILLIS, + TimeUnit::MICROS(_) => ConvertedType::TIME_MICROS, + TimeUnit::NANOS(_) => ConvertedType::NONE, }, LogicalType::TIMESTAMP(t) => match t.unit { - parquet::TimeUnit::MILLIS(_) => ConvertedType::TIMESTAMP_MILLIS, - parquet::TimeUnit::MICROS(_) => ConvertedType::TIMESTAMP_MICROS, - parquet::TimeUnit::NANOS(_) => ConvertedType::NONE, + TimeUnit::MILLIS(_) => ConvertedType::TIMESTAMP_MILLIS, + TimeUnit::MICROS(_) => ConvertedType::TIMESTAMP_MICROS, + TimeUnit::NANOS(_) => ConvertedType::NONE, }, LogicalType::INTEGER(t) => match (t.bit_width, t.is_signed) { (8, true) => ConvertedType::INT_8, @@ -834,7 +831,7 @@ impl str::FromStr for ConvertedType { "JSON" => Ok(ConvertedType::JSON), "BSON" => Ok(ConvertedType::BSON), "INTERVAL" => Ok(ConvertedType::INTERVAL), - other => Err(general_err!("Invalid logical type {}", other)), + other => Err(general_err!("Invalid converted type {}", other)), } } } @@ -877,50 +874,50 @@ impl str::FromStr for LogicalType { is_signed: false, })), "MAP" => Ok(LogicalType::MAP(MapType {})), - // "MAP_KEY_VALUE" => Ok(ConvertedType::MAP_KEY_VALUE), "LIST" => Ok(LogicalType::LIST(ListType {})), "ENUM" => Ok(LogicalType::ENUM(EnumType {})), - // "DECIMAL" => Ok(ConvertedType::DECIMAL), + // TODO: ARROW-11365 + // "DECIMAL" => Ok(LogicalType::DECIMAL), "DATE" => Ok(LogicalType::DATE(DateType {})), "TIME(MILLIS,true)" => Ok(LogicalType::TIME(TimeType { is_adjusted_to_u_t_c: true, - unit: 
parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), })), "TIME(MILLIS,false)" => Ok(LogicalType::TIME(TimeType { is_adjusted_to_u_t_c: false, - unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), })), "TIME(MICROS,true)" => Ok(LogicalType::TIME(TimeType { is_adjusted_to_u_t_c: true, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), })), "TIME(MICROS,false)" => Ok(LogicalType::TIME(TimeType { is_adjusted_to_u_t_c: false, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), })), "TIMESTAMP(MILLIS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: true, - unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), })), "TIMESTAMP(MILLIS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: false, - unit: parquet::TimeUnit::MILLIS(parquet::MilliSeconds {}), + unit: TimeUnit::MILLIS(parquet::MilliSeconds {}), })), "TIMESTAMP(MICROS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: true, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), })), "TIMESTAMP(MICROS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: false, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), })), "TIMESTAMP(NANOS,true)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: true, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: TimeUnit::MICROS(parquet::MicroSeconds {}), })), "TIMESTAMP(NANOS,false)" => Ok(LogicalType::TIMESTAMP(TimestampType { is_adjusted_to_u_t_c: false, - unit: parquet::TimeUnit::MICROS(parquet::MicroSeconds {}), + unit: 
TimeUnit::MICROS(parquet::MicroSeconds {}), })), "STRING" => Ok(LogicalType::STRING(StringType {})), "JSON" => Ok(LogicalType::JSON(JsonType {})), @@ -1435,42 +1432,42 @@ mod tests { ); assert_eq!( ConvertedType::from(Some(LogicalType::TIME(TimeType { - unit: parquet::TimeUnit::MILLIS(Default::default()), + unit: TimeUnit::MILLIS(Default::default()), is_adjusted_to_u_t_c: true, }))), ConvertedType::TIME_MILLIS ); assert_eq!( ConvertedType::from(Some(LogicalType::TIME(TimeType { - unit: parquet::TimeUnit::MICROS(Default::default()), + unit: TimeUnit::MICROS(Default::default()), is_adjusted_to_u_t_c: true, }))), ConvertedType::TIME_MICROS ); assert_eq!( ConvertedType::from(Some(LogicalType::TIME(TimeType { - unit: parquet::TimeUnit::NANOS(Default::default()), + unit: TimeUnit::NANOS(Default::default()), is_adjusted_to_u_t_c: false, }))), ConvertedType::NONE ); assert_eq!( ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { - unit: parquet::TimeUnit::MILLIS(Default::default()), + unit: TimeUnit::MILLIS(Default::default()), is_adjusted_to_u_t_c: true, }))), ConvertedType::TIMESTAMP_MILLIS ); assert_eq!( ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { - unit: parquet::TimeUnit::MICROS(Default::default()), + unit: TimeUnit::MICROS(Default::default()), is_adjusted_to_u_t_c: false, }))), ConvertedType::TIMESTAMP_MICROS ); assert_eq!( ConvertedType::from(Some(LogicalType::TIMESTAMP(TimestampType { - unit: parquet::TimeUnit::NANOS(Default::default()), + unit: TimeUnit::NANOS(Default::default()), is_adjusted_to_u_t_c: false, }))), ConvertedType::NONE @@ -1811,9 +1808,110 @@ mod tests { } #[test] - fn test_column_order_get_sort_order() { + fn test_column_order_get_logical_type_sort_order() { // Helper to check the order in a list of values. // Only logical type is checked. 
+ fn check_sort_order(types: Vec, expected_order: SortOrder) { + for tpe in types { + assert_eq!( + ColumnOrder::get_sort_order( + Some(tpe), + ConvertedType::NONE, + Type::BYTE_ARRAY + ), + expected_order + ); + } + } + + // Unsigned comparison (physical type does not matter) + let unsigned = vec![ + LogicalType::STRING(Default::default()), + LogicalType::JSON(Default::default()), + LogicalType::BSON(Default::default()), + LogicalType::ENUM(Default::default()), + LogicalType::UUID(Default::default()), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 16, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 32, + is_signed: false, + }), + LogicalType::INTEGER(IntType { + bit_width: 64, + is_signed: false, + }), + ]; + check_sort_order(unsigned, SortOrder::UNSIGNED); + + // Signed comparison (physical type does not matter) + let signed = vec![ + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: true, + }), + LogicalType::DECIMAL(DecimalType { + scale: 20, + precision: 4, + }), + LogicalType::DATE(Default::default()), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(Default::default()), + }), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(Default::default()), + }), + LogicalType::TIME(TimeType { + is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MILLIS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + is_adjusted_to_u_t_c: false, + unit: TimeUnit::MICROS(Default::default()), + }), + LogicalType::TIMESTAMP(TimestampType { + 
is_adjusted_to_u_t_c: true, + unit: TimeUnit::NANOS(Default::default()), + }), + ]; + check_sort_order(signed, SortOrder::SIGNED); + + // Undefined comparison + let undefined = vec![ + LogicalType::LIST(Default::default()), + LogicalType::MAP(Default::default()), + ]; + check_sort_order(undefined, SortOrder::UNDEFINED); + } + + #[test] + fn test_column_order_get_coverted_type_sort_order() { + // Helper to check the order in a list of values. + // Only converted type is checked. fn check_sort_order(types: Vec, expected_order: SortOrder) { for tpe in types { assert_eq!( @@ -1898,7 +1996,7 @@ mod tests { ); assert_eq!( ColumnOrder::get_default_sort_order(Type::FIXED_LEN_BYTE_ARRAY), - SortOrder::UNDEFINED + SortOrder::UNSIGNED ); } diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index 3125c652c26..ce66962cc88 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -534,11 +534,11 @@ mod tests { use std::{fs::File, io::Cursor}; - use crate::basic::{Compression, Encoding, Repetition, Type}; + use crate::basic::{Compression, Encoding, IntType, LogicalType, Repetition, Type}; use crate::column::page::PageReader; use crate::compression::{create_codec, Codec}; use crate::file::{ - properties::WriterProperties, + properties::{WriterProperties, WriterVersion}, reader::{FileReader, SerializedFileReader, SerializedPageReader}, statistics::{from_thrift, to_thrift, Statistics}, }; @@ -723,6 +723,59 @@ mod tests { ); } + #[test] + fn test_file_writer_v2_with_metadata() { + let file = get_temp_file("test_file_writer_write_with_metadata", &[]); + let field_logical_type = Some(LogicalType::INTEGER(IntType { + bit_width: 8, + is_signed: false, + })); + let field = Arc::new( + types::Type::primitive_type_builder("col1", Type::INT32) + .with_logical_type(field_logical_type.clone()) + .with_converted_type(field_logical_type.into()) + .build() + .unwrap(), + ); + let schema = Arc::new( + types::Type::group_type_builder("schema") 
+ .with_fields(&mut vec![field.clone()]) + .build() + .unwrap(), + ); + let props = Arc::new( + WriterProperties::builder() + .set_key_value_metadata(Some(vec![KeyValue::new( + "key".to_string(), + "value".to_string(), + )])) + .set_writer_version(WriterVersion::PARQUET_2_0) + .build(), + ); + let mut writer = + SerializedFileWriter::new(file.try_clone().unwrap(), schema, props).unwrap(); + writer.close().unwrap(); + + let reader = SerializedFileReader::new(file).unwrap(); + + assert_eq!( + reader + .metadata() + .file_metadata() + .key_value_metadata() + .to_owned() + .unwrap() + .len(), + 1 + ); + + // ARROW-11803: Test that the converted and logical types have been populated + let fields = reader.metadata().file_metadata().schema().get_fields(); + assert_eq!(fields.len(), 1); + let read_field = fields.get(0).unwrap(); + assert_eq!(read_field, &field); + } + #[test] fn test_file_writer_empty_row_groups() { let file = get_temp_file("test_file_writer_write_empty_row_groups", &[]); diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index 5f03b0dcb3e..9b55f5e2ee2 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -1011,6 +1011,8 @@ fn from_thrift_helper( } /// Method to convert to Thrift. +/// The `writer_version` is used to determine whether to populate `LogicalType`. +/// Only the `ConvertedType` is populated if using version 1 of the writer. 
pub fn to_thrift(schema: &Type, writer_version: i32) -> Result> { if !schema.is_group() { return Err(general_err!("Root schema must be Group type")); From 7233930c12009eb495a5033178d58deca1851397 Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Tue, 2 Mar 2021 06:35:29 +0200 Subject: [PATCH 09/10] Always write logical types --- rust/parquet/src/file/writer.rs | 5 ++--- rust/parquet/src/schema/types.rs | 36 ++++++++++---------------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index ce66962cc88..bfc6534c72a 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -175,10 +175,9 @@ impl SerializedFileWriter { /// Assembles and writes metadata at the end of the file. fn write_metadata(&mut self) -> Result<()> { - let writer_version = self.props.writer_version().as_num(); let file_metadata = parquet::FileMetaData { - version: writer_version, - schema: types::to_thrift(self.schema.as_ref(), writer_version)?, + version: self.props.writer_version().as_num(), + schema: types::to_thrift(self.schema.as_ref())?, num_rows: self.total_num_rows as i64, row_groups: self .row_groups diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs index 9b55f5e2ee2..d80fe0d011f 100644 --- a/rust/parquet/src/schema/types.rs +++ b/rust/parquet/src/schema/types.rs @@ -1011,24 +1011,18 @@ fn from_thrift_helper( } /// Method to convert to Thrift. -/// The `writer_version` is used to determine whether to populate `LogicalType`. -/// Only the `ConvertedType` is populated if using version 1 of the writer. 
-pub fn to_thrift(schema: &Type, writer_version: i32) -> Result> { +pub fn to_thrift(schema: &Type) -> Result> { if !schema.is_group() { return Err(general_err!("Root schema must be Group type")); } let mut elements: Vec = Vec::new(); - to_thrift_helper(schema, &mut elements, writer_version); + to_thrift_helper(schema, &mut elements); Ok(elements) } /// Constructs list of `SchemaElement` from the schema using depth-first traversal. /// Here we assume that schema is always valid and starts with group type. -fn to_thrift_helper( - schema: &Type, - elements: &mut Vec, - writer_version: i32, -) { +fn to_thrift_helper(schema: &Type, elements: &mut Vec) { match *schema { Type::PrimitiveType { ref basic_info, @@ -1059,11 +1053,7 @@ fn to_thrift_helper( } else { None }, - logical_type: if writer_version > 1 { - basic_info.logical_type().map(|value| value.into()) - } else { - None - }, + logical_type: basic_info.logical_type().map(|value| value.into()), }; elements.push(element); @@ -1092,18 +1082,14 @@ fn to_thrift_helper( } else { None }, - logical_type: if writer_version > 1 { - basic_info.logical_type().map(|value| value.into()) - } else { - None - }, + logical_type: basic_info.logical_type().map(|value| value.into()), }; elements.push(element); // Add child elements for a group for field in fields { - to_thrift_helper(field, elements, writer_version); + to_thrift_helper(field, elements); } } } @@ -1815,7 +1801,7 @@ mod tests { let schema = Type::primitive_type_builder("col", PhysicalType::INT32) .build() .unwrap(); - let thrift_schema = to_thrift(&schema, 1); + let thrift_schema = to_thrift(&schema); assert!(thrift_schema.is_err()); if let Err(e) = thrift_schema { assert_eq!( @@ -1874,7 +1860,7 @@ mod tests { } "; let expected_schema = parse_message_type(message_type).unwrap(); - let thrift_schema = to_thrift(&expected_schema, 1).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); let result_schema = from_thrift(&thrift_schema).unwrap(); 
assert_eq!(result_schema, Arc::new(expected_schema)); } @@ -1890,7 +1876,7 @@ mod tests { } "; let expected_schema = parse_message_type(message_type).unwrap(); - let thrift_schema = to_thrift(&expected_schema, 1).unwrap(); + let thrift_schema = to_thrift(&expected_schema).unwrap(); let result_schema = from_thrift(&thrift_schema).unwrap(); assert_eq!(result_schema, Arc::new(expected_schema)); } @@ -1912,7 +1898,7 @@ mod tests { "; let expected_schema = parse_message_type(message_type).unwrap(); - let mut thrift_schema = to_thrift(&expected_schema, 1).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); // Change all of None to Some(0) for mut elem in &mut thrift_schema[..] { if elem.num_children == None { @@ -1937,7 +1923,7 @@ mod tests { "; let expected_schema = parse_message_type(message_type).unwrap(); - let mut thrift_schema = to_thrift(&expected_schema, 1).unwrap(); + let mut thrift_schema = to_thrift(&expected_schema).unwrap(); thrift_schema[0].repetition_type = Some(Repetition::REQUIRED.into()); let result_schema = from_thrift(&thrift_schema).unwrap(); From 360855f48fbf0f85896bf1e13af8a7890e74c27b Mon Sep 17 00:00:00 2001 From: Neville Dipale Date: Sat, 6 Mar 2021 08:54:22 +0200 Subject: [PATCH 10/10] don't write same write for 2 tests --- rust/parquet/src/file/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/parquet/src/file/writer.rs b/rust/parquet/src/file/writer.rs index bfc6534c72a..265014bf683 100644 --- a/rust/parquet/src/file/writer.rs +++ b/rust/parquet/src/file/writer.rs @@ -724,7 +724,7 @@ mod tests { #[test] fn test_file_writer_v2_with_metadata() { - let file = get_temp_file("test_file_writer_write_with_metadata", &[]); + let file = get_temp_file("test_file_writer_v2_write_with_metadata", &[]); let field_logical_type = Some(LogicalType::INTEGER(IntType { bit_width: 8, is_signed: false,