From 66ce5b10937e4d7a89ec32d6a9544d4968da9552 Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Wed, 22 Jan 2020 16:16:53 +0100 Subject: [PATCH 1/4] fix: Don't let safe code arbitrarily transmute readers and writers This code were quite clearly unsafe (which seems to be known, judging from the comment about it). As the functions appeared to callable indirectly from other public functions I changed these to just panic instead. --- rust/parquet/src/column/reader.rs | 15 +--- rust/parquet/src/column/writer.rs | 42 ++--------- rust/parquet/src/data_type.rs | 118 ++++++++++++++++++++++++++++-- 3 files changed, 118 insertions(+), 57 deletions(-) diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index 887443f40be..a59dba0418f 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -20,7 +20,6 @@ use std::{ cmp::{max, min}, collections::HashMap, - mem, }; use super::page::{Page, PageReader}; @@ -90,21 +89,11 @@ pub fn get_column_reader( /// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of /// non-generic type to a generic column reader type `ColumnReaderImpl`. /// -/// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches -/// the type `T`. Otherwise, disastrous consequence could happen. +/// Panics if actual enum value for `col_reader` does not match the type `T`. pub fn get_typed_column_reader( col_reader: ColumnReader, ) -> ColumnReaderImpl { - match col_reader { - ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::Int64ColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::Int96ColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::FloatColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::DoubleColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::ByteArrayColumnReader(r) => unsafe { mem::transmute(r) }, - ColumnReader::FixedLenByteArrayColumnReader(r) => unsafe { mem::transmute(r) }, - } + T::get_column_reader(col_reader).expect("Column type mismatch") } /// Typed value reader for a particular primitive column. diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 1be6b80073a..575d52dfba9 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -17,7 +17,7 @@ //! Contains column writer API. -use std::{cmp, collections::VecDeque, mem, rc::Rc}; +use std::{cmp, collections::VecDeque, rc::Rc}; use crate::basic::{Compression, Encoding, PageType, Type}; use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter}; @@ -98,57 +98,25 @@ pub fn get_column_writer( /// Gets a typed column writer for the specific type `T`, by "up-casting" `col_writer` of /// non-generic type to a generic column writer type `ColumnWriterImpl`. /// -/// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches -/// the type `T`. Otherwise, disastrous consequence could happen. +/// Panics if actual enum value for `col_writer` does not match the type `T`. pub fn get_typed_column_writer( col_writer: ColumnWriter, ) -> ColumnWriterImpl { - match col_writer { - ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int64ColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int96ColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::FloatColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::DoubleColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::ByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, - ColumnWriter::FixedLenByteArrayColumnWriter(r) => unsafe { mem::transmute(r) }, - } + T::get_column_writer(col_writer).expect("Column type mismatch") } /// Similar to `get_typed_column_writer` but returns a reference. pub fn get_typed_column_writer_ref( col_writer: &ColumnWriter, ) -> &ColumnWriterImpl { - match col_writer { - ColumnWriter::BoolColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int32ColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int64ColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int96ColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::FloatColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::DoubleColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::ByteArrayColumnWriter(ref r) => unsafe { mem::transmute(r) }, - ColumnWriter::FixedLenByteArrayColumnWriter(ref r) => unsafe { - mem::transmute(r) - }, - } + T::get_column_writer_ref(col_writer).expect("Column type mismatch") } /// Similar to `get_typed_column_writer` but returns a reference. pub fn get_typed_column_writer_mut( col_writer: &mut ColumnWriter, ) -> &mut ColumnWriterImpl { - match col_writer { - ColumnWriter::BoolColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int32ColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int64ColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::Int96ColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::FloatColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::DoubleColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::ByteArrayColumnWriter(ref mut r) => unsafe { mem::transmute(r) }, - ColumnWriter::FixedLenByteArrayColumnWriter(ref mut r) => unsafe { - mem::transmute(r) - }, - } + T::get_column_writer_mut(col_writer).expect("Column type mismatch") } /// Typed column writer for a primitive column. diff --git a/rust/parquet/src/data_type.rs b/rust/parquet/src/data_type.rs index ef0299a79d2..d7e20fa5533 100644 --- a/rust/parquet/src/data_type.rs +++ b/rust/parquet/src/data_type.rs @@ -23,6 +23,8 @@ use std::mem; use byteorder::{BigEndian, ByteOrder}; use crate::basic::Type; +use crate::column::reader::{ColumnReader, ColumnReaderImpl}; +use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; use crate::errors::{ParquetError, Result}; use crate::util::memory::{ByteBuffer, ByteBufferPtr}; use std::str::from_utf8; @@ -376,10 +378,30 @@ pub trait DataType: 'static { /// Returns size in bytes for Rust representation of the physical type. fn get_type_size() -> usize; + + fn get_column_reader(column_writer: ColumnReader) -> Option> + where + Self: Sized; + + fn get_column_writer(column_writer: ColumnWriter) -> Option> + where + Self: Sized; + + fn get_column_writer_ref( + column_writer: &ColumnWriter, + ) -> Option<&ColumnWriterImpl> + where + Self: Sized; + + fn get_column_writer_mut( + column_writer: &mut ColumnWriter, + ) -> Option<&mut ColumnWriterImpl> + where + Self: Sized; } macro_rules! make_type { - ($name:ident, $physical_ty:path, $native_ty:ty, $size:expr) => { + ($name:ident, $physical_ty:path, $reader_ident: ident, $writer_ident: ident, $native_ty:ty, $size:expr) => { pub struct $name {} impl DataType for $name { @@ -392,27 +414,109 @@ macro_rules! make_type { fn get_type_size() -> usize { $size } + + fn get_column_reader( + column_writer: ColumnReader, + ) -> Option> { + match column_writer { + ColumnReader::$reader_ident(w) => Some(w), + _ => None, + } + } + + fn get_column_writer( + column_writer: ColumnWriter, + ) -> Option> { + match column_writer { + ColumnWriter::$writer_ident(w) => Some(w), + _ => None, + } + } + + fn get_column_writer_ref( + column_writer: &ColumnWriter, + ) -> Option<&ColumnWriterImpl> { + match column_writer { + ColumnWriter::$writer_ident(w) => Some(w), + _ => None, + } + } + + fn get_column_writer_mut( + column_writer: &mut ColumnWriter, + ) -> Option<&mut ColumnWriterImpl> { + match column_writer { + ColumnWriter::$writer_ident(w) => Some(w), + _ => None, + } + } } }; } // Generate struct definitions for all physical types -make_type!(BoolType, Type::BOOLEAN, bool, 1); -make_type!(Int32Type, Type::INT32, i32, 4); -make_type!(Int64Type, Type::INT64, i64, 8); -make_type!(Int96Type, Type::INT96, Int96, mem::size_of::()); -make_type!(FloatType, Type::FLOAT, f32, 4); -make_type!(DoubleType, Type::DOUBLE, f64, 8); +make_type!( + BoolType, + Type::BOOLEAN, + BoolColumnReader, + BoolColumnWriter, + bool, + 1 +); +make_type!( + Int32Type, + Type::INT32, + Int32ColumnReader, + Int32ColumnWriter, + i32, + 4 +); +make_type!( + Int64Type, + Type::INT64, + Int64ColumnReader, + Int64ColumnWriter, + i64, + 8 +); +make_type!( + Int96Type, + Type::INT96, + Int96ColumnReader, + Int96ColumnWriter, + Int96, + mem::size_of::() +); +make_type!( + FloatType, + Type::FLOAT, + FloatColumnReader, + FloatColumnWriter, + f32, + 4 +); +make_type!( + DoubleType, + Type::DOUBLE, + DoubleColumnReader, + DoubleColumnWriter, + f64, + 8 +); make_type!( ByteArrayType, Type::BYTE_ARRAY, + ByteArrayColumnReader, + ByteArrayColumnWriter, ByteArray, mem::size_of::() ); make_type!( FixedLenByteArrayType, Type::FIXED_LEN_BYTE_ARRAY, + FixedLenByteArrayColumnReader, + FixedLenByteArrayColumnWriter, ByteArray, mem::size_of::() ); From d484e17e386f489434a87dab27cb4fc03b73767a Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Mon, 27 Jan 2020 11:26:24 +0100 Subject: [PATCH 2/4] Better error message for typed conversion --- rust/parquet/src/column/reader.rs | 7 ++++++- rust/parquet/src/column/writer.rs | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index a59dba0418f..dc1b07dcc7d 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -93,7 +93,12 @@ pub fn get_column_reader( pub fn get_typed_column_reader( col_reader: ColumnReader, ) -> ColumnReaderImpl { - T::get_column_reader(col_reader).expect("Column type mismatch") + T::get_column_reader(col_reader).unwrap_or_else(|| { + panic!( + "Failed to convert column reader into a typed column reader for `{}` typ", + T::get_physical_type() + ) + }) } /// Typed value reader for a particular primitive column. diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index 575d52dfba9..d6ebd78ef8b 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -102,21 +102,36 @@ pub fn get_column_writer( pub fn get_typed_column_writer( col_writer: ColumnWriter, ) -> ColumnWriterImpl { - T::get_column_writer(col_writer).expect("Column type mismatch") + T::get_column_writer(col_writer).unwrap_or_else(|| { + panic!( + "Failed to convert column writer into a typed column writer for `{}` typ", + T::get_physical_type() + ) + }) } /// Similar to `get_typed_column_writer` but returns a reference. pub fn get_typed_column_writer_ref( col_writer: &ColumnWriter, ) -> &ColumnWriterImpl { - T::get_column_writer_ref(col_writer).expect("Column type mismatch") + T::get_column_writer_ref(col_writer).unwrap_or_else(|| { + panic!( + "Failed to convert column writer into a typed column writer for `{}` typ", + T::get_physical_type() + ) + }) } /// Similar to `get_typed_column_writer` but returns a reference. pub fn get_typed_column_writer_mut( col_writer: &mut ColumnWriter, ) -> &mut ColumnWriterImpl { - T::get_column_writer_mut(col_writer).expect("Column type mismatch") + T::get_column_writer_mut(col_writer).unwrap_or_else(|| { + panic!( + "Failed to convert column writer into a typed column writer for `{}` typ", + T::get_physical_type() + ) + }) } /// Typed column writer for a primitive column. From 9fe6ddfd97772855da495e1c07271d6eba03adbb Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Wed, 5 Feb 2020 13:53:08 +0100 Subject: [PATCH 3/4] Remove unnecessary Send/Sync impls for RecordBatch --- rust/arrow/src/record_batch.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs index 3d5d8f35198..8cfa22515b6 100644 --- a/rust/arrow/src/record_batch.rs +++ b/rust/arrow/src/record_batch.rs @@ -65,7 +65,7 @@ impl RecordBatch { } if columns[i].data_type() != schema.field(i).data_type() { return Err(ArrowError::InvalidArgumentError(format!( - "column types must match schema types, expected {:?} but found {:?} at column index {}", + "column types must match schema types, expected {:?} but found {:?} at column index {}", schema.field(i).data_type(), columns[i].data_type(), i))); @@ -130,9 +130,6 @@ impl Into for RecordBatch { } } -unsafe impl Send for RecordBatch {} -unsafe impl Sync for RecordBatch {} - /// Definition of record batch reader. pub trait RecordBatchReader { /// Returns schemas of this record batch reader. From 97e022a57d9a57f0f973df3f19f0d0ea3b55566a Mon Sep 17 00:00:00 2001 From: Markus Westerlind Date: Wed, 5 Feb 2020 20:51:05 +0100 Subject: [PATCH 4/4] s/typ/type/g --- rust/parquet/src/column/reader.rs | 2 +- rust/parquet/src/column/writer.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/parquet/src/column/reader.rs b/rust/parquet/src/column/reader.rs index dc1b07dcc7d..8dc81ee6653 100644 --- a/rust/parquet/src/column/reader.rs +++ b/rust/parquet/src/column/reader.rs @@ -95,7 +95,7 @@ pub fn get_typed_column_reader( ) -> ColumnReaderImpl { T::get_column_reader(col_reader).unwrap_or_else(|| { panic!( - "Failed to convert column reader into a typed column reader for `{}` typ", + "Failed to convert column reader into a typed column reader for `{}` type", T::get_physical_type() ) }) diff --git a/rust/parquet/src/column/writer.rs b/rust/parquet/src/column/writer.rs index d6ebd78ef8b..6a634b9f467 100644 --- a/rust/parquet/src/column/writer.rs +++ b/rust/parquet/src/column/writer.rs @@ -104,7 +104,7 @@ pub fn get_typed_column_writer( ) -> ColumnWriterImpl { T::get_column_writer(col_writer).unwrap_or_else(|| { panic!( - "Failed to convert column writer into a typed column writer for `{}` typ", + "Failed to convert column writer into a typed column writer for `{}` type", T::get_physical_type() ) }) @@ -116,7 +116,7 @@ pub fn get_typed_column_writer_ref( ) -> &ColumnWriterImpl { T::get_column_writer_ref(col_writer).unwrap_or_else(|| { panic!( - "Failed to convert column writer into a typed column writer for `{}` typ", + "Failed to convert column writer into a typed column writer for `{}` type", T::get_physical_type() ) }) @@ -128,7 +128,7 @@ pub fn get_typed_column_writer_mut( ) -> &mut ColumnWriterImpl { T::get_column_writer_mut(col_writer).unwrap_or_else(|| { panic!( - "Failed to convert column writer into a typed column writer for `{}` typ", + "Failed to convert column writer into a typed column writer for `{}` type", T::get_physical_type() ) })