Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions rust/arrow/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ impl RecordBatch {
}
if columns[i].data_type() != schema.field(i).data_type() {
return Err(ArrowError::InvalidArgumentError(format!(
"column types must match schema types, expected {:?} but found {:?} at column index {}",
"column types must match schema types, expected {:?} but found {:?} at column index {}",
schema.field(i).data_type(),
columns[i].data_type(),
i)));
Expand Down Expand Up @@ -130,9 +130,6 @@ impl Into<StructArray> for RecordBatch {
}
}

unsafe impl Send for RecordBatch {}
unsafe impl Sync for RecordBatch {}

/// Definition of record batch reader.
pub trait RecordBatchReader {
/// Returns schemas of this record batch reader.
Expand Down
20 changes: 7 additions & 13 deletions rust/parquet/src/column/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
use std::{
cmp::{max, min},
collections::HashMap,
mem,
};

use super::page::{Page, PageReader};
Expand Down Expand Up @@ -90,21 +89,16 @@ pub fn get_column_reader(
/// Gets a typed column reader for the specific type `T`, by "up-casting" `col_reader` of
/// non-generic type to a generic column reader type `ColumnReaderImpl`.
///
/// NOTE: the caller MUST guarantee that the actual enum value for `col_reader` matches
/// the type `T`. Otherwise, disastrous consequence could happen.
/// Panics if actual enum value for `col_reader` does not match the type `T`.
pub fn get_typed_column_reader<T: DataType>(
col_reader: ColumnReader,
) -> ColumnReaderImpl<T> {
match col_reader {
ColumnReader::BoolColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int32ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int64ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::Int96ColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::FloatColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::DoubleColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::ByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
ColumnReader::FixedLenByteArrayColumnReader(r) => unsafe { mem::transmute(r) },
}
T::get_column_reader(col_reader).unwrap_or_else(|| {
panic!(
"Failed to convert column reader into a typed column reader for `{}` type",
T::get_physical_type()
)
})
}

/// Typed value reader for a particular primitive column.
Expand Down
57 changes: 20 additions & 37 deletions rust/parquet/src/column/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

//! Contains column writer API.

use std::{cmp, collections::VecDeque, mem, rc::Rc};
use std::{cmp, collections::VecDeque, rc::Rc};

use crate::basic::{Compression, Encoding, PageType, Type};
use crate::column::page::{CompressedPage, Page, PageWriteSpec, PageWriter};
Expand Down Expand Up @@ -98,57 +98,40 @@ pub fn get_column_writer(
/// Gets a typed column writer for the specific type `T`, by "up-casting" `col_writer` of
/// non-generic type to a generic column writer type `ColumnWriterImpl`.
///
/// NOTE: the caller MUST guarantee that the actual enum value for `col_writer` matches
/// the type `T`. Otherwise, disastrous consequence could happen.
/// Panics if actual enum value for `col_writer` does not match the type `T`.
pub fn get_typed_column_writer<T: DataType>(
col_writer: ColumnWriter,
) -> ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(r) => unsafe { mem::transmute(r) },
}
T::get_column_writer(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Similar to `get_typed_column_writer` but returns a reference.
pub fn get_typed_column_writer_ref<T: DataType>(
col_writer: &ColumnWriter,
) -> &ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(ref r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(ref r) => unsafe {
mem::transmute(r)
},
}
T::get_column_writer_ref(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Similar to `get_typed_column_writer` but returns a reference.
pub fn get_typed_column_writer_mut<T: DataType>(
col_writer: &mut ColumnWriter,
) -> &mut ColumnWriterImpl<T> {
match col_writer {
ColumnWriter::BoolColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int32ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int64ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::Int96ColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::FloatColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::DoubleColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::ByteArrayColumnWriter(ref mut r) => unsafe { mem::transmute(r) },
ColumnWriter::FixedLenByteArrayColumnWriter(ref mut r) => unsafe {
mem::transmute(r)
},
}
T::get_column_writer_mut(col_writer).unwrap_or_else(|| {
panic!(
"Failed to convert column writer into a typed column writer for `{}` type",
T::get_physical_type()
)
})
}

/// Typed column writer for a primitive column.
Expand Down
118 changes: 111 additions & 7 deletions rust/parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ use std::mem;
use byteorder::{BigEndian, ByteOrder};

use crate::basic::Type;
use crate::column::reader::{ColumnReader, ColumnReaderImpl};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That could result in a circular dependency, we should not be doing that, or we should at least factor out the interfaces and structs there.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rust handles circular, inter crate dependencies fine 🤷‍♂

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It still does not mean we should be doing it.

use crate::column::writer::{ColumnWriter, ColumnWriterImpl};
use crate::errors::{ParquetError, Result};
use crate::util::memory::{ByteBuffer, ByteBufferPtr};
use std::str::from_utf8;
Expand Down Expand Up @@ -376,10 +378,30 @@ pub trait DataType: 'static {

/// Returns size in bytes for Rust representation of the physical type.
fn get_type_size() -> usize;

fn get_column_reader(column_writer: ColumnReader) -> Option<ColumnReaderImpl<Self>>
where
Self: Sized;

fn get_column_writer(column_writer: ColumnWriter) -> Option<ColumnWriterImpl<Self>>
where
Self: Sized;

fn get_column_writer_ref(
column_writer: &ColumnWriter,
) -> Option<&ColumnWriterImpl<Self>>
where
Self: Sized;

fn get_column_writer_mut(
column_writer: &mut ColumnWriter,
) -> Option<&mut ColumnWriterImpl<Self>>
where
Self: Sized;
}

macro_rules! make_type {
($name:ident, $physical_ty:path, $native_ty:ty, $size:expr) => {
($name:ident, $physical_ty:path, $reader_ident: ident, $writer_ident: ident, $native_ty:ty, $size:expr) => {
pub struct $name {}

impl DataType for $name {
Expand All @@ -392,27 +414,109 @@ macro_rules! make_type {
fn get_type_size() -> usize {
$size
}

fn get_column_reader(
column_writer: ColumnReader,
) -> Option<ColumnReaderImpl<Self>> {
match column_writer {
ColumnReader::$reader_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer(
column_writer: ColumnWriter,
) -> Option<ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer_ref(
column_writer: &ColumnWriter,
) -> Option<&ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}

fn get_column_writer_mut(
column_writer: &mut ColumnWriter,
) -> Option<&mut ColumnWriterImpl<Self>> {
match column_writer {
ColumnWriter::$writer_ident(w) => Some(w),
_ => None,
}
}
}
};
}

// Generate struct definitions for all physical types

make_type!(BoolType, Type::BOOLEAN, bool, 1);
make_type!(Int32Type, Type::INT32, i32, 4);
make_type!(Int64Type, Type::INT64, i64, 8);
make_type!(Int96Type, Type::INT96, Int96, mem::size_of::<Int96>());
make_type!(FloatType, Type::FLOAT, f32, 4);
make_type!(DoubleType, Type::DOUBLE, f64, 8);
make_type!(
BoolType,
Type::BOOLEAN,
BoolColumnReader,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure about this. We generate column readers and writers when making types?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It generates the unpacking functions needed to unpack the enum. I don't know of another, better way of doing this so 🤷‍♂

Copy link
Contributor

@sadikovi sadikovi Jan 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, this is just an odd place to put them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Marwes@eb1ba90 is the only alternative I know which I would argue is a lot more verbose for no gain.

BoolColumnWriter,
bool,
1
);
make_type!(
Int32Type,
Type::INT32,
Int32ColumnReader,
Int32ColumnWriter,
i32,
4
);
make_type!(
Int64Type,
Type::INT64,
Int64ColumnReader,
Int64ColumnWriter,
i64,
8
);
make_type!(
Int96Type,
Type::INT96,
Int96ColumnReader,
Int96ColumnWriter,
Int96,
mem::size_of::<Int96>()
);
make_type!(
FloatType,
Type::FLOAT,
FloatColumnReader,
FloatColumnWriter,
f32,
4
);
make_type!(
DoubleType,
Type::DOUBLE,
DoubleColumnReader,
DoubleColumnWriter,
f64,
8
);
make_type!(
ByteArrayType,
Type::BYTE_ARRAY,
ByteArrayColumnReader,
ByteArrayColumnWriter,
ByteArray,
mem::size_of::<ByteArray>()
);
make_type!(
FixedLenByteArrayType,
Type::FIXED_LEN_BYTE_ARRAY,
FixedLenByteArrayColumnReader,
FixedLenByteArrayColumnWriter,
ByteArray,
mem::size_of::<ByteArray>()
);
Expand Down