From a70b8d51796c8657a2aa6dfb6b90d922570f8ec4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 18 Feb 2020 02:45:14 -0600 Subject: [PATCH 01/15] FeatherV2 Compiling again Draft initial implementation of Feather V2, consolidate files Refactor unit tests to test V1 and V2, not all passing yet Port Feather Python tests to use pytest fixture for version, get tests passing Update R bindings, remove methods and writer class removed in C++ Update feather.fbs --- cpp/src/arrow/ipc/feather.cc | 1030 ++++++++++++-------------- cpp/src/arrow/ipc/feather.fbs | 6 + cpp/src/arrow/ipc/feather.h | 118 +-- cpp/src/arrow/ipc/feather_internal.h | 233 ------ cpp/src/arrow/ipc/feather_test.cc | 539 ++++---------- cpp/src/arrow/table.cc | 4 +- python/pyarrow/feather.pxi | 93 +-- python/pyarrow/feather.py | 162 ++-- python/pyarrow/includes/libarrow.pxd | 35 +- python/pyarrow/tests/test_feather.py | 819 ++++++++++---------- r/NAMESPACE | 3 +- r/R/arrowExports.R | 64 +- r/R/feather.R | 84 +-- r/_pkgdown.yml | 3 +- r/src/arrowExports.cpp | 252 +------ r/src/feather.cpp | 115 +-- 16 files changed, 1247 insertions(+), 2313 deletions(-) delete mode 100644 cpp/src/arrow/ipc/feather_internal.h diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 9a324a07757..ffa360454be 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -17,11 +17,13 @@ #include "arrow/ipc/feather.h" +#include #include #include #include #include // IWYU pragma: keep #include +#include #include #include @@ -30,27 +32,38 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/io/interfaces.h" -#include "arrow/ipc/feather_internal.h" -#include "arrow/ipc/util.h" // IWYU pragma: keep +#include "arrow/ipc/metadata_internal.h" +#include "arrow/ipc/options.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/util.h" +#include "arrow/ipc/writer.h" #include "arrow/result.h" #include "arrow/status.h" -#include "arrow/table.h" // IWYU pragma: keep +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -#include "arrow/visitor.h" +#include "arrow/util/make_unique.h" +#include "arrow/visitor_inline.h" #include "generated/feather_generated.h" namespace arrow { using internal::checked_cast; +using internal::make_unique; + +class ExtensionType; namespace ipc { namespace feather { +typedef flatbuffers::FlatBufferBuilder FBB; + +static constexpr const char* kFeatherV1MagicBytes = "FEA1"; +static constexpr const int kFeatherDefaultAlignment = 8; static const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0}; static inline int64_t PaddedLength(int64_t nbytes) { @@ -58,32 +71,9 @@ static inline int64_t PaddedLength(int64_t nbytes) { return ((nbytes + alignment - 1) / alignment) * alignment; } -// XXX: Hack for Feather 0.3.0 for backwards compatibility with old files -// Size in-file of written byte buffer -static int64_t GetOutputLength(int64_t nbytes) { - if (kFeatherVersion < 2) { - // Feather files < 0.3.0 - return nbytes; - } else { - return PaddedLength(nbytes); - } -} - -static Status WritePadded(io::OutputStream* stream, const uint8_t* data, int64_t length, - int64_t* bytes_written) { - RETURN_NOT_OK(stream->Write(data, length)); - - int64_t remainder = PaddedLength(length) - length; - if (remainder != 0) { - RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); - } - *bytes_written = length + remainder; - return Status::OK(); -} - -static Status 
WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* data, - int64_t bit_offset, const int64_t length, - int64_t* bytes_written) { +Status WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* data, + int64_t bit_offset, const int64_t length, + int64_t* bytes_written) { data = data + bit_offset / 8; uint8_t bit_shift = static_cast(bit_offset % 8); if (bit_offset == 0) { @@ -118,6 +108,19 @@ static Status WritePaddedWithOffset(io::OutputStream* stream, const uint8_t* dat return Status::OK(); } +Status WritePadded(io::OutputStream* stream, const uint8_t* data, int64_t length, + int64_t* bytes_written) { + return WritePaddedWithOffset(stream, data, /*bit_offset=*/0, length, bytes_written); +} + +struct ColumnType { + enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; +}; + +static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { + return static_cast(static_cast(unit)); +} + /// For compatibility, we need to write any data sometimes just to keep producing /// files that can be read with an older reader. static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, @@ -126,7 +129,6 @@ static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, for (int64_t i = 0; i < length; i++) { RETURN_NOT_OK(stream->Write(&null, 1)); } - int64_t remainder = PaddedLength(length) - length; if (remainder != 0) { RETURN_NOT_OK(stream->Write(kPaddingBytes, remainder)); @@ -136,168 +138,21 @@ static Status WritePaddedBlank(io::OutputStream* stream, int64_t length, } // ---------------------------------------------------------------------- -// TableBuilder - -TableBuilder::TableBuilder(int64_t num_rows) : finished_(false), num_rows_(num_rows) {} - -FBB& TableBuilder::fbb() { return fbb_; } - -Status TableBuilder::Finish() { - if (finished_) { - return Status::Invalid("can only call this once"); - } - - FBString desc = 0; - if (!description_.empty()) { - desc = fbb_.CreateString(description_); - } - - flatbuffers::Offset metadata = 0; - - auto root = fbs::CreateCTable(fbb_, desc, num_rows_, fbb_.CreateVector(columns_), - kFeatherVersion, metadata); - fbb_.Finish(root); - finished_ = true; - - return Status::OK(); -} - -std::shared_ptr TableBuilder::GetBuffer() const { - return std::make_shared(fbb_.GetBufferPointer(), - static_cast(fbb_.GetSize())); -} - -void TableBuilder::SetDescription(const std::string& description) { - description_ = description; -} - -void TableBuilder::SetNumRows(int64_t num_rows) { num_rows_ = num_rows; } - -void TableBuilder::add_column(const flatbuffers::Offset& col) { - columns_.push_back(col); -} - -ColumnBuilder::ColumnBuilder(TableBuilder* parent, const std::string& name) - : parent_(parent) { - fbb_ = &parent->fbb(); - name_ = name; - type_ = ColumnType::PRIMITIVE; - meta_time_.unit = TimeUnit::SECOND; -} - -flatbuffers::Offset ColumnBuilder::CreateColumnMetadata() { - switch (type_) { - case ColumnType::PRIMITIVE: - // flatbuffer void - return 0; - case ColumnType::CATEGORY: { - auto cat_meta = fbs::CreateCategoryMetadata( - fbb(), GetPrimitiveArray(fbb(), meta_category_.levels), meta_category_.ordered); - return cat_meta.Union(); - } - case ColumnType::TIMESTAMP: { - // flatbuffer void - flatbuffers::Offset tz = 0; - if (!meta_timestamp_.timezone.empty()) { - tz = fbb().CreateString(meta_timestamp_.timezone); - } - - auto ts_meta = - fbs::CreateTimestampMetadata(fbb(), ToFlatbufferEnum(meta_timestamp_.unit), tz); - return ts_meta.Union(); - } - case ColumnType::DATE: { - auto date_meta = fbs::CreateDateMetadata(fbb()); 
- return date_meta.Union(); - } - case ColumnType::TIME: { - auto time_meta = fbs::CreateTimeMetadata(fbb(), ToFlatbufferEnum(meta_time_.unit)); - return time_meta.Union(); - } - default: - // null - return flatbuffers::Offset(); - } -} - -Status ColumnBuilder::Finish() { - FBB& buf = fbb(); - - // values - auto values = GetPrimitiveArray(buf, values_); - flatbuffers::Offset metadata = CreateColumnMetadata(); - - auto column = fbs::CreateColumn(buf, buf.CreateString(name_), values, - ToFlatbufferEnum(type_), // metadata_type - metadata, buf.CreateString(user_metadata_)); - - // bad coupling, but OK for now - parent_->add_column(column); - return Status::OK(); -} - -void ColumnBuilder::SetValues(const ArrayMetadata& values) { values_ = values; } - -void ColumnBuilder::SetUserMetadata(const std::string& data) { user_metadata_ = data; } +// ReaderV1 -void ColumnBuilder::SetCategory(const ArrayMetadata& levels, bool ordered) { - type_ = ColumnType::CATEGORY; - meta_category_.levels = levels; - meta_category_.ordered = ordered; -} - -void ColumnBuilder::SetTimestamp(TimeUnit::type unit) { - type_ = ColumnType::TIMESTAMP; - meta_timestamp_.unit = unit; -} - -void ColumnBuilder::SetTimestamp(TimeUnit::type unit, const std::string& timezone) { - SetTimestamp(unit); - meta_timestamp_.timezone = timezone; -} - -void ColumnBuilder::SetDate() { type_ = ColumnType::DATE; } - -void ColumnBuilder::SetTime(TimeUnit::type unit) { - type_ = ColumnType::TIME; - meta_time_.unit = unit; -} - -FBB& ColumnBuilder::fbb() { return *fbb_; } - -std::unique_ptr TableBuilder::AddColumn(const std::string& name) { - return std::unique_ptr(new ColumnBuilder(this, name)); -} - -// ---------------------------------------------------------------------- -// reader.cc - -class TableReader::TableReaderImpl { +class ReaderV1 : public Reader { public: - TableReaderImpl() {} - Status Open(const std::shared_ptr& source) { source_ = source; - int magic_size = static_cast(strlen(kFeatherMagicBytes)); - int footer_size = magic_size + static_cast(sizeof(uint32_t)); - - // Pathological issue where the file is smaller than ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize()); - if (size < magic_size + footer_size) { - return Status::Invalid("File is too small to be a well-formed file"); - } - - ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(0, magic_size)); - - if (memcmp(buffer->data(), kFeatherMagicBytes, magic_size)) { - return Status::Invalid("Not a feather file"); - } + int magic_size = static_cast(strlen(kFeatherV1MagicBytes)); + int footer_size = magic_size + static_cast(sizeof(uint32_t)); // Now get the footer and verify - ARROW_ASSIGN_OR_RAISE(buffer, source->ReadAt(size - footer_size, footer_size)); + ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(size - footer_size, footer_size)); - if (memcmp(buffer->data() + sizeof(uint32_t), kFeatherMagicBytes, magic_size)) { + if (memcmp(buffer->data() + sizeof(uint32_t), kFeatherV1MagicBytes, magic_size)) { return Status::Invalid("Feather file footer incomplete"); } @@ -306,15 +161,28 @@ class TableReader::TableReaderImpl { return Status::Invalid("File is smaller than indicated metadata size"); } ARROW_ASSIGN_OR_RAISE( - buffer, source->ReadAt(size - footer_size - metadata_length, metadata_length)); + metadata_buffer_, + source->ReadAt(size - footer_size - metadata_length, metadata_length)); - metadata_.reset(new TableMetadata()); - return metadata_->Open(buffer); + metadata_ = fbs::GetCTable(metadata_buffer_->data()); + return ReadSchema(); + } + + Status ReadSchema() { + 
std::vector> fields; + for (int i = 0; i < static_cast(metadata_->columns()->size()); ++i) { + const fbs::Column* col = metadata_->columns()->Get(i); + std::shared_ptr type; + RETURN_NOT_OK( + GetDataType(col->values(), col->metadata_type(), col->metadata(), &type)); + fields.push_back(::arrow::field(col->name()->str(), type)); + } + schema_ = ::arrow::schema(fields); + return Status::OK(); } Status GetDataType(const fbs::PrimitiveArray* values, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out, - std::shared_ptr* out_dictionary = nullptr) { + const void* metadata, std::shared_ptr* out) { #define PRIMITIVE_CASE(CAP_TYPE, FACTORY_FUNC) \ case fbs::Type::CAP_TYPE: \ *out = FACTORY_FUNC(); \ @@ -324,13 +192,11 @@ class TableReader::TableReaderImpl { case fbs::TypeMetadata::CategoryMetadata: { auto meta = static_cast(metadata); - std::shared_ptr index_type; + std::shared_ptr index_type, dict_type; RETURN_NOT_OK(GetDataType(values, fbs::TypeMetadata::NONE, nullptr, &index_type)); - RETURN_NOT_OK( - LoadValues(meta->levels(), fbs::TypeMetadata::NONE, nullptr, out_dictionary)); - - *out = dictionary(index_type, (*out_dictionary)->type(), meta->ordered()); + GetDataType(meta->levels(), fbs::TypeMetadata::NONE, nullptr, &dict_type)); + *out = dictionary(index_type, dict_type, meta->ordered()); break; } case fbs::TypeMetadata::TimestampMetadata: { @@ -380,16 +246,24 @@ class TableReader::TableReaderImpl { return Status::OK(); } + int64_t GetOutputLength(int64_t nbytes) { + // XXX: Hack for Feather 0.3.0 for backwards compatibility with old files + // Size in-file of written byte buffer + if (version() < 2) { + // Feather files < 0.3.0 + return nbytes; + } else { + return PaddedLength(nbytes); + } + } + // Retrieve a primitive array from the data source // // @returns: a Buffer instance, the precise type will depend on the kind of // input data source (which may or may not have memory-map like semantics) - Status LoadValues(const fbs::PrimitiveArray* meta, fbs::TypeMetadata metadata_type, - const void* metadata, std::shared_ptr* out) { - std::shared_ptr type; - std::shared_ptr dictionary; - RETURN_NOT_OK(GetDataType(meta, metadata_type, metadata, &type, &dictionary)); - + Status LoadValues(std::shared_ptr type, const fbs::PrimitiveArray* meta, + fbs::TypeMetadata metadata_type, const void* metadata, + std::shared_ptr* out) { std::vector> buffers; // Buffer data from the source (may or may not perform a copy depending on @@ -399,6 +273,11 @@ class TableReader::TableReaderImpl { int64_t offset = 0; + if (type->id() == Type::DICTIONARY) { + // Load the index type values + type = checked_cast(*type).index_type(); + } + // If there are nulls, the null bitmask is first if (meta->null_count() > 0) { int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(meta->length())); @@ -420,181 +299,131 @@ class TableReader::TableReaderImpl { buffers.push_back(SliceBuffer(buffer, offset, buffer->size() - offset)); - auto arr_data = - ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count()); - arr_data->dictionary = dictionary; - *out = MakeArray(arr_data); + *out = ArrayData::Make(type, meta->length(), std::move(buffers), meta->null_count()); return Status::OK(); } - bool HasDescription() const { return metadata_->HasDescription(); } + int version() const override { return metadata_->version(); } + int64_t num_rows() const { return metadata_->num_rows(); } - std::string GetDescription() const { return metadata_->GetDescription(); } + std::shared_ptr schema() 
<Schema> const override { return schema_; }

- int version() const { return metadata_->version(); }
- int64_t num_rows() const { return metadata_->num_rows(); }
- int64_t num_columns() const { return metadata_->num_columns(); }
+ Status GetDictionary(int field_index, std::shared_ptr<Array>* out) {
+ const fbs::Column* col_meta = metadata_->columns()->Get(field_index);
+ auto dict_meta = col_meta->metadata_as<fbs::CategoryMetadata>();
+ const auto& dict_type =
+ checked_cast<const DictionaryType&>(*schema_->field(field_index)->type());

- std::string GetColumnName(int i) const {
- const fbs::Column* col_meta = metadata_->column(i);
- return col_meta->name()->str();
+ std::shared_ptr<ArrayData> out_data;
+ RETURN_NOT_OK(LoadValues(dict_type.value_type(), dict_meta->levels(),
+ fbs::TypeMetadata::NONE, nullptr, &out_data));
+ *out = MakeArray(out_data);
+ return Status::OK();
}

- Status GetColumn(int i, std::shared_ptr<ChunkedArray>* out) {
- const fbs::Column* col_meta = metadata_->column(i);
+ Status GetColumn(int field_index, std::shared_ptr<ChunkedArray>* out) {
+ const fbs::Column* col_meta = metadata_->columns()->Get(field_index);
+ std::shared_ptr<ArrayData> data;

- // auto user_meta = column->user_metadata();
- // if (user_meta->size() > 0) { user_metadata_ = user_meta->str(); }
+ auto type = schema_->field(field_index)->type();
+ RETURN_NOT_OK(LoadValues(type, col_meta->values(), col_meta->metadata_type(),
+ col_meta->metadata(), &data));

- std::shared_ptr<Array> values;
- RETURN_NOT_OK(LoadValues(col_meta->values(), col_meta->metadata_type(),
- col_meta->metadata(), &values));
- *out = std::make_shared<ChunkedArray>(values);
+ if (type->id() == Type::DICTIONARY) {
+ RETURN_NOT_OK(GetDictionary(field_index, &data->dictionary));
+ data->type = type;
+ }
+ *out = std::make_shared<ChunkedArray>(MakeArray(data));
 return Status::OK();
 }

- Status Read(std::shared_ptr<Table>
* out) override {
 std::vector<std::shared_ptr<ChunkedArray>> columns;
- for (int i = 0; i < num_columns(); ++i) {
- std::shared_ptr<ChunkedArray> column;
- RETURN_NOT_OK(GetColumn(i, &column));
- columns.push_back(column);
- fields.push_back(::arrow::field(GetColumnName(i), column->type()));
+ for (int i = 0; i < static_cast<int>(metadata_->columns()->size()); ++i) {
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(i, &columns.back()));
 }
- *out = Table::Make(schema(fields), columns);
+ *out = Table::Make(this->schema(), columns, this->num_rows());
 return Status::OK();
 }

- Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) {
+ Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) override {
 std::vector<std::shared_ptr<Field>> fields;
 std::vector<std::shared_ptr<ChunkedArray>> columns;
- for (int i = 0; i < num_columns(); ++i) {
- bool found = false;
- for (auto j : indices) {
- if (i == j) {
- found = true;
- break;
- }
- }
- if (!found) {
- continue;
+
+ auto my_schema = this->schema();
+ for (auto field_index : indices) {
+ if (field_index < 0 || field_index >= my_schema->num_fields()) {
+ return Status::Invalid("Field index ", field_index, " is out of bounds");
 }
- std::shared_ptr<ChunkedArray> column;
- RETURN_NOT_OK(GetColumn(i, &column));
- columns.push_back(column);
- fields.push_back(::arrow::field(GetColumnName(i), column->type()));
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
+ fields.push_back(my_schema->field(field_index));
 }
- *out = Table::Make(schema(fields), columns);
+ *out = Table::Make(::arrow::schema(fields), columns, this->num_rows());
 return Status::OK();
 }

- Status Read(const std::vector<std::string>& names, std::shared_ptr<Table>
* out) {
+ Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>
* out) override {
 std::vector<std::shared_ptr<Field>> fields;
 std::vector<std::shared_ptr<ChunkedArray>> columns;
- for (int i = 0; i < num_columns(); ++i) {
- auto name = GetColumnName(i);
- bool found = false;
- for (auto& n : names) {
- if (name == n) {
- found = true;
- break;
- }
- }
- if (!found) {
- continue;
+
+ std::shared_ptr<Schema> sch = this->schema();
+ for (auto name : names) {
+ int field_index = sch->GetFieldIndex(name);
+ if (field_index == -1) {
+ return Status::Invalid("Field named ", name, " is not found");
 }
- std::shared_ptr<ChunkedArray> column;
- RETURN_NOT_OK(GetColumn(i, &column));
- columns.push_back(column);
- fields.push_back(::arrow::field(name, column->type()));
+ columns.emplace_back();
+ RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
+ fields.push_back(sch->field(field_index));
 }
- *out = Table::Make(schema(fields), columns);
+ *out = Table::Make(::arrow::schema(fields), columns, this->num_rows());
 return Status::OK();
 }

 private:
 std::shared_ptr<io::RandomAccessFile> source_;
- std::unique_ptr<TableMetadata> metadata_;
-
+ std::shared_ptr<Buffer> metadata_buffer_;
+ const fbs::CTable* metadata_;
 std::shared_ptr<Schema> schema_;
};

// ----------------------------------------------------------------------
-// TableReader public API
-
-TableReader::TableReader() { impl_.reset(new TableReaderImpl()); }
-
-TableReader::~TableReader() {}
-
-Status TableReader::Open(const std::shared_ptr<io::RandomAccessFile>& source,
- std::unique_ptr<TableReader>* out) {
- out->reset(new TableReader());
- return (*out)->impl_->Open(source);
-}
-
-bool TableReader::HasDescription() const { return impl_->HasDescription(); }
-
-std::string TableReader::GetDescription() const { return impl_->GetDescription(); }
-
-int TableReader::version() const { return impl_->version(); }
-
-int64_t TableReader::num_rows() const { return impl_->num_rows(); }
-
-int64_t TableReader::num_columns() const { return impl_->num_columns(); }
-
-std::string TableReader::GetColumnName(int i) const { return impl_->GetColumnName(i); }
-
-Status TableReader::GetColumn(int i, std::shared_ptr<ChunkedArray>* out) {
- return impl_->GetColumn(i, out);
-}
-
-Status TableReader::Read(std::shared_ptr<Table>
* out) { return impl_->Read(out); }

-Status TableReader::Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) {
- return impl_->Read(indices, out);
-}

-Status TableReader::Read(const std::vector<std::string>& names,
- std::shared_ptr<Table>
* out) { - return impl_->Read(names, out); -} +// WriterV1 + +struct ArrayMetadata { + fbs::Type type; + int64_t offset; + int64_t length; + int64_t null_count; + int64_t total_bytes; +}; -// ---------------------------------------------------------------------- -// writer.cc - -fbs::Type ToFlatbufferType(Type::type type) { - switch (type) { - case Type::BOOL: - return fbs::Type::BOOL; - case Type::INT8: - return fbs::Type::INT8; - case Type::INT16: - return fbs::Type::INT16; - case Type::INT32: - return fbs::Type::INT32; - case Type::INT64: - return fbs::Type::INT64; - case Type::UINT8: - return fbs::Type::UINT8; - case Type::UINT16: - return fbs::Type::UINT16; - case Type::UINT32: - return fbs::Type::UINT32; - case Type::UINT64: - return fbs::Type::UINT64; - case Type::FLOAT: - return fbs::Type::FLOAT; - case Type::DOUBLE: - return fbs::Type::DOUBLE; +#define TO_FLATBUFFER_CASE(TYPE) \ + case Type::TYPE: \ + return fbs::Type::TYPE; + +Result ToFlatbufferType(const DataType& type) { + switch (type.id()) { + TO_FLATBUFFER_CASE(BOOL); + TO_FLATBUFFER_CASE(INT8); + TO_FLATBUFFER_CASE(INT16); + TO_FLATBUFFER_CASE(INT32); + TO_FLATBUFFER_CASE(INT64); + TO_FLATBUFFER_CASE(UINT8); + TO_FLATBUFFER_CASE(UINT16); + TO_FLATBUFFER_CASE(UINT32); + TO_FLATBUFFER_CASE(UINT64); + TO_FLATBUFFER_CASE(FLOAT); + TO_FLATBUFFER_CASE(DOUBLE); + TO_FLATBUFFER_CASE(LARGE_BINARY); + TO_FLATBUFFER_CASE(BINARY); case Type::STRING: return fbs::Type::UTF8; - case Type::BINARY: - return fbs::Type::BINARY; case Type::LARGE_STRING: return fbs::Type::LARGE_UTF8; - case Type::LARGE_BINARY: - return fbs::Type::LARGE_BINARY; case Type::DATE32: return fbs::Type::INT32; case Type::TIMESTAMP: @@ -604,292 +433,383 @@ fbs::Type ToFlatbufferType(Type::type type) { case Type::TIME64: return fbs::Type::INT64; default: - DCHECK(false) << "Cannot reach this code"; + return Status::TypeError("Unsupported Feather V1 type: ", type.ToString()); } - // prevent compiler warning - return fbs::Type::MIN; } -static Status SanitizeUnsupportedTypes(const Array& values, std::shared_ptr* out) { - if (values.type_id() == Type::NA) { - // As long as R doesn't support NA, we write this as a StringColumn - // to ensure stable roundtrips. 
- *out = std::make_shared(values.length(), nullptr, nullptr, - values.null_bitmap(), values.null_count()); - return Status::OK(); - } else { - *out = MakeArray(values.data()); - return Status::OK(); - } +static inline flatbuffers::Offset GetPrimitiveArray( + FBB& fbb, const ArrayMetadata& array) { + return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset, + array.length, array.null_count, array.total_bytes); } -class TableWriter::TableWriterImpl : public ArrayVisitor { - public: - TableWriterImpl() : initialized_stream_(false), metadata_(0) {} - - Status Open(const std::shared_ptr& stream) { - stream_ = stream; - return Status::OK(); - } - - void SetDescription(const std::string& desc) { metadata_.SetDescription(desc); } - - void SetNumRows(int64_t num_rows) { metadata_.SetNumRows(num_rows); } - - Status Finalize() { - RETURN_NOT_OK(CheckStarted()); - RETURN_NOT_OK(metadata_.Finish()); +// Convert Feather enums to Flatbuffer enums +static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { + return static_cast(static_cast(unit)); +} - auto buffer = metadata_.GetBuffer(); +const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { + fbs::TypeMetadata::NONE, // PRIMITIVE + fbs::TypeMetadata::CategoryMetadata, // CATEGORY + fbs::TypeMetadata::TimestampMetadata, // TIMESTAMP + fbs::TypeMetadata::DateMetadata, // DATE + fbs::TypeMetadata::TimeMetadata // TIME +}; - // Writer metadata - int64_t bytes_written; - RETURN_NOT_OK( - WritePadded(stream_.get(), buffer->data(), buffer->size(), &bytes_written)); - uint32_t buffer_size = static_cast(bytes_written); +static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { + return COLUMN_TYPE_ENUM_MAPPING[column_type]; +} - // Footer: metadata length, magic bytes - RETURN_NOT_OK(stream_->Write(&buffer_size, sizeof(uint32_t))); - return stream_->Write(kFeatherMagicBytes, strlen(kFeatherMagicBytes)); - } +struct ColumnMetadata { + flatbuffers::Offset WriteMetadata(FBB& fbb) { // NOLINT + switch (this->meta_type) { + case ColumnType::PRIMITIVE: + // flatbuffer void + return 0; + case ColumnType::CATEGORY: { + auto cat_meta = fbs::CreateCategoryMetadata( + fbb, GetPrimitiveArray(fbb, this->category_levels), this->category_ordered); + return cat_meta.Union(); + } + case ColumnType::TIMESTAMP: { + // flatbuffer void + flatbuffers::Offset tz = 0; + if (!this->timezone.empty()) { + tz = fbb.CreateString(this->timezone); + } - Status LoadArrayMetadata(const Array& values, ArrayMetadata* meta) { - if (!(is_primitive(values.type_id()) || is_binary_like(values.type_id()) || - is_large_binary_like(values.type_id()))) { - return Status::Invalid("Array is not primitive type: ", values.type()->ToString()); + auto ts_meta = + fbs::CreateTimestampMetadata(fbb, ToFlatbufferEnum(this->temporal_unit), tz); + return ts_meta.Union(); + } + case ColumnType::DATE: { + auto date_meta = fbs::CreateDateMetadata(fbb); + return date_meta.Union(); + } + case ColumnType::TIME: { + auto time_meta = + fbs::CreateTimeMetadata(fbb, ToFlatbufferEnum(this->temporal_unit)); + return time_meta.Union(); + } + default: + // null + DCHECK(false); + return 0; } + } - meta->type = ToFlatbufferType(values.type_id()); + ArrayMetadata values; + ColumnType::type meta_type; - ARROW_ASSIGN_OR_RAISE(meta->offset, stream_->Tell()); + ArrayMetadata category_levels; + bool category_ordered; - meta->length = values.length(); - meta->null_count = values.null_count(); - meta->total_bytes = 0; + TimeUnit::type temporal_unit; - return Status::OK(); - } - - 
template - Status WriteBinaryArray(const ArrayType& values, ArrayMetadata* meta, - const uint8_t** values_buffer, int64_t* values_bytes, - int64_t* bytes_written) { - using offset_type = typename ArrayType::offset_type; + // A timezone name known to the Olson timezone database. For display purposes + // because the actual data is all UTC + std::string timezone; +}; - int64_t offset_bytes = sizeof(offset_type) * (values.length() + 1); +Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta); - if (values.value_offsets()) { - *values_bytes = values.raw_value_offsets()[values.length()]; +struct ArrayWriterV1 { + const Array& values; + io::OutputStream* dst; + ArrayMetadata* meta; - // Write the variable-length offsets - RETURN_NOT_OK(WritePadded( - stream_.get(), reinterpret_cast(values.raw_value_offsets()), - offset_bytes, bytes_written)); + Status WriteBuffer(const uint8_t* buffer, int64_t length, int64_t bit_offset) { + int64_t bytes_written = 0; + if (buffer) { + RETURN_NOT_OK( + WritePaddedWithOffset(dst, buffer, bit_offset, length, &bytes_written)); } else { - RETURN_NOT_OK(WritePaddedBlank(stream_.get(), offset_bytes, bytes_written)); + RETURN_NOT_OK(WritePaddedBlank(dst, length, &bytes_written)); } - meta->total_bytes += *bytes_written; + meta->total_bytes += bytes_written; + return Status::OK(); + } - if (values.value_data()) { - *values_buffer = values.value_data()->data(); + template + typename std::enable_if< + is_nested_type::value || is_null_type::value || is_decimal_type::value || + std::is_same::value || is_duration_type::value || + is_interval_type::value || is_fixed_size_binary_type::value || + std::is_same::value || std::is_same::value || + std::is_same::value, + Status>::type + Visit(const T& type) { + return Status::NotImplemented(type.ToString()); + } + + template + typename std::enable_if::value || + std::is_same::value || + std::is_same::value || + is_timestamp_type::value || is_boolean_type::value, + Status>::type + Visit(const T&) { + const auto& prim_values = checked_cast(values); + const auto& fw_type = checked_cast(*values.type()); + + if (prim_values.values()) { + const uint8_t* buffer = + prim_values.values()->data() + (prim_values.offset() * fw_type.bit_width() / 8); + int64_t bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8; + return WriteBuffer(buffer, + BitUtil::BytesForBits(values.length() * fw_type.bit_width()), + bit_offset); + } else { + return Status::OK(); } return Status::OK(); } - Status WriteArray(const Array& values, ArrayMetadata* meta) { - RETURN_NOT_OK(CheckStarted()); - RETURN_NOT_OK(LoadArrayMetadata(values, meta)); - - int64_t bytes_written; - - // Write the null bitmask - if (values.null_count() > 0) { - // We assume there is one bit for each value in values.nulls, - // starting at the zero offset. 
- int64_t null_bitmap_size = GetOutputLength(BitUtil::BytesForBits(values.length())); - if (values.null_bitmap()) { - auto null_bitmap = values.null_bitmap(); - RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), null_bitmap->data(), - values.offset(), null_bitmap_size, - &bytes_written)); - } else { - RETURN_NOT_OK(WritePaddedBlank(stream_.get(), null_bitmap_size, &bytes_written)); - } - meta->total_bytes += bytes_written; - } + template + enable_if_base_binary Visit(const T&) { + using ArrayType = typename TypeTraits::ArrayType; + const auto& ty_values = checked_cast(values); + using offset_type = typename T::offset_type; + const offset_type* offsets_data = nullptr; int64_t values_bytes = 0; - int64_t bit_offset = 0; + if (ty_values.value_offsets()) { + offsets_data = ty_values.raw_value_offsets(); + // All of the data has to be written because we don't have offset + // shifting implemented here as with the IPC format + values_bytes = offsets_data[values.length()]; + } + RETURN_NOT_OK(WriteBuffer(reinterpret_cast(offsets_data), + sizeof(offset_type) * (values.length() + 1), + /*bit_offset=*/0)); const uint8_t* values_buffer = nullptr; + if (ty_values.value_data()) { + values_buffer = ty_values.value_data()->data(); + } + return WriteBuffer(values_buffer, values_bytes, /*bit_offset=*/0); + } - if (is_binary_like(values.type_id())) { - RETURN_NOT_OK(WriteBinaryArray(checked_cast(values), meta, - &values_buffer, &values_bytes, &bytes_written)); - } else if (is_large_binary_like(values.type_id())) { - RETURN_NOT_OK(WriteBinaryArray(checked_cast(values), meta, - &values_buffer, &values_bytes, &bytes_written)); - } else { - const auto& prim_values = checked_cast(values); - const auto& fw_type = checked_cast(*values.type()); + Status Write() { + if (values.type_id() == Type::DICTIONARY) { + return WriteArrayV1(*(checked_cast(values).indices()), dst, + meta); + } - values_bytes = BitUtil::BytesForBits(values.length() * fw_type.bit_width()); + ARROW_ASSIGN_OR_RAISE(meta->type, ToFlatbufferType(*values.type())); + ARROW_ASSIGN_OR_RAISE(meta->offset, dst->Tell()); + meta->length = values.length(); + meta->null_count = values.null_count(); + meta->total_bytes = 0; - if (prim_values.values()) { - values_buffer = prim_values.values()->data() + - (prim_values.offset() * fw_type.bit_width() / 8); - bit_offset = (prim_values.offset() * fw_type.bit_width()) % 8; - } - } - if (values_buffer) { - RETURN_NOT_OK(WritePaddedWithOffset(stream_.get(), values_buffer, bit_offset, - values_bytes, &bytes_written)); - } else { - RETURN_NOT_OK(WritePaddedBlank(stream_.get(), values_bytes, &bytes_written)); + // Write the null bitmask + if (values.null_count() > 0) { + RETURN_NOT_OK(WriteBuffer(values.null_bitmap_data(), + BitUtil::BytesForBits(values.length()), values.offset())); } - meta->total_bytes += bytes_written; - - return Status::OK(); + // Write data buffer(s) + return VisitTypeInline(*values.type(), this); } +}; - Status WritePrimitiveValues(const Array& values) { - // Prepare metadata payload - ArrayMetadata meta; - RETURN_NOT_OK(WriteArray(values, &meta)); - current_column_->SetValues(meta); - return Status::OK(); +Status WriteArrayV1(const Array& values, io::OutputStream* dst, ArrayMetadata* meta) { + std::shared_ptr sanitized; + if (values.type_id() == Type::NA) { + // As long as R doesn't support NA, we write this as a StringColumn + // to ensure stable roundtrips. 
+ sanitized = std::make_shared(values.length(), nullptr, nullptr, + values.null_bitmap(), values.null_count()); + } else { + sanitized = MakeArray(values.data()); } + ArrayWriterV1 visitor{*sanitized, dst, meta}; + return visitor.Write(); +} - Status Visit(const NullArray& values) override { - std::shared_ptr sanitized_nulls; - RETURN_NOT_OK(SanitizeUnsupportedTypes(values, &sanitized_nulls)); - return WritePrimitiveValues(*sanitized_nulls); +Status WriteColumnV1(const ChunkedArray& values, io::OutputStream* dst, + ColumnMetadata* out) { + if (values.num_chunks() > 1) { + return Status::Invalid("Writing chunked arrays not supported in Feather V1"); + } + const Array& chunk = *values.chunk(0); + RETURN_NOT_OK(WriteArrayV1(chunk, dst, &out->values)); + switch (chunk.type_id()) { + case Type::DICTIONARY: { + out->meta_type = ColumnType::CATEGORY; + auto dictionary = checked_cast(chunk).dictionary(); + RETURN_NOT_OK(WriteArrayV1(*dictionary, dst, &out->category_levels)); + out->category_ordered = + checked_cast(*chunk.type()).ordered(); + } break; + case Type::DATE32: + out->meta_type = ColumnType::DATE; + break; + case Type::TIME32: { + out->meta_type = ColumnType::TIME; + out->temporal_unit = checked_cast(*chunk.type()).unit(); + } break; + case Type::TIMESTAMP: { + const auto& ts_type = checked_cast(*chunk.type()); + out->meta_type = ColumnType::TIMESTAMP; + out->temporal_unit = ts_type.unit(); + out->timezone = ts_type.timezone(); + } break; + default: + out->meta_type = ColumnType::PRIMITIVE; + break; } + return Status::OK(); +} -#define VISIT_PRIMITIVE(TYPE) \ - Status Visit(const TYPE& values) override { return WritePrimitiveValues(values); } - - VISIT_PRIMITIVE(BooleanArray) - VISIT_PRIMITIVE(Int8Array) - VISIT_PRIMITIVE(Int16Array) - VISIT_PRIMITIVE(Int32Array) - VISIT_PRIMITIVE(Int64Array) - VISIT_PRIMITIVE(UInt8Array) - VISIT_PRIMITIVE(UInt16Array) - VISIT_PRIMITIVE(UInt32Array) - VISIT_PRIMITIVE(UInt64Array) - VISIT_PRIMITIVE(FloatArray) - VISIT_PRIMITIVE(DoubleArray) - VISIT_PRIMITIVE(BinaryArray) - VISIT_PRIMITIVE(StringArray) - VISIT_PRIMITIVE(LargeBinaryArray) - VISIT_PRIMITIVE(LargeStringArray) - -#undef VISIT_PRIMITIVE - - Status Visit(const DictionaryArray& values) override { - const auto& dict_type = checked_cast(*values.type()); - - if (!is_integer(values.indices()->type_id())) { - return Status::Invalid("Category values must be integers"); - } +Status WriteFeatherV1(const Table& table, io::OutputStream* dst) { + // Preamble + int64_t bytes_written; + RETURN_NOT_OK(WritePadded(dst, reinterpret_cast(kFeatherV1MagicBytes), + strlen(kFeatherV1MagicBytes), &bytes_written)); + + // Write columns + flatbuffers::FlatBufferBuilder fbb; + std::vector> fb_columns; + for (int i = 0; i < table.num_columns(); ++i) { + ColumnMetadata col; + RETURN_NOT_OK(WriteColumnV1(*table.column(i), dst, &col)); + auto fb_column = fbs::CreateColumn( + fbb, fbb.CreateString(table.field(i)->name()), GetPrimitiveArray(fbb, col.values), + ToFlatbufferEnum(col.meta_type), col.WriteMetadata(fbb), + /*user_metadata=*/0); + fb_columns.push_back(fb_column); + } + + // Finalize file footer + auto root = fbs::CreateCTable(fbb, /*description=*/0, table.num_rows(), + fbb.CreateVector(fb_columns), kFeatherV1Version, + /*metadata=*/0); + fbb.Finish(root); + auto buffer = std::make_shared(fbb.GetBufferPointer(), + static_cast(fbb.GetSize())); + + // Writer metadata + RETURN_NOT_OK(WritePadded(dst, buffer->data(), buffer->size(), &bytes_written)); + uint32_t metadata_size = static_cast(bytes_written); + + // Footer: 
metadata length, magic bytes
+ RETURN_NOT_OK(dst->Write(&metadata_size, sizeof(uint32_t)));
+ return dst->Write(kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes));
+}

// ----------------------------------------------------------------------
+// Reader V2

+class ReaderV2 : public Reader {
+ public:
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& source) {
+ source_ = source;
+ ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_));
+ schema_ = reader->schema();
 return Status::OK();
 }

+ int version() const override { return kFeatherV2Version; }

+ std::shared_ptr<Schema> schema() const override { return schema_; }

+ Status Read(const IpcReadOptions& options, std::shared_ptr<Table>
* out) {
+ ARROW_ASSIGN_OR_RAISE(auto reader, RecordBatchFileReader::Open(source_, options));
+ std::vector<std::shared_ptr<RecordBatch>> batches;
+ for (int i = 0; i < reader->num_record_batches(); ++i) {
+ std::shared_ptr<RecordBatch> batch;
+ RETURN_NOT_OK(reader->ReadRecordBatch(i, &batch));
+ batches.emplace_back(batch);
+ }
+
+ // XXX: Handle included_fields in RecordBatchFileReader::schema
+ auto out_schema = reader->schema();
+ if (options.included_fields) {
+ const auto& indices = *options.included_fields;
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int i = 0; i < out_schema->num_fields(); ++i) {
+ if (std::find(indices.begin(), indices.end(), i) != indices.end()) {
+ fields.push_back(out_schema->field(i));
+ }
+ }
+ out_schema = ::arrow::schema(fields, out_schema->metadata());
+ }
+ return Table::FromRecordBatches(out_schema, batches, out);
+ }

+ Status Read(std::shared_ptr<Table>
* out) override {
+ return Read(IpcReadOptions::Defaults(), out);
+ }

+ Status Read(const std::vector<int>& indices, std::shared_ptr<Table>
* out) override {
+ auto options = IpcReadOptions::Defaults();
+ options.included_fields = indices;
+ return Read(options, out);
+ }

+ Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>
* out) override {
+ std::vector<int> indices;
+ std::shared_ptr<Schema> sch = this->schema();
+ for (auto name : names) {
+ int field_index = sch->GetFieldIndex(name);
+ if (field_index == -1) {
+ return Status::Invalid("Field named ", name, " is not found");
+ }
+ indices.push_back(field_index);
+ }
+ return Read(indices, out);
 }

 private:
- Status CheckStarted() {
- if (!initialized_stream_) {
- int64_t bytes_written_unused;
- RETURN_NOT_OK(WritePadded(stream_.get(),
- reinterpret_cast<const uint8_t*>(kFeatherMagicBytes),
- strlen(kFeatherMagicBytes), &bytes_written_unused));
- initialized_stream_ = true;
- }
- return Status::OK();
- }
-
- std::shared_ptr<io::OutputStream> stream_;
-
- bool initialized_stream_;
- TableBuilder metadata_;
-
- std::unique_ptr<ColumnBuilder> current_column_;
-
- Status AppendPrimitive(const PrimitiveArray& values, ArrayMetadata* out);
+ std::shared_ptr<io::RandomAccessFile> source_;
+ std::shared_ptr<Schema> schema_;
};

-TableWriter::TableWriter() { impl_.reset(new TableWriterImpl()); }
-
-TableWriter::~TableWriter() {}
-
-Status TableWriter::Open(const std::shared_ptr<io::OutputStream>& stream,
- std::unique_ptr<TableWriter>* out) {
- out->reset(new TableWriter());
- return (*out)->impl_->Open(stream);
+Result<std::shared_ptr<Reader>> Reader::Open(
+ const std::shared_ptr<io::RandomAccessFile>& source) {
+ // Pathological issue where the file is smaller than header and footer
+ // combined
+ ARROW_ASSIGN_OR_RAISE(int64_t size, source->GetSize());
+ if (size < /* 2 * 4 + 4 */ 12) {
+ return Status::Invalid("File is too small to be a well-formed file");
+ }
+
+ // Determine what kind of file we have. 6 is the max of len(FEA1) and
+ // len(ARROW1)
+ constexpr int magic_size = 6;
+ ARROW_ASSIGN_OR_RAISE(auto buffer, source->ReadAt(0, magic_size));
+
+ if (memcmp(buffer->data(), kFeatherV1MagicBytes, strlen(kFeatherV1MagicBytes)) == 0) {
+ std::shared_ptr<ReaderV1> result = std::make_shared<ReaderV1>();
+ RETURN_NOT_OK(result->Open(source));
+ return result;
+ } else if (memcmp(buffer->data(), internal::kArrowMagicBytes,
+ strlen(internal::kArrowMagicBytes)) == 0) {
+ std::shared_ptr<ReaderV2> result = std::make_shared<ReaderV2>();
+ RETURN_NOT_OK(result->Open(source));
+ return result;
+ } else {
+ return Status::Invalid("Not a Feather V1 or Arrow IPC file");
+ }
}

-void TableWriter::SetDescription(const std::string& desc) { impl_->SetDescription(desc); }
-
-void TableWriter::SetNumRows(int64_t num_rows) { impl_->SetNumRows(num_rows); }
+Status WriteTable(const Table& table, io::OutputStream* dst,
+ const WriteProperties& properties) {
+ if (properties.version == kFeatherV1Version) {
+ return WriteFeatherV1(table, dst);
+ } else {
+ IpcWriteOptions ipc_options = IpcWriteOptions::Defaults();
+ ipc_options.compression = properties.compression;
+ ipc_options.compression_level = properties.compression_level;

-Status TableWriter::Append(const std::string& name, const Array& values) {
- return impl_->Append(name, values);
+ std::shared_ptr<RecordBatchWriter> writer;
+ ARROW_ASSIGN_OR_RAISE(writer, NewFileWriter(dst, table.schema(), ipc_options));
+ RETURN_NOT_OK(writer->WriteTable(table, properties.chunksize));
+ return writer->Close();
+ }
}

-Status TableWriter::Write(const Table& table) { return impl_->Write(table); }
-
-Status TableWriter::Finalize() { return impl_->Finalize(); }
-
 } // namespace feather
 } // namespace ipc
 } // namespace arrow
diff --git a/cpp/src/arrow/ipc/feather.fbs b/cpp/src/arrow/ipc/feather.fbs
index 5ec06299864..b4076be8757 100644
--- a/cpp/src/arrow/ipc/feather.fbs
+++ b/cpp/src/arrow/ipc/feather.fbs
@@ -15,6 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
+/// DEPRECATED: Feather V2 is available starting in version 0.17.0 and does not +/// use this file at all. + namespace arrow.ipc.feather.fbs; /// Feather is an experimental serialization format implemented using @@ -141,6 +144,9 @@ table CTable { columns: [Column]; /// Version number of the Feather format + /// + /// Internal versions 0, 1, and 2: Implemented in Apache Arrow <= 0.16.0 and + /// wesm/feather. Uses "custom" metadata defined in this file. version: int; /// Table metadata (likely JSON), not yet used diff --git a/cpp/src/arrow/ipc/feather.h b/cpp/src/arrow/ipc/feather.h index 28b4b43c671..dfb4095aef3 100644 --- a/cpp/src/arrow/ipc/feather.h +++ b/cpp/src/arrow/ipc/feather.h @@ -25,12 +25,13 @@ #include #include +#include "arrow/result.h" +#include "arrow/util/compression.h" #include "arrow/util/visibility.h" namespace arrow { -class Array; -class ChunkedArray; +class Schema; class Status; class Table; @@ -44,53 +45,29 @@ class RandomAccessFile; namespace ipc { namespace feather { -static constexpr const int kFeatherVersion = 2; +static constexpr const int kFeatherV1Version = 2; +static constexpr const int kFeatherV2Version = 3; // ---------------------------------------------------------------------- // Metadata accessor classes -/// \class TableReader +/// \class Reader /// \brief An interface for reading columns from Feather files -class ARROW_EXPORT TableReader { +class ARROW_EXPORT Reader { public: - TableReader(); - ~TableReader(); + virtual ~Reader() = default; /// \brief Open a Feather file from a RandomAccessFile interface /// /// \param[in] source a RandomAccessFile instance - /// \param[out] out the table reader - static Status Open(const std::shared_ptr& source, - std::unique_ptr* out); - - /// \brief Optional table description - /// - /// This does not return a const std::string& because a string has to be - /// copied from the flatbuffer to be able to return a non-flatbuffer type - std::string GetDescription() const; - - /// \brief Return true if the table has a description field populated - bool HasDescription() const; + /// \return the table reader + static Result> Open( + const std::shared_ptr& source); /// \brief Return the version number of the Feather file - int version() const; + virtual int version() const = 0; - /// \brief Return the number of rows in the file - int64_t num_rows() const; - - /// \brief Return the number of columns in the file - int64_t num_columns() const; - - std::string GetColumnName(int i) const; - - /// \brief Read a column from the file as an arrow::ChunkedArray. - /// - /// \param[in] i the column index to read - /// \param[out] out the returned column - /// \return Status - /// - /// This function is zero-copy if the file source supports zero-copy reads - Status GetColumn(int i, std::shared_ptr* out); + virtual std::shared_ptr schema() const = 0; /// \brief Read all columns from the file as an arrow::Table. /// @@ -98,7 +75,7 @@ class ARROW_EXPORT TableReader { /// \return Status /// /// This function is zero-copy if the file source supports zero-copy reads - Status Read(std::shared_ptr
* out);
+ virtual Status Read(std::shared_ptr<Table>* out) = 0;

 /// \brief Read only the specified columns from the file as an arrow::Table.
 ///
 /// \param[in] indices the column indices to read
 /// \return Status
 ///
 /// This function is zero-copy if the file source supports zero-copy reads
- Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out);
+ virtual Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) = 0;

 /// \brief Read only the specified columns from the file as an arrow::Table.
 ///
 /// \param[in] names the column names to read
 /// \return Status
 ///
 /// This function is zero-copy if the file source supports zero-copy reads
- Status Read(const std::vector<std::string>& names, std::shared_ptr<Table>* out);
-
- private:
- class ARROW_NO_EXPORT TableReaderImpl;
- std::unique_ptr<TableReaderImpl> impl_;
+ virtual Status Read(const std::vector<std::string>& names,
+ std::shared_ptr<Table>
* out) = 0; }; -/// \class TableWriter -/// \brief Interface for writing Feather files -class ARROW_EXPORT TableWriter { - public: - ~TableWriter(); - - /// \brief Create a new TableWriter that writes to an OutputStream - /// \param[in] stream an output stream - /// \param[out] out the returned table writer - /// \return Status - static Status Open(const std::shared_ptr& stream, - std::unique_ptr* out); +struct ARROW_EXPORT WriteProperties { + static WriteProperties Defaults() { return WriteProperties(); } - /// \brief Set the description field in the file metadata - void SetDescription(const std::string& desc); + static WriteProperties DefaultsV1() { + WriteProperties props; + props.version = kFeatherV1Version; + return props; + } - /// \brief Set the number of rows in the file - void SetNumRows(int64_t num_rows); - - /// \brief Append a column to the file + /// Feather file version number /// - /// \param[in] name the column name - /// \param[in] values the column values as a contiguous arrow::Array - /// \return Status - Status Append(const std::string& name, const Array& values); + /// version 2: "Feather V1" Apache Arrow <= 0.16.0 + /// version 3: "Feather V2" Apache Arrow > 0.16.0 + int version = kFeatherV2Version; - /// \brief Write a table to the file - /// - /// \param[in] table the table to be written - /// \return Status - Status Write(const Table& table); + // Parameters for Feather V2 only - /// \brief Finalize the file by writing the file metadata and footer - /// \return Status - Status Finalize(); + /// Number of rows per intra-file chunk. Use smaller chunksize when you need + /// faster random row access + int64_t chunksize = 1LL << 16; - private: - TableWriter(); - class ARROW_NO_EXPORT TableWriterImpl; - std::unique_ptr impl_; + /// Compression type to use + Compression::type compression = Compression::ZSTD; + + /// Compressor-specific compression level + int compression_level = Compression::kUseDefaultCompressionLevel; }; +ARROW_EXPORT +Status WriteTable(const Table& table, io::OutputStream* dst, + const WriteProperties& properties = WriteProperties::Defaults()); + } // namespace feather } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/feather_internal.h b/cpp/src/arrow/ipc/feather_internal.h deleted file mode 100644 index 3c2463057d5..00000000000 --- a/cpp/src/arrow/ipc/feather_internal.h +++ /dev/null @@ -1,233 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Public API for the "Feather" file format, originally created at -// http://github.com/wesm/feather - -#pragma once - -#include -#include -#include -#include -#include - -#include - -#include "arrow/buffer.h" -#include "arrow/ipc/feather.h" -#include "arrow/type.h" - -#include "generated/feather_generated.h" - -namespace arrow { -namespace ipc { -namespace feather { - -typedef std::vector> ColumnVector; -typedef flatbuffers::FlatBufferBuilder FBB; -typedef flatbuffers::Offset FBString; - -struct ARROW_EXPORT ColumnType { - enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; -}; - -struct ARROW_EXPORT ArrayMetadata { - ArrayMetadata() {} - - ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, - int64_t total_bytes) - : type(type), - offset(offset), - length(length), - null_count(null_count), - total_bytes(total_bytes) {} - - bool Equals(const ArrayMetadata& other) const { - return this->type == other.type && this->offset == other.offset && - this->length == other.length && this->null_count == other.null_count && - this->total_bytes == other.total_bytes; - } - - fbs::Type type; - int64_t offset; - int64_t length; - int64_t null_count; - int64_t total_bytes; -}; - -struct ARROW_EXPORT CategoryMetadata { - ArrayMetadata levels; - bool ordered; -}; - -struct ARROW_EXPORT TimestampMetadata { - TimeUnit::type unit; - - // A timezone name known to the Olson timezone database. For display purposes - // because the actual data is all UTC - std::string timezone; -}; - -struct ARROW_EXPORT TimeMetadata { - TimeUnit::type unit; -}; - -static constexpr const char* kFeatherMagicBytes = "FEA1"; -static constexpr const int kFeatherDefaultAlignment = 8; - -class ColumnBuilder; - -class ARROW_EXPORT TableBuilder { - public: - explicit TableBuilder(int64_t num_rows); - ~TableBuilder() = default; - - FBB& fbb(); - Status Finish(); - std::shared_ptr GetBuffer() const; - - std::unique_ptr AddColumn(const std::string& name); - void SetDescription(const std::string& description); - void SetNumRows(int64_t num_rows); - void add_column(const flatbuffers::Offset& col); - - private: - flatbuffers::FlatBufferBuilder fbb_; - ColumnVector columns_; - - friend class ColumnBuilder; - - bool finished_; - std::string description_; - int64_t num_rows_; -}; - -class ARROW_EXPORT TableMetadata { - public: - TableMetadata() : table_(NULLPTR) {} - ~TableMetadata() = default; - - Status Open(const std::shared_ptr& buffer) { - metadata_buffer_ = buffer; - table_ = fbs::GetCTable(buffer->data()); - - if (table_->version() < kFeatherVersion) { - std::cout << "This Feather file is old" - << " and will not be readable beyond the 0.3.0 release" << std::endl; - } - return Status::OK(); - } - - bool HasDescription() const { return table_->description() != 0; } - - std::string GetDescription() const { - if (!HasDescription()) { - return std::string(""); - } - return table_->description()->str(); - } - - int version() const { return table_->version(); } - int64_t num_rows() const { return table_->num_rows(); } - int64_t num_columns() const { return table_->columns()->size(); } - - const fbs::Column* column(int i) { return table_->columns()->Get(i); } - - private: - std::shared_ptr metadata_buffer_; - const fbs::CTable* table_; -}; - -static inline flatbuffers::Offset GetPrimitiveArray( - FBB& fbb, const ArrayMetadata& array) { - return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset, - array.length, array.null_count, array.total_bytes); -} - -static inline fbs::TimeUnit 
ToFlatbufferEnum(TimeUnit::type unit) { - return static_cast(static_cast(unit)); -} - -static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { - return static_cast(static_cast(unit)); -} - -// Convert Feather enums to Flatbuffer enums - -const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { - fbs::TypeMetadata::NONE, // PRIMITIVE - fbs::TypeMetadata::CategoryMetadata, // CATEGORY - fbs::TypeMetadata::TimestampMetadata, // TIMESTAMP - fbs::TypeMetadata::DateMetadata, // DATE - fbs::TypeMetadata::TimeMetadata // TIME -}; - -static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { - return COLUMN_TYPE_ENUM_MAPPING[column_type]; -} - -static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) { - out->type = values->type(); - out->offset = values->offset(); - out->length = values->length(); - out->null_count = values->null_count(); - out->total_bytes = values->total_bytes(); -} - -class ARROW_EXPORT ColumnBuilder { - public: - ColumnBuilder(TableBuilder* parent, const std::string& name); - ~ColumnBuilder() = default; - - flatbuffers::Offset CreateColumnMetadata(); - - Status Finish(); - void SetValues(const ArrayMetadata& values); - void SetUserMetadata(const std::string& data); - void SetCategory(const ArrayMetadata& levels, bool ordered = false); - void SetTimestamp(TimeUnit::type unit); - void SetTimestamp(TimeUnit::type unit, const std::string& timezone); - void SetDate(); - void SetTime(TimeUnit::type unit); - FBB& fbb(); - - private: - TableBuilder* parent_; - - std::string name_; - ArrayMetadata values_; - std::string user_metadata_; - - // Column metadata - - // Is this a primitive type, or one of the types having metadata? Default is - // primitive - ColumnType::type type_; - - // Type-specific metadata union - CategoryMetadata meta_category_; - TimeMetadata meta_time_; - - TimestampMetadata meta_timestamp_; - - FBB* fbb_; -}; - -} // namespace feather -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 40198e08cd8..47412ee865a 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -15,18 +15,19 @@ // specific language governing permissions and limitations // under the License. 
+#include #include -#include #include +#include +#include #include #include "arrow/array.h" +#include "arrow/buffer.h" #include "arrow/io/memory.h" -#include "arrow/ipc/feather_internal.h" +#include "arrow/ipc/feather.h" #include "arrow/ipc/test_common.h" -#include "arrow/memory_pool.h" -#include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" @@ -34,348 +35,76 @@ #include "arrow/type.h" #include "arrow/util/checked_cast.h" -#include "generated/feather_generated.h" - namespace arrow { -class Buffer; - using internal::checked_cast; namespace ipc { namespace feather { -template -inline void assert_vector_equal(const std::vector& left, const std::vector& right) { - ASSERT_EQ(left.size(), right.size()); - - for (size_t i = 0; i < left.size(); ++i) { - ASSERT_EQ(left[i], right[i]) << i; - } -} - -class TestTableBuilder : public ::testing::Test { - public: - void SetUp() { tb_.reset(new TableBuilder(1000)); } - - virtual void Finish() { - ASSERT_OK(tb_->Finish()); +struct TestParam { + TestParam(int arg_version, + Compression::type arg_compression = Compression::UNCOMPRESSED) + : version(arg_version), compression(arg_compression) {} - table_.reset(new TableMetadata()); - ASSERT_OK(table_->Open(tb_->GetBuffer())); - } - - protected: - std::unique_ptr tb_; - std::unique_ptr table_; + int version; + Compression::type compression; }; -TEST_F(TestTableBuilder, Version) { - Finish(); - ASSERT_EQ(kFeatherVersion, table_->version()); -} - -TEST_F(TestTableBuilder, EmptyTable) { - Finish(); - - ASSERT_FALSE(table_->HasDescription()); - ASSERT_EQ("", table_->GetDescription()); - ASSERT_EQ(1000, table_->num_rows()); - ASSERT_EQ(0, table_->num_columns()); -} - -TEST_F(TestTableBuilder, SetDescription) { - std::string desc("this is some good data"); - tb_->SetDescription(desc); - Finish(); - ASSERT_TRUE(table_->HasDescription()); - ASSERT_EQ(desc, table_->GetDescription()); -} - -void AssertArrayEquals(const ArrayMetadata& left, const ArrayMetadata& right) { - EXPECT_EQ(left.type, right.type); - EXPECT_EQ(left.offset, right.offset); - EXPECT_EQ(left.length, right.length); - EXPECT_EQ(left.null_count, right.null_count); - EXPECT_EQ(left.total_bytes, right.total_bytes); -} - -TEST_F(TestTableBuilder, AddPrimitiveColumn) { - std::unique_ptr cb = tb_->AddColumn("f0"); - - ArrayMetadata values1; - ArrayMetadata values2; - values1.type = fbs::Type::INT32; - values1.offset = 10000; - values1.length = 1000; - values1.null_count = 100; - values1.total_bytes = 4000; - - cb->SetValues(values1); - - std::string user_meta = "as you wish"; - cb->SetUserMetadata(user_meta); - - ASSERT_OK(cb->Finish()); - - cb = tb_->AddColumn("f1"); - - values2.type = fbs::Type::UTF8; - values2.offset = 14000; - values2.length = 1000; - values2.null_count = 100; - values2.total_bytes = 10000; - - cb->SetValues(values2); - ASSERT_OK(cb->Finish()); - - Finish(); - - ASSERT_EQ(2, table_->num_columns()); - - auto col = table_->column(0); - - ASSERT_EQ("f0", col->name()->str()); - ASSERT_EQ(user_meta, col->user_metadata()->str()); - - ArrayMetadata values3; - FromFlatbuffer(col->values(), &values3); - AssertArrayEquals(values3, values1); - - col = table_->column(1); - ASSERT_EQ("f1", col->name()->str()); - - ArrayMetadata values4; - FromFlatbuffer(col->values(), &values4); - AssertArrayEquals(values4, values2); -} - -TEST_F(TestTableBuilder, AddCategoryColumn) { - ArrayMetadata values1(fbs::Type::UINT8, 10000, 1000, 100, 4000); - ArrayMetadata levels(fbs::Type::UTF8, 14000, 10, 0, 300); 
-
-  std::unique_ptr<ColumnBuilder> cb = tb_->AddColumn("c0");
-  cb->SetValues(values1);
-  cb->SetCategory(levels);
-  ASSERT_OK(cb->Finish());
-
-  cb = tb_->AddColumn("c1");
-  cb->SetValues(values1);
-  cb->SetCategory(levels, true);
-  ASSERT_OK(cb->Finish());
-
-  Finish();
-
-  auto col = table_->column(0);
-  ASSERT_EQ(fbs::TypeMetadata::CategoryMetadata, col->metadata_type());
-
-  ArrayMetadata result;
-  FromFlatbuffer(col->values(), &result);
-  AssertArrayEquals(result, values1);
-
-  auto cat_ptr = static_cast<const fbs::CategoryMetadata*>(col->metadata());
-  ASSERT_FALSE(cat_ptr->ordered());
-
-  FromFlatbuffer(cat_ptr->levels(), &result);
-  AssertArrayEquals(result, levels);
-
-  col = table_->column(1);
-  cat_ptr = static_cast<const fbs::CategoryMetadata*>(col->metadata());
-  ASSERT_TRUE(cat_ptr->ordered());
-  FromFlatbuffer(cat_ptr->levels(), &result);
-  AssertArrayEquals(result, levels);
-}
-
-TEST_F(TestTableBuilder, AddTimestampColumn) {
-  ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000);
-  std::unique_ptr<ColumnBuilder> cb = tb_->AddColumn("c0");
-  cb->SetValues(values1);
-  cb->SetTimestamp(TimeUnit::MILLI);
-  ASSERT_OK(cb->Finish());
-
-  cb = tb_->AddColumn("c1");
-
-  std::string tz("America/Los_Angeles");
-
-  cb->SetValues(values1);
-  cb->SetTimestamp(TimeUnit::SECOND, tz);
-  ASSERT_OK(cb->Finish());
-
-  Finish();
-
-  auto col = table_->column(0);
-
-  ASSERT_EQ(fbs::TypeMetadata::TimestampMetadata, col->metadata_type());
-
-  ArrayMetadata result;
-  FromFlatbuffer(col->values(), &result);
-  AssertArrayEquals(result, values1);
-
-  auto ts_ptr = static_cast<const fbs::TimestampMetadata*>(col->metadata());
-  ASSERT_EQ(fbs::TimeUnit::MILLISECOND, ts_ptr->unit());
-
-  col = table_->column(1);
-  ts_ptr = static_cast<const fbs::TimestampMetadata*>(col->metadata());
-  ASSERT_EQ(fbs::TimeUnit::SECOND, ts_ptr->unit());
-  ASSERT_EQ(tz, ts_ptr->timezone()->str());
-}
-
-TEST_F(TestTableBuilder, AddDateColumn) {
-  ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000);
-  std::unique_ptr<ColumnBuilder> cb = tb_->AddColumn("d0");
-  cb->SetValues(values1);
-  cb->SetDate();
-  ASSERT_OK(cb->Finish());
-
-  Finish();
-
-  auto col = table_->column(0);
-
-  ASSERT_EQ(fbs::TypeMetadata::DateMetadata, col->metadata_type());
-  ArrayMetadata result;
-  FromFlatbuffer(col->values(), &result);
-  AssertArrayEquals(result, values1);
-}
-
-TEST_F(TestTableBuilder, AddTimeColumn) {
-  ArrayMetadata values1(fbs::Type::INT64, 10000, 1000, 100, 4000);
-  std::unique_ptr<ColumnBuilder> cb = tb_->AddColumn("c0");
-  cb->SetValues(values1);
-  cb->SetTime(TimeUnit::SECOND);
-  ASSERT_OK(cb->Finish());
-  Finish();
-
-  auto col = table_->column(0);
-
-  ASSERT_EQ(fbs::TypeMetadata::TimeMetadata, col->metadata_type());
-  ArrayMetadata result;
-  FromFlatbuffer(col->values(), &result);
-  AssertArrayEquals(result, values1);
-
-  auto t_ptr = static_cast<const fbs::TimeMetadata*>(col->metadata());
-  ASSERT_EQ(fbs::TimeUnit::SECOND, t_ptr->unit());
-}
-
-void CheckArrays(const Array& expected, const Array& result) {
-  if (!result.Equals(expected)) {
-    std::stringstream pp_result;
-    std::stringstream pp_expected;
-
-    ARROW_EXPECT_OK(PrettyPrint(result, 0, &pp_result));
-    ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected));
-    FAIL() << "Got: " << pp_result.str() << "\nExpected: " << pp_expected.str();
-  }
-}
-
-void CheckBatches(const RecordBatch& expected, const RecordBatch& result) {
-  if (!result.Equals(expected)) {
-    std::stringstream pp_result;
-    std::stringstream pp_expected;
+class TestFeather : public ::testing::TestWithParam<TestParam> {
+ public:
+  void SetUp() { Initialize(); }

-    ARROW_EXPECT_OK(PrettyPrint(result, 0, &pp_result));
-    ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected));
-    FAIL() << "Got: " << pp_result.str() << "\nExpected: " << pp_expected.str();
-  }
-}
+  void Initialize() { ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create()); }

-class TestTableReader : public ::testing::Test {
- public:
-  void SetUp() {
-    ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create());
-    ASSERT_OK(TableWriter::Open(stream_, &writer_));
+  WriteProperties GetProperties() {
+    auto param = GetParam();
+    auto props = WriteProperties::Defaults();
+    props.version = param.version;
+    props.compression = param.compression;
+    return props;
   }

-  void Finish() {
-    // Write table footer
-    ASSERT_OK(writer_->Finalize());
-
+  void DoWrite(const Table& table) {
+    Initialize();
+    ASSERT_OK(WriteTable(table, stream_.get(), GetProperties()));
     ASSERT_OK_AND_ASSIGN(output_, stream_->Finish());
-
     auto buffer = std::make_shared<io::BufferReader>(output_);
-    ASSERT_OK(TableReader::Open(buffer, &reader_));
+    ASSERT_OK_AND_ASSIGN(reader_, Reader::Open(buffer));
   }

- protected:
-  std::shared_ptr<io::BufferOutputStream> stream_;
-  std::unique_ptr<TableWriter> writer_;
-  std::unique_ptr<TableReader> reader_;
-
-  std::shared_ptr<Buffer> output_;
-};
-
-TEST_F(TestTableReader, ReadIndices) {
-  std::shared_ptr<RecordBatch> batch1;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1));
-  std::shared_ptr<RecordBatch> batch2;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch2));
-
-  ASSERT_OK(writer_->Append("f0", *batch1->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch1->column(1)));
-  ASSERT_OK(writer_->Append("f2", *batch2->column(0)));
-  ASSERT_OK(writer_->Append("f3", *batch2->column(1)));
-  Finish();
-
-  std::vector<int> indices({3, 0, 5});
-  std::shared_ptr<Table> result;
-  ASSERT_OK(reader_->Read(indices, &result));
-  std::vector<std::shared_ptr<Field>> fields;
-  std::vector<std::shared_ptr<Array>> arrays;
-  fields.push_back(std::make_shared<Field>("f0", int32()));
-  arrays.push_back(batch1->column(0));
-  fields.push_back(std::make_shared<Field>("f3", int32()));
-  arrays.push_back(batch2->column(1));
-  auto expected = Table::Make(std::make_shared<Schema>(fields), arrays);
-  AssertTablesEqual(*expected, *result);
-}
-
-TEST_F(TestTableReader, ReadNames) {
-  std::shared_ptr<RecordBatch> batch1;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1));
-  std::shared_ptr<RecordBatch> batch2;
-  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch2));
-
-  ASSERT_OK(writer_->Append("f0", *batch1->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch1->column(1)));
-  ASSERT_OK(writer_->Append("f2", *batch2->column(0)));
-  ASSERT_OK(writer_->Append("f3", *batch2->column(1)));
-  Finish();
-
-  std::vector<std::string> names({"f3", "f0", "f5"});
-  std::shared_ptr<Table> result;
-  ASSERT_OK(reader_->Read(names, &result));
-  std::vector<std::shared_ptr<Field>> fields;
-  std::vector<std::shared_ptr<Array>> arrays;
-  fields.push_back(std::make_shared<Field>("f0", int32()));
-  arrays.push_back(batch1->column(0));
-  fields.push_back(std::make_shared<Field>("f3", int32()));
-  arrays.push_back(batch2->column(1));
-  auto expected = Table::Make(std::make_shared<Schema>(fields), arrays);
-  AssertTablesEqual(*expected, *result);
-}
-
-class TestTableWriter : public ::testing::Test {
- public:
-  void SetUp() {
-    ASSERT_OK_AND_ASSIGN(stream_, io::BufferOutputStream::Create());
-    ASSERT_OK(TableWriter::Open(stream_, &writer_));
+  void CheckSlice(std::shared_ptr<RecordBatch> batch, int start, int size) {
+    batch = batch->Slice(start, size);
+    std::shared_ptr<Table> table;
+    ASSERT_OK(Table::FromRecordBatches({batch}, &table));
+
+    DoWrite(*table);
+    std::shared_ptr<Table> result;
+    ASSERT_OK(reader_->Read(&result));
+    if (table->num_rows() > 0) {
+      AssertTablesEqual(*table, *result);
+    } else {
+      ASSERT_EQ(0, result->num_rows());
+      ASSERT_TRUE(result->schema()->Equals(*table->schema()));
+    }
   }

-  void Finish() {
-    // Write table footer
-    ASSERT_OK(writer_->Finalize());
-
-    ASSERT_OK_AND_ASSIGN(output_, stream_->Finish());
-
-    auto buffer = std::make_shared<io::BufferReader>(output_);
-    ASSERT_OK(TableReader::Open(buffer, &reader_));
+  void CheckSlices(std::shared_ptr<RecordBatch> batch) {
+    std::vector<int> starts = {0, 1, 300, 301, 302, 303, 304, 305, 306, 307};
+    std::vector<int> sizes = {0, 1, 7, 8, 30, 32, 100};
+    for (auto start : starts) {
+      for (auto size : sizes) {
+        CheckSlice(batch, start, size);
+      }
+    }
   }

-  void CheckBatch(std::shared_ptr<RecordBatch> batch) {
+  void CheckRoundtrip(std::shared_ptr<RecordBatch> batch) {
     std::shared_ptr<Table> table;
     std::vector<std::shared_ptr<RecordBatch>> batches = {batch};
     ASSERT_OK(Table::FromRecordBatches(batches, &table));
-    ASSERT_OK(writer_->Write(*table));
-    Finish();
+
+    DoWrite(*table);

     std::shared_ptr<Table> read_table;
+
+    ASSERT_OK(reader_->Read(&read_table));
@@ -384,65 +113,73 @@ class TestTableWriter : public ::testing::Test {

  protected:
   std::shared_ptr<io::BufferOutputStream> stream_;
-  std::unique_ptr<TableWriter> writer_;
-  std::unique_ptr<TableReader> reader_;
-
+  std::shared_ptr<Reader> reader_;
   std::shared_ptr<Buffer> output_;
 };

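For reference, every TestFeather case below runs once per (version, compression) pair supplied by INSTANTIATE_TEST_SUITE_P at the bottom of this file. The same matrix at the user level, sketched against the Python write_feather() signature added later in this patch (file path illustrative, and the "zstd" entry assumes an Arrow build with ZSTD enabled):

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({"f0": [1, 2, 3]})
    # One write per parameter combination, mirroring the C++ fixture
    for version, compression in [(1, None), (2, None), (2, "zstd")]:
        feather.write_feather(df, "/tmp/roundtrip.feather",
                              compression=compression, version=version)
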
-TEST_F(TestTableWriter, EmptyTable) {
-  Finish();
+TEST_P(TestFeather, ReadIndicesOrNames) {
+  std::shared_ptr<RecordBatch> batch1;
+  ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1));
+
+  std::shared_ptr<Table> table;
+  ASSERT_OK(Table::FromRecordBatches({batch1}, &table));

-  ASSERT_FALSE(reader_->HasDescription());
-  ASSERT_EQ("", reader_->GetDescription());
+  DoWrite(*table);

-  ASSERT_EQ(0, reader_->num_rows());
-  ASSERT_EQ(0, reader_->num_columns());
-}
+  auto expected = Table::Make(schema({field("f1", int32())}), {batch1->column(1)});
+
+  std::shared_ptr<Table> result1, result2;
+
+  std::vector<int> indices = {1};
+  ASSERT_OK(reader_->Read(indices, &result1));
+  AssertTablesEqual(*expected, *result1);

-TEST_F(TestTableWriter, SetNumRows) {
-  writer_->SetNumRows(1000);
-  Finish();
-  ASSERT_EQ(1000, reader_->num_rows());
+  std::vector<std::string> names = {"f1"};
+  ASSERT_OK(reader_->Read(names, &result2));
+  AssertTablesEqual(*expected, *result2);
 }

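ReadIndicesOrNames above pins down the two column-selection overloads of Reader::Read. The Python layer added later in this patch dispatches on element type the same way, so both of these select the same single column (a sketch, reusing the hypothetical file from the previous example):

    from pyarrow import feather

    feather.read_table("/tmp/roundtrip.feather", columns=[0])     # by index
    feather.read_table("/tmp/roundtrip.feather", columns=["f0"])  # by name
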
-TEST_F(TestTableWriter, SetDescription) {
-  std::string desc("contents of the file");
-  writer_->SetDescription(desc);
-  Finish();
+TEST_P(TestFeather, EmptyTable) {
+  std::vector<std::shared_ptr<ChunkedArray>> columns;
+  auto table = Table::Make(schema({}), columns, 0);

-  ASSERT_TRUE(reader_->HasDescription());
-  ASSERT_EQ(desc, reader_->GetDescription());
+  DoWrite(*table);

-  ASSERT_EQ(0, reader_->num_rows());
-  ASSERT_EQ(0, reader_->num_columns());
+  std::shared_ptr<Table> result;
+  ASSERT_OK(reader_->Read(&result));
+  AssertTablesEqual(*table, *result);
+}
+
+TEST_P(TestFeather, SetNumRows) {
+  std::vector<std::shared_ptr<ChunkedArray>> columns;
+  auto table = Table::Make(schema({}), columns, 1000);
+  DoWrite(*table);
+  std::shared_ptr<Table> result;
+  ASSERT_OK(reader_->Read(&result));
+  ASSERT_EQ(1000, result->num_rows());
 }

-TEST_F(TestTableWriter, PrimitiveRoundTrip) {
+TEST_P(TestFeather, PrimitiveRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch));

-  ASSERT_OK(writer_->Append("f0", *batch->column(0)));
-  ASSERT_OK(writer_->Append("f1", *batch->column(1)));
-  Finish();
+  std::shared_ptr<Table> table;
+  ASSERT_OK(Table::FromRecordBatches({batch}, &table));

-  std::shared_ptr<ChunkedArray> col;
-  ASSERT_OK(reader_->GetColumn(0, &col));
-  ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0)));
-  ASSERT_EQ("f0", reader_->GetColumnName(0));
+  DoWrite(*table);

-  ASSERT_OK(reader_->GetColumn(1, &col));
-  ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1)));
-  ASSERT_EQ("f1", reader_->GetColumnName(1));
+  std::shared_ptr<Table> result;
+  ASSERT_OK(reader_->Read(&result));
+  AssertTablesEqual(*table, *result);
 }

-TEST_F(TestTableWriter, CategoryRoundtrip) {
+TEST_P(TestFeather, CategoryRoundtrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeDictionaryFlat(&batch));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
 }

-TEST_F(TestTableWriter, TimeTypes) {
+TEST_P(TestFeather, TimeTypes) {
   std::vector<bool> is_valid = {true, true, true, false, true, true, true};
   auto f0 = field("f0", date32());
   auto f1 = field("f1", time32(TimeUnit::MILLI));
@@ -485,85 +222,63 @@
   }
   auto batch = RecordBatch::Make(schema, 7, std::move(arrays));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
 }

-TEST_F(TestTableWriter, VLenPrimitiveRoundTrip) {
+TEST_P(TestFeather, VLenPrimitiveRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch));
-  CheckBatch(batch);
+  CheckRoundtrip(batch);
 }

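The PrimitiveNullRoundTrip case that follows needs a per-version branch because Feather V1 has no null logical type: all-null columns written as V1 come back with a different physical representation, while V2 preserves the types exactly. A minimal Python repro of that asymmetry (hypothetical paths; behavior as asserted by the test below):

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({"all_none": [None] * 3})
    feather.write_feather(df, "/tmp/nulls_v1.feather", version=1)
    feather.write_feather(df, "/tmp/nulls_v2.feather", version=2)
    # Only the V2 file round-trips the column type unchanged.
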
-TEST_F(TestTableWriter, PrimitiveNullRoundTrip) {
+TEST_P(TestFeather, PrimitiveNullRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeNullRecordBatch(&batch));

-  for (int i = 0; i < batch->num_columns(); ++i) {
-    ASSERT_OK(writer_->Append(batch->column_name(i), *batch->column(i)));
-  }
-  Finish();
-
-  std::shared_ptr<ChunkedArray> col;
-  for (int i = 0; i < batch->num_columns(); ++i) {
-    ASSERT_OK(reader_->GetColumn(i, &col));
-    ASSERT_EQ(batch->column_name(i), reader_->GetColumnName(i));
-    StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
-                           batch->column(i)->null_bitmap(),
-                           batch->column(i)->null_count());
-    CheckArrays(str_values, *col->chunk(0));
-  }
-}
-
-class TestTableWriterSlice : public TestTableWriter,
-                             public ::testing::WithParamInterface<std::tuple<int, int>> {
- public:
-  void CheckSlice(std::shared_ptr<RecordBatch> batch) {
-    auto p = GetParam();
-    auto start = std::get<0>(p);
-    auto size = std::get<1>(p);
-
-    batch = batch->Slice(start, size);
-
-    ASSERT_OK(writer_->Append("f0", *batch->column(0)));
-    ASSERT_OK(writer_->Append("f1", *batch->column(1)));
-    Finish();
+  std::shared_ptr<Table> table;
+  ASSERT_OK(Table::FromRecordBatches({batch}, &table));

-    std::shared_ptr<ChunkedArray> col;
-    ASSERT_OK(reader_->GetColumn(0, &col));
-    ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0)));
-    ASSERT_EQ("f0", reader_->GetColumnName(0));
+  DoWrite(*table);

-    ASSERT_OK(reader_->GetColumn(1, &col));
-    ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1)));
-    ASSERT_EQ("f1", reader_->GetColumnName(1));
+  std::shared_ptr<Table> result;
+  ASSERT_OK(reader_->Read(&result));
+
+  if (GetParam().version == kFeatherV1Version) {
+    std::vector<std::shared_ptr<Field>> expected_fields;
+    for (int i = 0; i < batch->num_columns(); ++i) {
+      ASSERT_EQ(batch->column_name(i), reader_->schema()->field(i)->name());
+      StringArray str_values(batch->column(i)->length(), nullptr, nullptr,
+                             batch->column(i)->null_bitmap(),
+                             batch->column(i)->null_count());
+      AssertArraysEqual(str_values, *result->column(i)->chunk(0));
+    }
+  } else {
+    AssertTablesEqual(*table, *result);
   }
-};
+}

-TEST_P(TestTableWriterSlice, SliceRoundTrip) {
+TEST_P(TestFeather, SliceRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeIntBatchSized(600, &batch));
-  CheckSlice(batch);
+  CheckSlices(batch);
 }

-TEST_P(TestTableWriterSlice, SliceStringsRoundTrip) {
-  auto p = GetParam();
-  auto start = std::get<0>(p);
-  auto with_nulls = start % 2 == 0;
-
+TEST_P(TestFeather, SliceStringsRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
-  ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, with_nulls));
-  CheckSlice(batch);
+  ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true));
+  CheckSlices(batch);
 }

-TEST_P(TestTableWriterSlice, SliceBooleanRoundTrip) {
+TEST_P(TestFeather, SliceBooleanRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK(ipc::test::MakeBooleanBatchSized(600, &batch));
-  CheckSlice(batch);
+  CheckSlices(batch);
 }

-INSTANTIATE_TEST_SUITE_P(TestTableWriterSliceOffsets, TestTableWriterSlice,
-                         ::testing::Combine(::testing::Values(0, 1, 300, 301, 302, 303,
-                                                              304, 305, 306, 307),
-                                            ::testing::Values(0, 1, 7, 8, 30, 32, 100)));
+INSTANTIATE_TEST_SUITE_P(
+    FeatherTests, TestFeather,
+    ::testing::Values(TestParam(kFeatherV1Version), TestParam(kFeatherV2Version),
+                      TestParam(kFeatherV2Version, Compression::ZSTD)));

 }  // namespace feather
 }  // namespace ipc
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 129a6d13645..3ce72177155 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -479,12 +479,14 @@ Status Table::FromRecordBatches(const std::shared_ptr<Schema>& schema,
   const int nbatches = static_cast<int>(batches.size());
   const int ncolumns = static_cast<int>(schema->num_fields());

+  int64_t num_rows = 0;
   for (int i = 0; i < nbatches; ++i) {
     if (!batches[i]->schema()->Equals(*schema, false)) {
       return Status::Invalid("Schema at index ", static_cast<int>(i),
                              " was different: \n", schema->ToString(), "\nvs\n",
                              batches[i]->schema()->ToString());
     }
+    num_rows += batches[i]->num_rows();
   }

   std::vector<std::shared_ptr<ChunkedArray>> columns(ncolumns);
@@ -497,7 +499,7 @@
     columns[i] = std::make_shared<ChunkedArray>(column_arrays, schema->field(i)->type());
   }

-  *table = Table::Make(schema, columns);
+  *table = Table::Make(schema, columns, num_rows);
   return Status::OK();
 }
diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi
index 8700f67ae62..3302f7c5fd5 100644
--- a/python/pyarrow/feather.pxi
+++ b/python/pyarrow/feather.pxi
@@ -16,61 +16,41 @@
 # under the License.
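The incremental writer object disappears from the Cython layer too: the bindings below expose a single write_feather() call plus a thin FeatherReader wrapper. A sketch of driving the reader directly, assuming the method names defined in this file and that the extension module is importable as pyarrow.lib:

    from pyarrow.lib import FeatherReader

    reader = FeatherReader()
    reader.open("/tmp/roundtrip.feather")
    table = reader.read()               # whole file as a pyarrow.Table
    subset = reader.read_indices([0])   # or reader.read_names(["f0"])
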
# --------------------------------------------------------------------- -# Implement legacy Feather file format +# Implement Feather file format class FeatherError(Exception): pass -cdef class FeatherWriter: - cdef: - unique_ptr[CFeatherWriter] writer - - cdef public: - int64_t num_rows - - def __cinit__(self): - self.num_rows = -1 - - def open(self, object dest): - cdef shared_ptr[COutputStream] sink - get_writer(dest, &sink) +def write_feather(Table table, object dest, compression=None, + compression_level=None, + version=2): + cdef shared_ptr[COutputStream] sink + get_writer(dest, &sink) - with nogil: - check_status(CFeatherWriter.Open(sink, &self.writer)) - - def close(self): - if self.num_rows < 0: - self.num_rows = 0 - self.writer.get().SetNumRows(self.num_rows) - with nogil: - check_status(self.writer.get().Finalize()) + cdef CFeatherProperties properties + if version == 2: + properties.version = kFeatherV2Version + else: + properties.version = kFeatherV1Version - def write_array(self, object name, object col, object mask=None): - cdef Array arr + if compression == 'zstd': + properties.compression = CCompressionType_ZSTD + else: + properties.compression = CCompressionType_UNCOMPRESSED - if self.num_rows >= 0: - if len(col) != self.num_rows: - raise ValueError('prior column had a different number of rows') - else: - self.num_rows = len(col) + if compression_level is not None: + properties.compression_level = compression_level - if isinstance(col, Array): - arr = col - else: - arr = Array.from_pandas(col, mask=mask) - - cdef c_string c_name = tobytes(name) - - with nogil: - check_status( - self.writer.get().Append(c_name, deref(arr.sp_array))) + with nogil: + check_status(WriteFeather(deref(table.table), sink.get(), + properties)) cdef class FeatherReader: cdef: - unique_ptr[CFeatherReader] reader + shared_ptr[CFeatherReader] reader def __cinit__(self): pass @@ -80,32 +60,9 @@ cdef class FeatherReader: get_reader(source, use_memory_map, &reader) with nogil: - check_status(CFeatherReader.Open(reader, &self.reader)) - - @property - def num_rows(self): - return self.reader.get().num_rows() - - @property - def num_columns(self): - return self.reader.get().num_columns() - - def get_column_name(self, int i): - cdef c_string name = self.reader.get().GetColumnName(i) - return frombytes(name) - - def get_column(self, int i): - if i < 0 or i >= self.num_columns: - raise IndexError(i) - - cdef shared_ptr[CChunkedArray] sp_chunked_array - with nogil: - check_status(self.reader.get() - .GetColumn(i, &sp_chunked_array)) - - return pyarrow_wrap_chunked_array(sp_chunked_array) + self.reader = GetResultValue(CFeatherReader.Open(reader)) - def _read(self): + def read(self): cdef shared_ptr[CTable] sp_table with nogil: check_status(self.reader.get() @@ -113,7 +70,7 @@ cdef class FeatherReader: return pyarrow_wrap_table(sp_table) - def _read_indices(self, indices): + def read_indices(self, indices): cdef: shared_ptr[CTable] sp_table vector[int] c_indices @@ -126,7 +83,7 @@ cdef class FeatherReader: return pyarrow_wrap_table(sp_table) - def _read_names(self, names): + def read_names(self, names): cdef: shared_ptr[CTable] sp_table vector[c_string] c_names diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 82b8ee9146b..d5e961b9846 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -29,74 +29,6 @@ def _check_pandas_version(): raise ImportError("feather requires pandas >= 0.17.0") -class FeatherReader(ext.FeatherReader): - - def __init__(self, source): - 
_check_pandas_version() - self.source = source - self.open(source) - - def read_table(self, columns=None): - if columns is None: - return self._read() - column_types = [type(column) for column in columns] - if all(map(lambda t: t == int, column_types)): - return self._read_indices(columns) - elif all(map(lambda t: t == str, column_types)): - return self._read_names(columns) - - column_type_names = [t.__name__ for t in column_types] - raise TypeError("Columns must be indices or names. " - "Got columns {} of types {}" - .format(columns, column_type_names)) - - def read_pandas(self, columns=None, use_threads=True): - return self.read_table(columns=columns).to_pandas( - use_threads=use_threads) - - -def check_chunked_overflow(name, col): - if col.num_chunks == 1: - return - - if col.type in (ext.binary(), ext.string()): - raise ValueError("Column '{}' exceeds 2GB maximum capacity of " - "a Feather binary column. This restriction may be " - "lifted in the future".format(name)) - else: - # TODO(wesm): Not sure when else this might be reached - raise ValueError("Column '{}' of type {} was chunked on conversion " - "to Arrow and cannot be currently written to " - "Feather format".format(name, str(col.type))) - - -class FeatherWriter: - - def __init__(self, dest): - _check_pandas_version() - self.dest = dest - self.writer = ext.FeatherWriter() - self.writer.open(dest) - - def write(self, df): - if (_pandas_api.has_sparse - and isinstance(df, _pandas_api.pd.SparseDataFrame)): - df = df.to_dense() - - if not df.columns.is_unique: - raise ValueError("cannot serialize duplicate column names") - - # TODO(wesm): Remove this length check, see ARROW-1732 - if len(df.columns) > 0: - table = Table.from_pandas(df, preserve_index=False) - for i, name in enumerate(table.schema.names): - col = table[i] - check_chunked_overflow(name, col) - self.writer.write_array(name, col.chunk(0)) - - self.writer.close() - - class FeatherDataset: """ Encapsulates details of reading a list of Feather files. @@ -127,15 +59,15 @@ def read_table(self, columns=None): pyarrow.Table Content of the file as a table (of columns) """ - _fil = FeatherReader(self.paths[0]).read_table(columns=columns) + _fil = read_table(self.paths[0], columns=columns) self._tables = [_fil] self.schema = _fil.schema - for fil in self.paths[1:]: - fil_table = FeatherReader(fil).read_table(columns=columns) + for path in self.paths[1:]: + table = read_table(path, columns=columns) if self.validate_schema: - self.validate_schemas(fil, fil_table) - self._tables.append(fil_table) + self.validate_schemas(path, table) + self._tables.append(table) return concat_tables(self._tables) def validate_schemas(self, piece, table): @@ -165,7 +97,23 @@ def read_pandas(self, columns=None, use_threads=True): use_threads=use_threads) -def write_feather(df, dest): +def check_chunked_overflow(name, col): + if col.num_chunks == 1: + return + + if col.type in (ext.binary(), ext.string()): + raise ValueError("Column '{}' exceeds 2GB maximum capacity of " + "a Feather binary column. This restriction may be " + "lifted in the future".format(name)) + else: + # TODO(wesm): Not sure when else this might be reached + raise ValueError("Column '{}' of type {} was chunked on conversion " + "to Arrow and cannot be currently written to " + "Feather format".format(name, str(col.type))) + + +def write_feather(df, dest, compression=None, compression_level=None, + version=2): """ Write a pandas.DataFrame to Feather format. 
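The hunk that follows wires the new keyword arguments through the rewritten write_feather(). In use (paths illustrative; "zstd" assumes an Arrow build with ZSTD support):

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({"a": list(range(10))})
    feather.write_feather(df, "/tmp/a.feather")                  # V2, uncompressed
    feather.write_feather(df, "/tmp/a_zstd.feather",
                          compression="zstd", compression_level=5)
    feather.write_feather(df, "/tmp/a_v1.feather", version=1)    # legacy format
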
@@ -175,15 +123,38 @@
         Dataframe to write out as feather format.
     dest : str
         Local destination path.
+    compression : string, default None
+        Either "zstd" or None (for now)
+    compression_level : int, default None
+        Use a compression level particular to the chosen compressor. If None,
+        use the default compression level.
+    version : int, default 2
+        Feather file version. Version 2 is the current format. Version 1 is
+        the more limited legacy format.
     """
-    writer = FeatherWriter(dest)
+    _check_pandas_version()
+    if (_pandas_api.has_sparse
+            and isinstance(df, _pandas_api.pd.SparseDataFrame)):
+        df = df.to_dense()
+
+    if _pandas_api.is_data_frame(df):
+        table = Table.from_pandas(df, preserve_index=False)
+
+        if version == 1:
+            # Version 1 does not support chunked columns
+            for i, name in enumerate(table.schema.names):
+                col = table[i]
+                check_chunked_overflow(name, col)
+    else:
+        table = df
+
+    if version == 1 and len(table.column_names) > len(set(table.column_names)):
+        raise ValueError("cannot serialize duplicate column names")
+
     try:
-        writer.write(df)
+        ext.write_feather(table, dest, compression=compression,
+                          compression_level=compression_level,
+                          version=version)
     except Exception:
-        # Try to make sure the resource is closed
-        import gc
-        writer = None
-        gc.collect()
         if isinstance(dest, str):
             try:
                 os.remove(dest)
@@ -192,7 +163,7 @@
     raise


-def read_feather(source, columns=None, use_threads=True):
+def read_feather(source, columns=None, as_pandas=True, use_threads=True):
     """
     Read a pandas.DataFrame from Feather format.
@@ -203,6 +174,8 @@
     columns : sequence, optional
         Only read a specific set of columns. If not provided, all columns are
        read.
+    as_pandas : bool, default True
+        Convert result to pandas.DataFrame
     use_threads: bool, default True
        Whether to parallelize reading using multiple threads.

     Returns
     -------
     df : pandas.DataFrame
     """
-    reader = FeatherReader(source)
-    return reader.read_pandas(columns=columns, use_threads=use_threads)
+    _check_pandas_version()
+    reader = ext.FeatherReader()
+    reader.open(source)
+
+    def _handle_result(result):
+        if as_pandas:
+            result = result.to_pandas(use_threads=use_threads)
+        return result
+
+    if columns is None:
+        return _handle_result(reader.read())
+
+    column_types = [type(column) for column in columns]
+    if all(map(lambda t: t == int, column_types)):
+        return _handle_result(reader.read_indices(columns))
+    elif all(map(lambda t: t == str, column_types)):
+        return _handle_result(reader.read_names(columns))
+
+    column_type_names = [t.__name__ for t in column_types]
+    raise TypeError("Columns must be indices or names. 
" + "Got columns {} of types {}" + .format(columns, column_type_names)) def read_table(source, columns=None): @@ -229,5 +222,4 @@ def read_table(source, columns=None): ------- table : pyarrow.Table """ - reader = FeatherReader(source) - return reader.read_table(columns=columns) + return read_feather(source, columns=columns, as_pandas=False) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4b60e82d5e3..95b059f8648 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1320,33 +1320,26 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: const CIpcWriteOptions& options, CIpcPayload* out) - cdef cppclass CFeatherWriter" arrow::ipc::feather::TableWriter": - @staticmethod - CStatus Open(const shared_ptr[COutputStream]& stream, - unique_ptr[CFeatherWriter]* out) + int kFeatherV1Version" arrow::ipc::feather::kFeatherV1Version" + int kFeatherV2Version" arrow::ipc::feather::kFeatherV2Version" - void SetDescription(const c_string& desc) - void SetNumRows(int64_t num_rows) + cdef cppclass CFeatherProperties" arrow::ipc::feather::WriteProperties": + int version + int chunksize + CCompressionType compression + int compression_level - CStatus Append(const c_string& name, const CArray& values) - CStatus Finalize() + CStatus WriteFeather" arrow::ipc::feather::WriteTable"\ + (const CTable& table, COutputStream* out, + CFeatherProperties properties) - cdef cppclass CFeatherReader" arrow::ipc::feather::TableReader": + cdef cppclass CFeatherReader" arrow::ipc::feather::Reader": @staticmethod - CStatus Open(const shared_ptr[CRandomAccessFile]& file, - unique_ptr[CFeatherReader]* out) - - c_string GetDescription() - c_bool HasDescription() - - int64_t num_rows() - int64_t num_columns() - + CResult[shared_ptr[CFeatherReader]] Open( + const shared_ptr[CRandomAccessFile]& file) + int version() shared_ptr[CSchema] schema() - CStatus GetColumn(int i, shared_ptr[CChunkedArray]* out) - c_string GetColumnName(int i) - CStatus Read(shared_ptr[CTable]* out) CStatus Read(const vector[int] indices, shared_ptr[CTable]* out) CStatus Read(const vector[c_string] names, shared_ptr[CTable]* out) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 7a0c5c2b806..ff3d0d0ff4d 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -19,16 +19,13 @@ import os import sys import tempfile -import unittest import pytest -from numpy.testing import assert_array_equal import numpy as np import pyarrow as pa -from pyarrow.feather import (read_feather, write_feather, - read_table, FeatherReader, FeatherDataset) -from pyarrow.lib import FeatherWriter +from pyarrow.feather import (read_feather, write_feather, read_table, + FeatherDataset) try: @@ -49,512 +46,490 @@ def random_path(prefix='feather_'): return tempfile.mktemp(prefix=prefix) -class TestFeatherReader(unittest.TestCase): +@pytest.fixture(scope="module", params=[1, 2]) +def version(request): + yield request.param - def setUp(self): - self.test_files = [] - def tearDown(self): - for path in self.test_files: - try: - os.remove(path) - except os.error: - pass +TEST_FILES = None - def test_file_not_exist(self): - with pytest.raises(pa.ArrowIOError): - FeatherReader('test_invalid_file') - def _get_null_counts(self, path, columns=None): - reader = FeatherReader(path) - counts = [] - for i in range(reader.num_columns): - col = reader.get_column(i) - name = reader.get_column_name(i) - if columns is None or name 
in columns: - counts.append(col.null_count) +def setup_module(module): + global TEST_FILES + TEST_FILES = [] - return counts - def _check_pandas_roundtrip(self, df, expected=None, path=None, - columns=None, null_counts=None, - use_threads=False): - if path is None: - path = random_path() +def teardown_module(module): + for path in TEST_FILES: + try: + os.remove(path) + except os.error: + pass - self.test_files.append(path) - write_feather(df, path) - if not os.path.exists(path): - raise Exception('file not written') - result = read_feather(path, columns, use_threads=use_threads) - if expected is None: - expected = df +def test_file_not_exist(): + with pytest.raises(pa.ArrowIOError): + read_feather('test_invalid_file') - assert_frame_equal(result, expected) - if null_counts is None: - null_counts = np.zeros(len(expected.columns)) - - np.testing.assert_array_equal(self._get_null_counts(path, columns), - null_counts) - - def _assert_error_on_write(self, df, exc, path=None): - # check that we are raising the exception - # on writing - - if path is None: - path = random_path() - - self.test_files.append(path) - - def f(): - write_feather(df, path) - - pytest.raises(exc, f) - - def test_dataset(self): - num_values = (100, 100) - num_files = 5 - paths = [random_path() for i in range(num_files)] - df = pd.DataFrame(np.random.randn(*num_values), - columns=['col_' + str(i) - for i in range(num_values[1])]) - - self.test_files.extend(paths) - for index, path in enumerate(paths): - rows = (index * (num_values[0] // num_files), - (index + 1) * (num_values[0] // num_files)) - writer = FeatherWriter() - writer.open(path) - - for col in range(num_values[1]): - writer.write_array(df.columns[col], - df.iloc[rows[0]:rows[1], col]) - - writer.close() - - data = FeatherDataset(paths).read_pandas() - assert_frame_equal(data, df) - - def test_num_columns_attr(self): - df0 = pd.DataFrame({}) - df1 = pd.DataFrame({ - 'foo': [1, 2, 3, 4, 5] - }) - df2 = pd.DataFrame({ - 'foo': [1, 2, 3, 4, 5], - 'bar': [1, 2, 3, 4, 5] - }) - for df, ncols in zip([df0, df1, df2], [0, 1, 2]): - path = random_path() - self.test_files.append(path) - write_feather(df, path) - - reader = FeatherReader(path) - assert reader.num_columns == ncols - - def test_num_rows_attr(self): - df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]}) +def _check_pandas_roundtrip(df, expected=None, path=None, + columns=None, use_threads=False, + version=None, compression=None): + if path is None: path = random_path() - self.test_files.append(path) - write_feather(df, path) - reader = FeatherReader(path) - assert reader.num_rows == len(df) + TEST_FILES.append(path) + write_feather(df, path, compression=compression, version=version) + if not os.path.exists(path): + raise Exception('file not written') + + result = read_feather(path, columns, use_threads=use_threads) + if expected is None: + expected = df - df = pd.DataFrame({}) + assert_frame_equal(result, expected) + + +def _assert_error_on_write(df, exc, path=None): + # check that we are raising the exception + # on writing + + if path is None: path = random_path() - self.test_files.append(path) + + TEST_FILES.append(path) + + def f(): write_feather(df, path) - reader = FeatherReader(path) - assert reader.num_rows == 0 + pytest.raises(exc, f) - def test_float_no_nulls(self): - data = {} - numpy_dtypes = ['f4', 'f8'] - num_values = 100 - for dtype in numpy_dtypes: - values = np.random.randn(num_values) - data[dtype] = values.astype(dtype) +def test_dataset(version): + num_values = (100, 100) + num_files = 5 + paths = 
[random_path() for i in range(num_files)] + df = pd.DataFrame(np.random.randn(*num_values), + columns=['col_' + str(i) + for i in range(num_values[1])]) - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + TEST_FILES.extend(paths) + for index, path in enumerate(paths): + rows = (index * (num_values[0] // num_files), + (index + 1) * (num_values[0] // num_files)) - def test_read_table(self): - num_values = (100, 100) - path = random_path() + write_feather(df.iloc[rows[0]:rows[1]], path, version=version) + + data = FeatherDataset(paths).read_pandas() + assert_frame_equal(data, df) + + +def test_float_no_nulls(version): + data = {} + numpy_dtypes = ['f4', 'f8'] + num_values = 100 + + for dtype in numpy_dtypes: + values = np.random.randn(num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) + + +def test_read_table(version): + num_values = (100, 100) + path = random_path() + + TEST_FILES.append(path) + + values = np.random.randint(0, 100, size=num_values) + + df = pd.DataFrame(values, columns=['col_' + str(i) + for i in range(100)]) + write_feather(df, path, version=version) - self.test_files.append(path) - writer = FeatherWriter() - writer.open(path) + data = pd.DataFrame(values, + columns=['col_' + str(i) for i in range(100)]) + table = pa.Table.from_pandas(data) + result = read_table(path) + + assert_frame_equal(table.to_pandas(), result.to_pandas()) + + +def test_float_nulls(version): + num_values = 100 + + path = random_path() + TEST_FILES.append(path) + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + dtypes = ['f4', 'f8'] + expected_cols = [] + + arrays = [] + for name in dtypes: + values = np.random.randn(num_values).astype(name) + arrays.append(pa.array(values, mask=null_mask)) + + values[null_mask] = np.nan + + expected_cols.append(values) + + table = pa.table(arrays, names=dtypes) + write_feather(table, path, version=version) + + ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), + columns=dtypes) + + result = read_feather(path) + assert_frame_equal(result, ex_frame) + + +def test_integer_no_nulls(version): + data = {} + + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', + 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + for dtype in numpy_dtypes: values = np.random.randint(0, 100, size=num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) - for i in range(100): - writer.write_array('col_' + str(i), values[:, i]) - writer.close() +def test_platform_numpy_integers(version): + data = {} - data = pd.DataFrame(values, - columns=['col_' + str(i) for i in range(100)]) - table = pa.Table.from_pandas(data) + numpy_dtypes = ['longlong'] + num_values = 100 - result = read_table(path) + for dtype in numpy_dtypes: + values = np.random.randint(0, 100, size=num_values) + data[dtype] = values.astype(dtype) - assert_frame_equal(table.to_pandas(), result.to_pandas()) + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, version=version) - def test_float_nulls(self): - num_values = 100 - path = random_path() - self.test_files.append(path) - writer = FeatherWriter() - writer.open(path) +def test_integer_with_nulls(version): + # pandas requires upcast to float dtype + path = random_path() + TEST_FILES.append(path) - null_mask = np.random.randint(0, 10, size=num_values) < 3 - dtypes = ['f4', 'f8'] - expected_cols = [] - null_counts = [] - for name in dtypes: - values = np.random.randn(num_values).astype(name) - writer.write_array(name, values, 
null_mask) + int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 - values[null_mask] = np.nan + arrays = [] + null_mask = np.random.randint(0, 10, size=num_values) < 3 + expected_cols = [] + for name in int_dtypes: + values = np.random.randint(0, 100, size=num_values) + arrays.append(pa.array(values, mask=null_mask)) - expected_cols.append(values) - null_counts.append(null_mask.sum()) + expected = values.astype('f8') + expected[null_mask] = np.nan - writer.close() + expected_cols.append(expected) - ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), - columns=dtypes) + table = pa.table(arrays, names=int_dtypes) + write_feather(table, path, version=version) - result = read_feather(path) - assert_frame_equal(result, ex_frame) - assert_array_equal(self._get_null_counts(path), null_counts) + ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), + columns=int_dtypes) - def test_integer_no_nulls(self): - data = {} + result = read_feather(path) + assert_frame_equal(result, ex_frame) - numpy_dtypes = ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8'] - num_values = 100 - for dtype in numpy_dtypes: - values = np.random.randint(0, 100, size=num_values) - data[dtype] = values.astype(dtype) +def test_boolean_no_nulls(version): + num_values = 100 - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + np.random.seed(0) - def test_platform_numpy_integers(self): - data = {} + df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) + _check_pandas_roundtrip(df, version=version) - numpy_dtypes = ['longlong'] - num_values = 100 - for dtype in numpy_dtypes: - values = np.random.randint(0, 100, size=num_values) - data[dtype] = values.astype(dtype) +def test_boolean_nulls(version): + # pandas requires upcast to object dtype + path = random_path() + TEST_FILES.append(path) - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + num_values = 100 + np.random.seed(0) - def test_integer_with_nulls(self): - # pandas requires upcast to float dtype - path = random_path() - self.test_files.append(path) + mask = np.random.randint(0, 10, size=num_values) < 3 + values = np.random.randint(0, 10, size=num_values) < 5 - int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - num_values = 100 + table = pa.table([pa.array(values, mask=mask)], names=['bools']) + write_feather(table, path, version=version) - writer = FeatherWriter() - writer.open(path) + expected = values.astype(object) + expected[mask] = None - null_mask = np.random.randint(0, 10, size=num_values) < 3 - expected_cols = [] - for name in int_dtypes: - values = np.random.randint(0, 100, size=num_values) - writer.write_array(name, values, null_mask) + ex_frame = pd.DataFrame({'bools': expected}) - expected = values.astype('f8') - expected[null_mask] = np.nan + result = read_feather(path) + assert_frame_equal(result, ex_frame) - expected_cols.append(expected) - ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), - columns=int_dtypes) +def test_buffer_bounds_error(version): + # ARROW-1676 + path = random_path() + TEST_FILES.append(path) - writer.close() + for i in range(16, 256): + values = pa.array([None] + list(range(i)), type=pa.float64()) + write_feather(pa.table([values], names=['arr']), path, + version=version) result = read_feather(path) - assert_frame_equal(result, ex_frame) + expected = pd.DataFrame({'arr': values.to_pandas()}) + assert_frame_equal(result, expected) - def test_boolean_no_nulls(self): - num_values = 100 + _check_pandas_roundtrip(expected, version=version) - 
np.random.seed(0) - df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) - self._check_pandas_roundtrip(df) +def test_boolean_object_nulls(version): + repeats = 100 + arr = np.array([False, None, True] * repeats, dtype=object) + df = pd.DataFrame({'bools': arr}) + _check_pandas_roundtrip(df, version=version) - def test_boolean_nulls(self): - # pandas requires upcast to object dtype - path = random_path() - self.test_files.append(path) - num_values = 100 - np.random.seed(0) +def test_delete_partial_file_on_error(version): + if sys.platform == 'win32': + pytest.skip('Windows hangs on to file handle for some reason') - writer = FeatherWriter() - writer.open(path) + class CustomClass: + pass - mask = np.random.randint(0, 10, size=num_values) < 3 - values = np.random.randint(0, 10, size=num_values) < 5 - writer.write_array('bools', values, mask) + # strings will fail + df = pd.DataFrame( + { + 'numbers': range(5), + 'strings': [b'foo', None, 'bar', CustomClass(), np.nan]}, + columns=['numbers', 'strings']) - expected = values.astype(object) - expected[mask] = None + path = random_path() + try: + write_feather(df, path, version=version) + except Exception: + pass - writer.close() + assert not os.path.exists(path) - ex_frame = pd.DataFrame({'bools': expected}) - result = read_feather(path) - assert_frame_equal(result, ex_frame) +def test_strings(version): + repeats = 1000 - def test_buffer_bounds_error(self): - # ARROW-1676 - path = random_path() - self.test_files.append(path) + # Mixed bytes, unicode, strings coerced to binary + values = [b'foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) - for i in range(16, 256): - values = pa.array([None] + list(range(i)), type=pa.float64()) + ex_values = [b'foo', None, b'bar', b'qux', np.nan] + expected = pd.DataFrame({'strings': ex_values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - writer = FeatherWriter() - writer.open(path) + # embedded nulls are ok + values = ['foo', None, 'bar', 'qux', None] + df = pd.DataFrame({'strings': values * repeats}) + expected = pd.DataFrame({'strings': values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - writer.write_array('arr', values) - writer.close() + values = ['foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + expected = pd.DataFrame({'strings': values * repeats}) + _check_pandas_roundtrip(df, expected, version=version) - result = read_feather(path) - expected = pd.DataFrame({'arr': values.to_pandas()}) - assert_frame_equal(result, expected) - self._check_pandas_roundtrip(expected, null_counts=[1]) +def test_empty_strings(version): + df = pd.DataFrame({'strings': [''] * 10}) + _check_pandas_roundtrip(df, version=version) - def test_boolean_object_nulls(self): - repeats = 100 - arr = np.array([False, None, True] * repeats, dtype=object) - df = pd.DataFrame({'bools': arr}) - self._check_pandas_roundtrip(df, null_counts=[1 * repeats]) - def test_delete_partial_file_on_error(self): - if sys.platform == 'win32': - pytest.skip('Windows hangs on to file handle for some reason') +def test_all_none(version): + df = pd.DataFrame({'all_none': [None] * 10}) + _check_pandas_roundtrip(df, version=version) - class CustomClass: - pass - # strings will fail - df = pd.DataFrame( - { - 'numbers': range(5), - 'strings': [b'foo', None, 'bar', CustomClass(), np.nan]}, - columns=['numbers', 'strings']) +def test_all_null_category(version): + # ARROW-1188 + df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, 
None)}) + df = df.assign(B=df.B.astype("category")) + _check_pandas_roundtrip(df, version=version) - path = random_path() - try: - write_feather(df, path) - except Exception: - pass - assert not os.path.exists(path) - - def test_strings(self): - repeats = 1000 - - # Mixed bytes, unicode, strings coerced to binary - values = [b'foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - - ex_values = [b'foo', None, b'bar', b'qux', np.nan] - expected = pd.DataFrame({'strings': ex_values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - # embedded nulls are ok - values = ['foo', None, 'bar', 'qux', None] - df = pd.DataFrame({'strings': values * repeats}) - expected = pd.DataFrame({'strings': values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - values = ['foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - expected = pd.DataFrame({'strings': values * repeats}) - self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats]) - - def test_empty_strings(self): - df = pd.DataFrame({'strings': [''] * 10}) - self._check_pandas_roundtrip(df) - - def test_all_none(self): - df = pd.DataFrame({'all_none': [None] * 10}) - self._check_pandas_roundtrip(df, null_counts=[10]) - - def test_all_null_category(self): - # ARROW-1188 - df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)}) - df = df.assign(B=df.B.astype("category")) - self._check_pandas_roundtrip(df, null_counts=[0, 3]) - - def test_multithreaded_read(self): - data = {'c{}'.format(i): [''] * 10 - for i in range(100)} - df = pd.DataFrame(data) - self._check_pandas_roundtrip(df, use_threads=True) - - def test_nan_as_null(self): - # Create a nan that is not numpy.nan - values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10) - df = pd.DataFrame({'strings': values}) - self._check_pandas_roundtrip(df) - - def test_category(self): - repeats = 1000 - values = ['foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - df['strings'] = df['strings'].astype('category') - - values = ['foo', None, 'bar', 'qux', None] - expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)}) - self._check_pandas_roundtrip(df, expected, - null_counts=[2 * repeats]) - - def test_timestamp(self): - df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) - df['with_tz'] = (df.naive.dt.tz_localize('utc') - .dt.tz_convert('America/Los_Angeles')) - - self._check_pandas_roundtrip(df) - - def test_timestamp_with_nulls(self): - df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1), - None, - pd.Timestamp(2016, 1, 3)]}) - df['with_tz'] = df.test.dt.tz_localize('utc') - - self._check_pandas_roundtrip(df, null_counts=[1, 1]) - - @pytest.mark.xfail(reason="not supported ATM", - raises=NotImplementedError) - def test_timedelta_with_nulls(self): - df = pd.DataFrame({'test': [pd.Timedelta('1 day'), - None, - pd.Timedelta('3 day')]}) - - self._check_pandas_roundtrip(df, null_counts=[1, 1]) - - def test_out_of_float64_timestamp_with_nulls(self): - df = pd.DataFrame( - {'test': pd.DatetimeIndex([1451606400000000001, - None, 14516064000030405])}) - df['with_tz'] = df.test.dt.tz_localize('utc') - self._check_pandas_roundtrip(df, null_counts=[1, 1]) - - def test_non_string_columns(self): - df = pd.DataFrame({0: [1, 2, 3, 4], - 1: [True, False, True, False]}) - - expected = df.rename(columns=str) - self._check_pandas_roundtrip(df, expected) - - @pytest.mark.skipif(not 
os.path.supports_unicode_filenames, - reason='unicode filenames not supported') - def test_unicode_filename(self): - # GH #209 - name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') - df = pd.DataFrame({'foo': [1, 2, 3, 4]}) - self._check_pandas_roundtrip(df, path=random_path(prefix=name)) - - def test_read_columns(self): - data = {'foo': [1, 2, 3, 4], - 'boo': [5, 6, 7, 8], - 'woo': [1, 3, 5, 7]} - columns = list(data.keys())[1:3] - df = pd.DataFrame(data) - expected = pd.DataFrame({c: data[c] for c in columns}) - self._check_pandas_roundtrip(df, expected, columns=columns) - - def test_overwritten_file(self): - path = random_path() - self.test_files.append(path) +def test_multithreaded_read(version): + data = {'c{}'.format(i): [''] * 10 + for i in range(100)} + df = pd.DataFrame(data) + _check_pandas_roundtrip(df, use_threads=True, version=version) - num_values = 100 - np.random.seed(0) - values = np.random.randint(0, 10, size=num_values) - write_feather(pd.DataFrame({'ints': values}), path) +def test_nan_as_null(version): + # Create a nan that is not numpy.nan + values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10) + df = pd.DataFrame({'strings': values}) + _check_pandas_roundtrip(df, version=version) - df = pd.DataFrame({'ints': values[0: num_values//2]}) - self._check_pandas_roundtrip(df, path=path) - def test_filelike_objects(self): - from io import BytesIO +def test_category(version): + repeats = 1000 + values = ['foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + df['strings'] = df['strings'].astype('category') - buf = BytesIO() + values = ['foo', None, 'bar', 'qux', None] + expected = pd.DataFrame({'strings': pd.Categorical(values * repeats)}) + _check_pandas_roundtrip(df, expected, version=version) - # the copy makes it non-strided - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=['a', 'b', 'c']).copy() - write_feather(df, buf) - buf.seek(0) +def test_timestamp(version): + df = pd.DataFrame({'naive': pd.date_range('2016-03-28', periods=10)}) + df['with_tz'] = (df.naive.dt.tz_localize('utc') + .dt.tz_convert('America/Los_Angeles')) - result = read_feather(buf) - assert_frame_equal(result, df) + _check_pandas_roundtrip(df, version=version) - @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") - @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") - def test_sparse_dataframe(self): - if not pa.pandas_compat._pandas_api.has_sparse: - pytest.skip("version of pandas does not support SparseDataFrame") - # GH #221 - data = {'A': [0, 1, 2], - 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) - expected = df.to_dense() - self._check_pandas_roundtrip(df, expected) - def test_duplicate_columns(self): +def test_timestamp_with_nulls(version): + df = pd.DataFrame({'test': [pd.Timestamp(2016, 1, 1), + None, + pd.Timestamp(2016, 1, 3)]}) + df['with_tz'] = df.test.dt.tz_localize('utc') - # https://github.com/wesm/feather/issues/53 - # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() - self._assert_error_on_write(df, ValueError) + _check_pandas_roundtrip(df, version=version) - def test_unsupported(self): - # https://github.com/wesm/feather/issues/240 - # serializing actual python objects - # custom python objects - class A: - pass +@pytest.mark.xfail(reason="not supported", raises=TypeError) +def test_timedelta_with_nulls_v1(): + df = pd.DataFrame({'test': [pd.Timedelta('1 day'), + None, + pd.Timedelta('3 day')]}) + 
_check_pandas_roundtrip(df, version=1) + + +def test_timedelta_with_nulls(): + df = pd.DataFrame({'test': [pd.Timedelta('1 day'), + None, + pd.Timedelta('3 day')]}) + _check_pandas_roundtrip(df, version=2) + + +def test_out_of_float64_timestamp_with_nulls(version): + df = pd.DataFrame( + {'test': pd.DatetimeIndex([1451606400000000001, + None, 14516064000030405])}) + df['with_tz'] = df.test.dt.tz_localize('utc') + _check_pandas_roundtrip(df, version=version) + + +def test_non_string_columns(version): + df = pd.DataFrame({0: [1, 2, 3, 4], + 1: [True, False, True, False]}) + + expected = df.rename(columns=str) + _check_pandas_roundtrip(df, expected, version=version) + + +@pytest.mark.skipif(not os.path.supports_unicode_filenames, + reason='unicode filenames not supported') +def test_unicode_filename(version): + # GH #209 + name = (b'Besa_Kavaj\xc3\xab.feather').decode('utf-8') + df = pd.DataFrame({'foo': [1, 2, 3, 4]}) + _check_pandas_roundtrip(df, path=random_path(prefix=name), + version=version) + + +def test_read_columns(version): + data = {'foo': [1, 2, 3, 4], + 'boo': [5, 6, 7, 8], + 'woo': [1, 3, 5, 7]} + columns = list(data.keys())[1:3] + df = pd.DataFrame(data) + expected = pd.DataFrame({c: data[c] for c in columns}) + _check_pandas_roundtrip(df, expected, version=version, columns=columns) - df = pd.DataFrame({'a': [A(), A()]}) - self._assert_error_on_write(df, ValueError) - # non-strings - df = pd.DataFrame({'a': ['a', 1, 2.0]}) - self._assert_error_on_write(df, TypeError) +def test_overwritten_file(version): + path = random_path() + TEST_FILES.append(path) - @pytest.mark.slow - def test_large_dataframe(self): - df = pd.DataFrame({'A': np.arange(400000000)}) - self._check_pandas_roundtrip(df) + num_values = 100 + np.random.seed(0) + + values = np.random.randint(0, 10, size=num_values) + write_feather(pd.DataFrame({'ints': values}), path, version=version) + + df = pd.DataFrame({'ints': values[0: num_values//2]}) + _check_pandas_roundtrip(df, path=path, version=version) + + +def test_filelike_objects(version): + from io import BytesIO + + buf = BytesIO() + + # the copy makes it non-strided + df = pd.DataFrame(np.arange(12).reshape(4, 3), + columns=['a', 'b', 'c']).copy() + write_feather(df, buf, version=version) + + buf.seek(0) + + result = read_feather(buf) + assert_frame_equal(result, df) + + +@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") +@pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") +def test_sparse_dataframe(version): + if not pa.pandas_compat._pandas_api.has_sparse: + pytest.skip("version of pandas does not support SparseDataFrame") + # GH #221 + data = {'A': [0, 1, 2], + 'B': [1, 0, 1]} + df = pd.DataFrame(data).to_sparse(fill_value=1) + expected = df.to_dense() + _check_pandas_roundtrip(df, expected, version=version) + + +def test_duplicate_columns(): + + # https://github.com/wesm/feather/issues/53 + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), + columns=list('aaa')).copy() + _assert_error_on_write(df, ValueError) + + +def test_unsupported(): + # https://github.com/wesm/feather/issues/240 + # serializing actual python objects + + # custom python objects + class A: + pass + + df = pd.DataFrame({'a': [A(), A()]}) + _assert_error_on_write(df, ValueError) + + # non-strings + df = pd.DataFrame({'a': ['a', 1, 2.0]}) + _assert_error_on_write(df, TypeError) + + +@pytest.mark.slow +def test_large_dataframe(version): + df = pd.DataFrame({'A': np.arange(400000000)}) + _check_pandas_roundtrip(df, 
version=version) @pytest.mark.large_memory @@ -568,7 +543,13 @@ def test_chunked_binary_error_message(): ] * 2 * (1 << 10) df = pd.DataFrame({'byte_col': values}) + # Works fine with version 2 + buf = io.BytesIO() + write_feather(df, buf, version=2) + result = read_feather(pa.input_file(buf.getvalue())) + assert_frame_equal(result, df) + with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " "capacity of a Feather binary column. This restriction " "may be lifted in the future"): - write_feather(df, io.BytesIO()) + write_feather(df, io.BytesIO(), version=1) diff --git a/r/NAMESPACE b/r/NAMESPACE index 6964b91fa8c..b6320fc99fd 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -88,8 +88,7 @@ export(DictionaryArray) export(DirectoryPartitioning) export(DirectoryPartitioningFactory) export(Expression) -export(FeatherTableReader) -export(FeatherTableWriter) +export(FeatherReader) export(Field) export(FieldExpression) export(FileFormat) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 695a025602b..f38e73b17a8 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -708,68 +708,24 @@ dataset___expr__ToString <- function(x){ .Call(`_arrow_dataset___expr__ToString` , x) } -ipc___feather___TableWriter__SetDescription <- function(writer, description){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__SetDescription` , writer, description)) +ipc___WriteFeather__RecordBatch <- function(stream, batch){ + invisible(.Call(`_arrow_ipc___WriteFeather__RecordBatch` , stream, batch)) } -ipc___feather___TableWriter__SetNumRows <- function(writer, num_rows){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__SetNumRows` , writer, num_rows)) +ipc___feather___Reader__version <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__version` , reader) } -ipc___feather___TableWriter__Append <- function(writer, name, values){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__Append` , writer, name, values)) +ipc___feather___Reader__Read <- function(reader, columns){ + .Call(`_arrow_ipc___feather___Reader__Read` , reader, columns) } -ipc___feather___TableWriter__Finalize <- function(writer){ - invisible(.Call(`_arrow_ipc___feather___TableWriter__Finalize` , writer)) +ipc___feather___Reader__Open <- function(stream){ + .Call(`_arrow_ipc___feather___Reader__Open` , stream) } -ipc___feather___TableWriter__Open <- function(stream){ - .Call(`_arrow_ipc___feather___TableWriter__Open` , stream) -} - -ipc___TableWriter__RecordBatch__WriteFeather <- function(writer, batch){ - invisible(.Call(`_arrow_ipc___TableWriter__RecordBatch__WriteFeather` , writer, batch)) -} - -ipc___feather___TableReader__GetDescription <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__GetDescription` , reader) -} - -ipc___feather___TableReader__HasDescription <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__HasDescription` , reader) -} - -ipc___feather___TableReader__version <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__version` , reader) -} - -ipc___feather___TableReader__num_rows <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__num_rows` , reader) -} - -ipc___feather___TableReader__num_columns <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__num_columns` , reader) -} - -ipc___feather___TableReader__GetColumnName <- function(reader, i){ - .Call(`_arrow_ipc___feather___TableReader__GetColumnName` , reader, i) -} - -ipc___feather___TableReader__GetColumn <- function(reader, i){ - 
.Call(`_arrow_ipc___feather___TableReader__GetColumn` , reader, i) -} - -ipc___feather___TableReader__Read <- function(reader, columns){ - .Call(`_arrow_ipc___feather___TableReader__Read` , reader, columns) -} - -ipc___feather___TableReader__Open <- function(stream){ - .Call(`_arrow_ipc___feather___TableReader__Open` , stream) -} - -ipc___feather___TableReader__column_names <- function(reader){ - .Call(`_arrow_ipc___feather___TableReader__column_names` , reader) +ipc___feather___Reader__column_names <- function(reader){ + .Call(`_arrow_ipc___feather___Reader__column_names` , reader) } Field__initialize <- function(name, field, nullable){ diff --git a/r/R/feather.R b/r/R/feather.R index 86ef83cdb6e..2af912e643d 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -44,59 +44,15 @@ write_feather <- function(x, sink) { } assert_is(sink, "OutputStream") - writer <- FeatherTableWriter$create(sink) - ipc___TableWriter__RecordBatch__WriteFeather(writer, x) + ipc___WriteFeather__RecordBatch(sink, x) invisible(x_out) } -#' @title FeatherTableWriter class -#' @rdname FeatherTableWriter -#' @name FeatherTableWriter -#' @docType class -#' @usage NULL -#' @format NULL -#' @description This class enables you to write Feather files. See its usage in -#' [write_feather()]. -#' -#' @section Factory: -#' -#' The `FeatherTableWriter$create()` factory method instantiates the object and -#' takes the following argument: -#' -#' - `stream` An `OutputStream` -#' -#' @section Methods: -#' -#' - `$GetDescription()` -#' - `$HasDescription()` -#' - `$version()` -#' - `$num_rows()` -#' - `$num_columns()` -#' - `$GetColumnName()` -#' - `$GetColumn()` -#' - `$Read(columns)` -#' -#' @export -#' @include arrow-package.R -FeatherTableWriter <- R6Class("FeatherTableWriter", inherit = ArrowObject, - public = list( - SetDescription = function(description) ipc___feather___TableWriter__SetDescription(self, description), - SetNumRows = function(num_rows) ipc___feather___TableWriter__SetNumRows(self, num_rows), - Append = function(name, values) ipc___feather___TableWriter__Append(self, name, values), - Finalize = function() ipc___feather___TableWriter__Finalize(self) - ) -) - -FeatherTableWriter$create <- function(stream) { - assert_is(stream, "OutputStream") - unique_ptr(FeatherTableWriter, ipc___feather___TableWriter__Open(stream)) -} - #' Read a Feather file #' #' @param file A character file path, a raw vector, or `InputStream`, passed to -#' `FeatherTableReader$create()`. +#' `FeatherReader$create()`. #' @inheritParams read_delim_arrow #' @param ... additional parameters #' @@ -115,9 +71,9 @@ FeatherTableWriter$create <- function(stream) { #' df <- read_feather(tf, col_select = starts_with("Sepal")) #' } read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { - reader <- FeatherTableReader$create(file, ...) + reader <- FeatherReader$create(file, ...) - all_columns <- ipc___feather___TableReader__column_names(reader) + all_columns <- ipc___feather___Reader__column_names(reader) col_select <- enquo(col_select) columns <- if (!quo_is_null(col_select)) { vars_select(all_columns, !!col_select) @@ -130,9 +86,9 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) 
{ out } -#' @title FeatherTableReader class -#' @rdname FeatherTableReader -#' @name FeatherTableReader +#' @title FeatherReader class +#' @rdname FeatherReader +#' @name FeatherReader #' @docType class #' @usage NULL #' @format NULL @@ -142,7 +98,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { #' #' @section Factory: #' -#' The `FeatherTableReader$create()` factory method instantiates the object and +#' The `FeatherReader$create()` factory method instantiates the object and #' takes the following arguments: #' #' - `file` A character file name, raw vector, or Arrow file connection object @@ -152,33 +108,21 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { #' #' @section Methods: #' -#' - `$GetDescription()` -#' - `$HasDescription()` #' - `$version()` -#' - `$num_rows()` -#' - `$num_columns()` -#' - `$GetColumnName()` -#' - `$GetColumn()` #' - `$Read(columns)` #' #' @export #' @include arrow-package.R -FeatherTableReader <- R6Class("FeatherTableReader", inherit = ArrowObject, +FeatherReader <- R6Class("FeatherReader", inherit = ArrowObject, public = list( - GetDescription = function() ipc___feather___TableReader__GetDescription(self), - HasDescription = function() ipc__feather___TableReader__HasDescription(self), - version = function() ipc___feather___TableReader__version(self), - num_rows = function() ipc___feather___TableReader__num_rows(self), - num_columns = function() ipc___feather___TableReader__num_columns(self), - GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i), - GetColumn = function(i) shared_ptr(Array, ipc___feather___TableReader__GetColumn(self, i)), + version = function() ipc___feather___Reader__version(self), Read = function(columns) { - shared_ptr(Table, ipc___feather___TableReader__Read(self, columns)) + shared_ptr(Table, ipc___feather___Reader__Read(self, columns)) } ) ) -FeatherTableReader$create <- function(file, mmap = TRUE, ...) { +FeatherReader$create <- function(file, mmap = TRUE, ...) { file <- make_readable_file(file, mmap) - unique_ptr(FeatherTableReader, ipc___feather___TableReader__Open(file)) -} + shared_ptr(FeatherReader, ipc___feather___Reader__Open(file)) +} \ No newline at end of file diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml index 165892a8d7d..e51487d24b1 100644 --- a/r/_pkgdown.yml +++ b/r/_pkgdown.yml @@ -83,8 +83,7 @@ reference: - ParquetReaderProperties - ParquetFileWriter - ParquetWriterProperties - - FeatherTableReader - - FeatherTableWriter + - FeatherReader - CsvTableReader - RecordBatchReader - RecordBatchWriter diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 161f0934a02..e53414d3a3c 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2739,254 +2739,79 @@ RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){ // feather.cpp #if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__SetDescription(const std::unique_ptr& writer, const std::string& description); -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetDescription(SEXP writer_sexp, SEXP description_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type description(description_sexp); - ipc___feather___TableWriter__SetDescription(writer, description); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetDescription(SEXP writer_sexp, SEXP description_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__SetDescription(). 
Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__SetNumRows(const std::unique_ptr& writer, int64_t num_rows); -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetNumRows(SEXP writer_sexp, SEXP num_rows_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type num_rows(num_rows_sexp); - ipc___feather___TableWriter__SetNumRows(writer, num_rows); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__SetNumRows(SEXP writer_sexp, SEXP num_rows_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__SetNumRows(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__Append(const std::unique_ptr& writer, const std::string& name, const std::shared_ptr& values); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Append(SEXP writer_sexp, SEXP name_sexp, SEXP values_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - Rcpp::traits::input_parameter::type name(name_sexp); - Rcpp::traits::input_parameter&>::type values(values_sexp); - ipc___feather___TableWriter__Append(writer, name, values); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Append(SEXP writer_sexp, SEXP name_sexp, SEXP values_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Append(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___feather___TableWriter__Finalize(const std::unique_ptr& writer); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Finalize(SEXP writer_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); - ipc___feather___TableWriter__Finalize(writer); - return R_NilValue; -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Finalize(SEXP writer_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Finalize(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::unique_ptr ipc___feather___TableWriter__Open(const std::shared_ptr& stream); -RcppExport SEXP _arrow_ipc___feather___TableWriter__Open(SEXP stream_sexp){ +void ipc___WriteFeather__RecordBatch(const std::shared_ptr& stream, const std::shared_ptr& batch); +RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type stream(stream_sexp); - return Rcpp::wrap(ipc___feather___TableWriter__Open(stream)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableWriter__Open(SEXP stream_sexp){ - Rf_error("Cannot call ipc___feather___TableWriter__Open(). Please use arrow::install_arrow() to install required runtime libraries. 
"); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -void ipc___TableWriter__RecordBatch__WriteFeather(const std::unique_ptr& writer, const std::shared_ptr& batch); -RcppExport SEXP _arrow_ipc___TableWriter__RecordBatch__WriteFeather(SEXP writer_sexp, SEXP batch_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type writer(writer_sexp); Rcpp::traits::input_parameter&>::type batch(batch_sexp); - ipc___TableWriter__RecordBatch__WriteFeather(writer, batch); + ipc___WriteFeather__RecordBatch(stream, batch); return R_NilValue; END_RCPP } #else -RcppExport SEXP _arrow_ipc___TableWriter__RecordBatch__WriteFeather(SEXP writer_sexp, SEXP batch_sexp){ - Rf_error("Cannot call ipc___TableWriter__RecordBatch__WriteFeather(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp){ + Rf_error("Cannot call ipc___WriteFeather__RecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::string ipc___feather___TableReader__GetDescription(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetDescription(SEXP reader_sexp){ +int ipc___feather___Reader__version(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetDescription(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetDescription(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetDescription(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -bool ipc___feather___TableReader__HasDescription(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__HasDescription(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__HasDescription(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__HasDescription(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__HasDescription(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -int ipc___feather___TableReader__version(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__version(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__version(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__version(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__version(). Please use arrow::install_arrow() to install required runtime libraries. 
"); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -int64_t ipc___feather___TableReader__num_rows(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__num_rows(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__num_rows(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__num_rows(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__num_rows(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -int64_t ipc___feather___TableReader__num_columns(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__num_columns(SEXP reader_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__num_columns(reader)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__num_columns(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__num_columns(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::string ipc___feather___TableReader__GetColumnName(const std::unique_ptr& reader, int i); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumnName(SEXP reader_sexp, SEXP i_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - Rcpp::traits::input_parameter::type i(i_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetColumnName(reader, i)); -END_RCPP -} -#else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumnName(SEXP reader_sexp, SEXP i_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetColumnName(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// feather.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ipc___feather___TableReader__GetColumn(const std::unique_ptr& reader, int i); -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumn(SEXP reader_sexp, SEXP i_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - Rcpp::traits::input_parameter::type i(i_sexp); - return Rcpp::wrap(ipc___feather___TableReader__GetColumn(reader, i)); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); + return Rcpp::wrap(ipc___feather___Reader__version(reader)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumn(SEXP reader_sexp, SEXP i_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__GetColumn(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__version(SEXP reader_sexp){ + Rf_error("Cannot call ipc___feather___Reader__version(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr ipc___feather___TableReader__Read(const std::unique_ptr& reader, SEXP columns); -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP reader_sexp, SEXP columns_sexp){ +std::shared_ptr ipc___feather___Reader__Read(const std::shared_ptr& reader, SEXP columns); +RcppExport SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); Rcpp::traits::input_parameter::type columns(columns_sexp); - return Rcpp::wrap(ipc___feather___TableReader__Read(reader, columns)); + return Rcpp::wrap(ipc___feather___Reader__Read(reader, columns)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__Read(SEXP reader_sexp, SEXP columns_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__Read(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__Read(SEXP reader_sexp, SEXP columns_sexp){ + Rf_error("Cannot call ipc___feather___Reader__Read(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::unique_ptr ipc___feather___TableReader__Open(const std::shared_ptr& stream); -RcppExport SEXP _arrow_ipc___feather___TableReader__Open(SEXP stream_sexp){ +std::shared_ptr ipc___feather___Reader__Open(const std::shared_ptr& stream); +RcppExport SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter&>::type stream(stream_sexp); - return Rcpp::wrap(ipc___feather___TableReader__Open(stream)); + return Rcpp::wrap(ipc___feather___Reader__Open(stream)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__Open(SEXP stream_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__Open(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__Open(SEXP stream_sexp){ + Rf_error("Cannot call ipc___feather___Reader__Open(). Please use arrow::install_arrow() to install required runtime libraries. "); } #endif // feather.cpp #if defined(ARROW_R_WITH_ARROW) -Rcpp::CharacterVector ipc___feather___TableReader__column_names(const std::unique_ptr& reader); -RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){ +Rcpp::CharacterVector ipc___feather___Reader__column_names(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___feather___Reader__column_names(SEXP reader_sexp){ BEGIN_RCPP - Rcpp::traits::input_parameter&>::type reader(reader_sexp); - return Rcpp::wrap(ipc___feather___TableReader__column_names(reader)); + Rcpp::traits::input_parameter&>::type reader(reader_sexp); + return Rcpp::wrap(ipc___feather___Reader__column_names(reader)); END_RCPP } #else -RcppExport SEXP _arrow_ipc___feather___TableReader__column_names(SEXP reader_sexp){ - Rf_error("Cannot call ipc___feather___TableReader__column_names(). Please use arrow::install_arrow() to install required runtime libraries. "); +RcppExport SEXP _arrow_ipc___feather___Reader__column_names(SEXP reader_sexp){ + Rf_error("Cannot call ipc___feather___Reader__column_names(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -6104,22 +5929,11 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___expr__is_valid", (DL_FUNC) &_arrow_dataset___expr__is_valid, 1}, { "_arrow_dataset___expr__scalar", (DL_FUNC) &_arrow_dataset___expr__scalar, 1}, { "_arrow_dataset___expr__ToString", (DL_FUNC) &_arrow_dataset___expr__ToString, 1}, - { "_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2}, - { "_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2}, - { "_arrow_ipc___feather___TableWriter__Append", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Append, 3}, - { "_arrow_ipc___feather___TableWriter__Finalize", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Finalize, 1}, - { "_arrow_ipc___feather___TableWriter__Open", (DL_FUNC) &_arrow_ipc___feather___TableWriter__Open, 1}, - { "_arrow_ipc___TableWriter__RecordBatch__WriteFeather", (DL_FUNC) &_arrow_ipc___TableWriter__RecordBatch__WriteFeather, 2}, - { "_arrow_ipc___feather___TableReader__GetDescription", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetDescription, 1}, - { "_arrow_ipc___feather___TableReader__HasDescription", (DL_FUNC) &_arrow_ipc___feather___TableReader__HasDescription, 1}, - { "_arrow_ipc___feather___TableReader__version", (DL_FUNC) &_arrow_ipc___feather___TableReader__version, 1}, - { "_arrow_ipc___feather___TableReader__num_rows", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_rows, 1}, - { "_arrow_ipc___feather___TableReader__num_columns", (DL_FUNC) &_arrow_ipc___feather___TableReader__num_columns, 1}, - { "_arrow_ipc___feather___TableReader__GetColumnName", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumnName, 2}, - { "_arrow_ipc___feather___TableReader__GetColumn", (DL_FUNC) &_arrow_ipc___feather___TableReader__GetColumn, 2}, - { "_arrow_ipc___feather___TableReader__Read", (DL_FUNC) &_arrow_ipc___feather___TableReader__Read, 2}, - { "_arrow_ipc___feather___TableReader__Open", (DL_FUNC) &_arrow_ipc___feather___TableReader__Open, 1}, - { "_arrow_ipc___feather___TableReader__column_names", (DL_FUNC) &_arrow_ipc___feather___TableReader__column_names, 1}, + { "_arrow_ipc___WriteFeather__RecordBatch", (DL_FUNC) &_arrow_ipc___WriteFeather__RecordBatch, 2}, + { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, + { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, + { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, + { "_arrow_ipc___feather___Reader__column_names", (DL_FUNC) &_arrow_ipc___feather___Reader__column_names, 1}, { "_arrow_Field__initialize", (DL_FUNC) &_arrow_Field__initialize, 3}, { "_arrow_Field__ToString", (DL_FUNC) &_arrow_Field__ToString, 1}, { "_arrow_Field__name", (DL_FUNC) &_arrow_Field__name, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 7bdfeab72b2..c6965035ee4 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -19,103 +19,29 @@ #if defined(ARROW_R_WITH_ARROW) -// ---------- TableWriter +// ---------- WriteFeather // [[arrow::export]] -void ipc___feather___TableWriter__SetDescription( - const std::unique_ptr& writer, - const std::string& description) { - writer->SetDescription(description); -} - -// [[arrow::export]] -void ipc___feather___TableWriter__SetNumRows( - const std::unique_ptr& writer, int64_t num_rows) { - writer->SetNumRows(num_rows); -} - -// [[arrow::export]] -void ipc___feather___TableWriter__Append( - const 
std::unique_ptr<arrow::ipc::feather::TableWriter>& writer,
-    const std::string& name, const std::shared_ptr<arrow::Array>& values) {
-  STOP_IF_NOT_OK(writer->Append(name, *values));
-}
-
-// [[arrow::export]]
-void ipc___feather___TableWriter__Finalize(
-    const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer) {
-  STOP_IF_NOT_OK(writer->Finalize());
-}
-
-// [[arrow::export]]
-std::unique_ptr<arrow::ipc::feather::TableWriter> ipc___feather___TableWriter__Open(
-    const std::shared_ptr<arrow::io::OutputStream>& stream) {
-  std::unique_ptr<arrow::ipc::feather::TableWriter> writer;
-  STOP_IF_NOT_OK(arrow::ipc::feather::TableWriter::Open(stream, &writer));
-  return writer;
-}
-
-// [[arrow::export]]
-void ipc___TableWriter__RecordBatch__WriteFeather(
-    const std::unique_ptr<arrow::ipc::feather::TableWriter>& writer,
+void ipc___WriteFeather__RecordBatch(
+    const std::shared_ptr<arrow::io::OutputStream>& stream,
     const std::shared_ptr<arrow::RecordBatch>& batch) {
-  writer->SetNumRows(batch->num_rows());
-
-  for (int i = 0; i < batch->num_columns(); i++) {
-    STOP_IF_NOT_OK(writer->Append(batch->column_name(i), *batch->column(i)));
-  }
-  STOP_IF_NOT_OK(writer->Finalize());
-}
-
-// ----------- TableReader
-
-// [[arrow::export]]
-std::string ipc___feather___TableReader__GetDescription(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
-  return reader->GetDescription();
+  auto properties = arrow::ipc::feather::WriteProperties::Defaults();
+  std::shared_ptr<arrow::Table> table;
+  STOP_IF_NOT_OK(arrow::Table::FromRecordBatches({batch}, &table));
+  STOP_IF_NOT_OK(arrow::ipc::feather::WriteTable(*table, stream.get(), properties));
 }

-// [[arrow::export]]
-bool ipc___feather___TableReader__HasDescription(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
-  return reader->HasDescription();
-}
+// ----------- Reader

 // [[arrow::export]]
-int ipc___feather___TableReader__version(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
+int ipc___feather___Reader__version(
+    const std::shared_ptr<arrow::ipc::feather::Reader>& reader) {
   return reader->version();
 }

 // [[arrow::export]]
-int64_t ipc___feather___TableReader__num_rows(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
-  return reader->num_rows();
-}
-
-// [[arrow::export]]
-int64_t ipc___feather___TableReader__num_columns(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
-  return reader->num_columns();
-}
-
-// [[arrow::export]]
-std::string ipc___feather___TableReader__GetColumnName(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, int i) {
-  return reader->GetColumnName(i);
-}
-
-// [[arrow::export]]
-std::shared_ptr<arrow::ChunkedArray> ipc___feather___TableReader__GetColumn(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, int i) {
-  std::shared_ptr<arrow::ChunkedArray> column;
-  STOP_IF_NOT_OK(reader->GetColumn(i, &column));
-  return column;
-}
-
-// [[arrow::export]]
-std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, SEXP columns) {
+std::shared_ptr<arrow::Table> ipc___feather___Reader__Read(
+    const std::shared_ptr<arrow::ipc::feather::Reader>& reader, SEXP columns) {
   std::shared_ptr<arrow::Table> table;

   switch (TYPEOF(columns)) {
@@ -140,20 +66,19 @@ std::shared_ptr<arrow::Table> ipc___feather___TableReader__Read(
 }

 // [[arrow::export]]
-std::unique_ptr<arrow::ipc::feather::TableReader> ipc___feather___TableReader__Open(
+std::shared_ptr<arrow::ipc::feather::Reader> ipc___feather___Reader__Open(
     const std::shared_ptr<arrow::io::RandomAccessFile>& stream) {
-  std::unique_ptr<arrow::ipc::feather::TableReader> reader;
-  STOP_IF_NOT_OK(arrow::ipc::feather::TableReader::Open(stream, &reader));
-  return reader;
+  return VALUE_OR_STOP(arrow::ipc::feather::Reader::Open(stream));
 }

 // [[arrow::export]]
-Rcpp::CharacterVector ipc___feather___TableReader__column_names(
-    const std::unique_ptr<arrow::ipc::feather::TableReader>& reader) {
-  int64_t n = reader->num_columns();
+Rcpp::CharacterVector ipc___feather___Reader__column_names(
+    const std::shared_ptr<arrow::ipc::feather::Reader>& reader) {
+  auto sch = reader->schema();
+  int64_t n = sch->num_fields();
   Rcpp::CharacterVector out(n);
-  for (int64_t i = 0; i < n; i++) {
-    out[i] = reader->GetColumnName(i);
+  for (int i = 0; i < n; i++) {
+    out[i] = 
sch->field(i)->name(); } return out; } From bc0b514f2e65ad89d5afbcde191e99f935b0a622 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 23 Mar 2020 19:45:29 -0500 Subject: [PATCH 02/15] Add lz4 support --- python/pyarrow/feather.pxi | 2 ++ python/pyarrow/feather.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 3302f7c5fd5..7b637370f50 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -37,6 +37,8 @@ def write_feather(Table table, object dest, compression=None, if compression == 'zstd': properties.compression = CCompressionType_ZSTD + elif compression == 'lz4': + properties.compression = CCompressionType_LZ4 else: properties.compression = CCompressionType_UNCOMPRESSED diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index d5e961b9846..bd8eecca708 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -124,7 +124,7 @@ def write_feather(df, dest, compression=None, compression_level=None, dest : str Local destination path. compression : string, default None - Either "zstd" or None (for now) + {"zstd", "lz4", None} compression_level : int, default None Use a compression level particular to the chosen compressor. If None use the default compression level From c1a352d65bd1a3c1eb892b0b12e339fd9eb125d3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 26 Mar 2020 16:04:46 -0500 Subject: [PATCH 03/15] Expose configurable chunksize --- python/pyarrow/feather.pxi | 6 ++++-- python/pyarrow/feather.py | 12 ++++++++---- python/pyarrow/tests/test_feather.py | 19 +++++++++++++++---- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 7b637370f50..6c1fae31740 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -24,8 +24,7 @@ class FeatherError(Exception): def write_feather(Table table, object dest, compression=None, - compression_level=None, - version=2): + compression_level=None, chunksize=None, version=2): cdef shared_ptr[COutputStream] sink get_writer(dest, &sink) @@ -42,6 +41,9 @@ def write_feather(Table table, object dest, compression=None, else: properties.compression = CCompressionType_UNCOMPRESSED + if chunksize is not None: + properties.chunksize = chunksize + if compression_level is not None: properties.compression_level = compression_level diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index bd8eecca708..c8ff237510d 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -113,14 +113,14 @@ def check_chunked_overflow(name, col): def write_feather(df, dest, compression=None, compression_level=None, - version=2): + chunksize=None, version=2): """ Write a pandas.DataFrame to Feather format. Parameters ---------- - df : pandas.DataFrame - Dataframe to write out as feather format. + df : pandas.DataFrame or pyarrow.Table + Data to write out as Feather format. dest : str Local destination path. compression : string, default None @@ -128,6 +128,9 @@ def write_feather(df, dest, compression=None, compression_level=None, compression_level : int, default None Use a compression level particular to the chosen compressor. If None use the default compression level + chunksize : int, default None + For V2 files, the size of chunks to split the data into. None means use + the default, which is currently 64K version : int, default 2 Feather file version. Version 2 is the current. 
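To make the new keywords concrete, here is a minimal usage sketch of the
signature this docstring describes; the data and buffer are illustrative,
and the LZ4 path assumes a pyarrow build with LZ4 enabled:

    import io

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({'ints': list(range(1000))})
    buf = io.BytesIO()
    # V2 file, LZ4-compressed, written as 250-row chunks
    feather.write_feather(df, buf, compression='lz4', chunksize=250, version=2)
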
Version 1 is the more limited legacy format @@ -153,7 +156,8 @@ def write_feather(df, dest, compression=None, compression_level=None, try: ext.write_feather(table, dest, compression=compression, - compression_level=compression_level, version=version) + compression_level=compression_level, + chunksize=chunksize, version=version) except Exception: if isinstance(dest, str): try: diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index ff3d0d0ff4d..ff35be5d748 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -473,9 +473,7 @@ def test_overwritten_file(version): def test_filelike_objects(version): - from io import BytesIO - - buf = BytesIO() + buf = io.BytesIO() # the copy makes it non-strided df = pd.DataFrame(np.arange(12).reshape(4, 3), @@ -526,6 +524,19 @@ class A: _assert_error_on_write(df, TypeError) +def test_v2_set_chunksize(): + df = pd.DataFrame({'A': np.arange(1000)}) + + buf = io.BytesIO() + write_feather(df, buf, chunksize=250, version=2) + + result = buf.getvalue() + + ipc_file = pa.ipc.open_file(pa.BufferReader(result)) + assert ipc_file.num_record_batches == 4 + assert len(ipc_file.get_batch(0)) == 250 + + @pytest.mark.slow def test_large_dataframe(version): df = pd.DataFrame({'A': np.arange(400000000)}) @@ -546,7 +557,7 @@ def test_chunked_binary_error_message(): # Works fine with version 2 buf = io.BytesIO() write_feather(df, buf, version=2) - result = read_feather(pa.input_file(buf.getvalue())) + result = read_feather(pa.BufferReader(buf.getvalue())) assert_frame_equal(result, df) with pytest.raises(ValueError, match="'byte_col' exceeds 2GB maximum " From 21c16d2070397aea6b7cc2320e69b421d6df3d68 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 27 Mar 2020 16:26:44 -0500 Subject: [PATCH 04/15] Set default Feather compression to LZ4 unless it is unavailable --- cpp/src/arrow/ipc/feather.cc | 10 ++++++++++ cpp/src/arrow/ipc/feather.h | 14 ++++++++++---- cpp/src/arrow/ipc/feather_test.cc | 21 ++++++++++++++++++++- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index ffa360454be..5f3e4af94d1 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -794,6 +794,16 @@ Result> Reader::Open( } } +WriteProperties WriteProperties::Defaults() { + WriteProperties result; +#ifdef ARROW_WITH_LZ4 + result.compression = Compression::LZ4; +#else + result.compression = Compression::UNCOMPRESSED; +#endif + return result; +} + Status WriteTable(const Table& table, io::OutputStream* dst, const WriteProperties& properties) { if (properties.version == kFeatherV1Version) { diff --git a/cpp/src/arrow/ipc/feather.h b/cpp/src/arrow/ipc/feather.h index dfb4095aef3..991f55f5bb2 100644 --- a/cpp/src/arrow/ipc/feather.h +++ b/cpp/src/arrow/ipc/feather.h @@ -98,10 +98,10 @@ class ARROW_EXPORT Reader { }; struct ARROW_EXPORT WriteProperties { - static WriteProperties Defaults() { return WriteProperties(); } + static WriteProperties Defaults(); static WriteProperties DefaultsV1() { - WriteProperties props; + WriteProperties props = Defaults(); props.version = kFeatherV1Version; return props; } @@ -118,8 +118,14 @@ struct ARROW_EXPORT WriteProperties { /// faster random row access int64_t chunksize = 1LL << 16; - /// Compression type to use - Compression::type compression = Compression::ZSTD; + /// Compression type to use. Only UNCOMPRESSED, LZ4, and ZSTD are + /// supported. 
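A caller that wants the same graceful degradation as the C++ default can
probe codec availability before picking one of these; a sketch, assuming
pyarrow exposes Codec.is_available() as recent versions do:

    import pandas as pd
    import pyarrow as pa
    from pyarrow import feather

    df = pd.DataFrame({'ints': [1, 2, 3]})
    # Prefer LZ4, as WriteProperties::Defaults() does; else write uncompressed
    compression = 'lz4' if pa.Codec.is_available('lz4') else None
    feather.write_feather(df, 'data.feather', compression=compression)
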
The default compression returned by Defaults() is LZ4 if the + /// project is built with support for it, otherwise + /// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if + /// WriteProperties::Defaults() is not used, the default constructor for + /// WriteProperties will work regardless of the options used to build the C++ + /// project. + Compression::type compression = Compression::UNCOMPRESSED; /// Compressor-specific compression level int compression_level = Compression::kUseDefaultCompressionLevel; diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 47412ee865a..6b7f82051b6 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/compression.h" namespace arrow { @@ -59,9 +60,16 @@ class TestFeather : public ::testing::TestWithParam { WriteProperties GetProperties() { auto param = GetParam(); + auto props = WriteProperties::Defaults(); props.version = param.version; - props.compression = param.compression; + + // Don't fail if the build doesn't have LZ4 or ZSTD enabled + if (util::Codec::IsAvailable(param.compression)) { + props.compression = param.compression; + } else { + props.compression = Compression::UNCOMPRESSED; + } return props; } @@ -117,6 +125,16 @@ class TestFeather : public ::testing::TestWithParam { std::shared_ptr output_; }; +TEST(TestFeatherWriteProperties, Defaults) { + auto props = WriteProperties::Defaults(); + +#ifdef ARROW_WITH_LZ4 + ASSERT_EQ(Compression::LZ4, props.compression); +#else + ASSERT_EQ(Compression::UNCOMPRESSED, props.compression); +#endif +} + TEST_P(TestFeather, ReadIndicesOrNames) { std::shared_ptr batch1; ASSERT_OK(ipc::test::MakeIntRecordBatch(&batch1)); @@ -278,6 +296,7 @@ TEST_P(TestFeather, SliceBooleanRoundTrip) { INSTANTIATE_TEST_SUITE_P( FeatherTests, TestFeather, ::testing::Values(TestParam(kFeatherV1Version), TestParam(kFeatherV2Version), + TestParam(kFeatherV2Version, Compression::LZ4), TestParam(kFeatherV2Version, Compression::ZSTD))); } // namespace feather From 7854e2cc7c8bf5a7f4710ad05affeebcd746089d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 27 Mar 2020 17:19:53 -0500 Subject: [PATCH 05/15] Follow Feather changes in GLib and Ruby bindings --- c_glib/arrow-glib/reader.cpp | 178 +++---------------- c_glib/arrow-glib/reader.h | 16 -- c_glib/arrow-glib/reader.hpp | 9 +- c_glib/arrow-glib/writer.cpp | 225 ++---------------------- c_glib/arrow-glib/writer.h | 30 +--- c_glib/arrow-glib/writer.hpp | 3 - c_glib/test/test-feather-file-reader.rb | 118 +------------ c_glib/test/test-feather-file-writer.rb | 60 +------ ruby/red-arrow/lib/arrow/table-saver.rb | 4 +- 9 files changed, 51 insertions(+), 592 deletions(-) diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 3190d240026..513d1a7ea29 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -521,12 +521,12 @@ garrow_record_batch_file_reader_read_record_batch(GArrowRecordBatchFileReader *r typedef struct GArrowFeatherFileReaderPrivate_ { - arrow::ipc::feather::TableReader *feather_table_reader; + std::shared_ptr feather_reader; } GArrowFeatherFileReaderPrivate; enum { PROP_0__, - PROP_FEATHER_TABLE_READER + PROP_FEATHER_READER }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherFileReader, @@ -543,7 +543,7 @@ garrow_feather_file_reader_finalize(GObject *object) { auto priv = 
GARROW_FEATHER_FILE_READER_GET_PRIVATE(object); - delete priv->feather_table_reader; + priv->feather_reader.~shared_ptr(); G_OBJECT_CLASS(garrow_feather_file_reader_parent_class)->finalize(object); } @@ -557,9 +557,9 @@ garrow_feather_file_reader_set_property(GObject *object, auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(object); switch (prop_id) { - case PROP_FEATHER_TABLE_READER: - priv->feather_table_reader = - static_cast(g_value_get_pointer(value)); + case PROP_FEATHER_READER: + priv->feather_reader = + *static_cast *>(g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -597,12 +597,12 @@ garrow_feather_file_reader_class_init(GArrowFeatherFileReaderClass *klass) gobject_class->set_property = garrow_feather_file_reader_set_property; gobject_class->get_property = garrow_feather_file_reader_get_property; - spec = g_param_spec_pointer("feather-table-reader", - "arrow::ipc::feather::TableReader", - "The raw std::shared *", + spec = g_param_spec_pointer("feather-reader", + "arrow::ipc::feather::Reader", + "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FEATHER_TABLE_READER, spec); + g_object_class_install_property(gobject_class, PROP_FEATHER_READER, spec); } @@ -621,58 +621,18 @@ garrow_feather_file_reader_new(GArrowSeekableInputStream *file, GError **error) { auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(file); - std::unique_ptr arrow_reader; - auto status = - arrow::ipc::feather::TableReader::Open(arrow_random_access_file, - &arrow_reader); - if (garrow_error_check(error, status, "[feather-file-reader][new]")) { - return garrow_feather_file_reader_new_raw(arrow_reader.release()); - } else { - return NULL; - } -} + std::unique_ptr arrow_reader; -/** - * garrow_feather_file_reader_get_description: - * @reader: A #GArrowFeatherFileReader. - * - * Returns: (nullable) (transfer full): - * The description of the file if it exists, - * %NULL otherwise. You can confirm whether description exists or not by - * garrow_feather_file_reader_has_description(). - * - * It should be freed with g_free() when no longer needed. - * - * Since: 0.4.0 - */ -gchar * -garrow_feather_file_reader_get_description(GArrowFeatherFileReader *reader) -{ - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - if (arrow_reader->HasDescription()) { - auto description = arrow_reader->GetDescription(); - return g_strndup(description.data(), - description.size()); + auto reader = + arrow::ipc::feather::Reader::Open(arrow_random_access_file); + + if (garrow::check(error, reader, "[feather-file-reader][new]")) { + return garrow_feather_file_reader_new_raw(&(reader.ValueOrDie())); } else { return NULL; } } -/** - * garrow_feather_file_reader_has_description: - * @reader: A #GArrowFeatherFileReader. - * - * Returns: Whether the file has description or not. - * - * Since: 0.4.0 - */ -gboolean -garrow_feather_file_reader_has_description(GArrowFeatherFileReader *reader) -{ - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - return arrow_reader->HasDescription(); -} - /** * garrow_feather_file_reader_get_version: * @reader: A #GArrowFeatherFileReader. @@ -688,102 +648,6 @@ garrow_feather_file_reader_get_version(GArrowFeatherFileReader *reader) return arrow_reader->version(); } -/** - * garrow_feather_file_reader_get_n_rows: - * @reader: A #GArrowFeatherFileReader. - * - * Returns: The number of rows in the file. 
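With the per-column accessors removed, row and column counts come off the
materialized table and its schema instead; in Python terms, a sketch using
pyarrow.feather.read_table, which returns a pyarrow.Table:

    from pyarrow import feather

    table = feather.read_table('data.feather')
    print(table.num_rows, table.num_columns, table.schema.names)
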
- * - * Since: 0.4.0 - */ -gint64 -garrow_feather_file_reader_get_n_rows(GArrowFeatherFileReader *reader) -{ - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - return arrow_reader->num_rows(); -} - -/** - * garrow_feather_file_reader_get_n_columns: - * @reader: A #GArrowFeatherFileReader. - * - * Returns: The number of columns in the file. - * - * Since: 0.4.0 - */ -gint64 -garrow_feather_file_reader_get_n_columns(GArrowFeatherFileReader *reader) -{ - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - return arrow_reader->num_columns(); -} - -/** - * garrow_feather_file_reader_get_column_name: - * @reader: A #GArrowFeatherFileReader. - * @i: The index of the target column. If it's negative, index is - * counted backward from the end of the columns. `-1` means the last - * column. - * - * Returns: (nullable) (transfer full): The i-th column name in the file. - * - * It should be freed with g_free() when no longer needed. - * - * Since: 0.4.0 - */ -gchar * -garrow_feather_file_reader_get_column_name(GArrowFeatherFileReader *reader, - gint i) -{ - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - if (!garrow_internal_index_adjust(i, arrow_reader->num_columns())) { - return NULL; - } - const auto &column_name = arrow_reader->GetColumnName(i); - return g_strndup(column_name.data(), - column_name.size()); -} - -/** - * garrow_feather_file_reader_get_column_data: - * @reader: A #GArrowFeatherFileReader. - * @i: The index of the target column. If it's negative, index is - * counted backward from the end of the columns. `-1` means the last - * column. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): - * The i-th column's data in the file or %NULL on error. - * - * Since: 1.0.0 - */ -GArrowChunkedArray * -garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader, - gint i, - GError **error) -{ - const auto tag = "[feather-file-reader][get-column-data]"; - auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - - const auto n_columns = arrow_reader->num_columns(); - if (!garrow_internal_index_adjust(i, n_columns)) { - garrow_error_check(error, - arrow::Status::IndexError("Out of index: " - "<0..", n_columns, ">: " - "<", i, ">"), - tag); - return NULL; - } - - std::shared_ptr arrow_chunked_array; - auto status = arrow_reader->GetColumn(i, &arrow_chunked_array); - if (garrow_error_check(error, status, tag)) { - return garrow_chunked_array_new_raw(&arrow_chunked_array); - } else { - return NULL; - } -} - /** * garrow_feather_file_reader_read: * @reader: A #GArrowFeatherFileReader. 
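Column projection survives the rewrite: readers still accept a subset of
columns, by name or by index. The pandas-level equivalent is a sketch like:

    from pyarrow import feather

    # Only the named columns are read; integer indices also work
    df = feather.read_feather('data.feather', columns=['host', 'message'])
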
@@ -858,7 +722,7 @@ garrow_feather_file_reader_read_names(GArrowFeatherFileReader *reader, GError **error) { auto arrow_reader = garrow_feather_file_reader_get_raw(reader); - std::vector cpp_names(n_names); + std::vector cpp_names; for (guint i = 0; i < n_names; ++i) { cpp_names.push_back(names[i]); } @@ -2212,21 +2076,21 @@ garrow_record_batch_file_reader_get_raw(GArrowRecordBatchFileReader *reader) } GArrowFeatherFileReader * -garrow_feather_file_reader_new_raw(arrow::ipc::feather::TableReader *arrow_reader) +garrow_feather_file_reader_new_raw(std::shared_ptr *arrow_reader) { auto reader = GARROW_FEATHER_FILE_READER( g_object_new(GARROW_TYPE_FEATHER_FILE_READER, - "feather-table-reader", arrow_reader, + "feather-reader", arrow_reader, NULL)); return reader; } -arrow::ipc::feather::TableReader * +std::shared_ptr garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader) { auto priv = GARROW_FEATHER_FILE_READER_GET_PRIVATE(reader); - return priv->feather_table_reader; + return priv->feather_reader; } GArrowCSVReader * diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h index 6241792c8b2..5054d530703 100644 --- a/c_glib/arrow-glib/reader.h +++ b/c_glib/arrow-glib/reader.h @@ -209,24 +209,8 @@ GArrowFeatherFileReader *garrow_feather_file_reader_new( GArrowSeekableInputStream *file, GError **error); -gchar *garrow_feather_file_reader_get_description( - GArrowFeatherFileReader *reader); -gboolean garrow_feather_file_reader_has_description( - GArrowFeatherFileReader *reader); gint garrow_feather_file_reader_get_version( GArrowFeatherFileReader *reader); -gint64 garrow_feather_file_reader_get_n_rows( - GArrowFeatherFileReader *reader); -gint64 garrow_feather_file_reader_get_n_columns( - GArrowFeatherFileReader *reader); -gchar *garrow_feather_file_reader_get_column_name( - GArrowFeatherFileReader *reader, - gint i); -GARROW_AVAILABLE_IN_1_0 -GArrowChunkedArray * -garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader, - gint i, - GError **error); GArrowTable * garrow_feather_file_reader_read(GArrowFeatherFileReader *reader, GError **error); diff --git a/c_glib/arrow-glib/reader.hpp b/c_glib/arrow-glib/reader.hpp index 7fa89ce84c9..c1df700fe13 100644 --- a/c_glib/arrow-glib/reader.hpp +++ b/c_glib/arrow-glib/reader.hpp @@ -34,11 +34,14 @@ GArrowTableBatchReader *garrow_table_batch_reader_new_raw(std::shared_ptr *arrow_reader); -GArrowRecordBatchFileReader *garrow_record_batch_file_reader_new_raw(std::shared_ptr *arrow_reader); +GArrowRecordBatchFileReader * +garrow_record_batch_file_reader_new_raw(std::shared_ptr *arrow_reader); std::shared_ptr garrow_record_batch_file_reader_get_raw(GArrowRecordBatchFileReader *reader); -GArrowFeatherFileReader *garrow_feather_file_reader_new_raw(arrow::ipc::feather::TableReader *arrow_reader); -arrow::ipc::feather::TableReader *garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); +GArrowFeatherFileReader * +garrow_feather_file_reader_new_raw(std::shared_ptr *arrow_reader); +std::shared_ptr +garrow_feather_file_reader_get_raw(GArrowFeatherFileReader *reader); GArrowCSVReader * garrow_csv_reader_new_raw(std::shared_ptr *arrow_reader); diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index 8e9132c5a89..caeee2086d0 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -47,9 +47,6 @@ G_BEGIN_DECLS * * #GArrowRecordBatchFileWriter is a class for writing record * batches in file format into output. 
- * - * #GArrowFeatherFileWriter is a class for writing arrays - * in Feather file format into output. */ typedef struct GArrowRecordBatchWriterPrivate_ { @@ -298,215 +295,31 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink, } } - -typedef struct GArrowFeatherFileWriterPrivate_ { - arrow::ipc::feather::TableWriter *feather_table_writer; -} GArrowFeatherFileWriterPrivate; - -enum { - PROP_0_, - PROP_FEATHER_TABLE_WRITER -}; - -G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherFileWriter, - garrow_feather_file_writer, - G_TYPE_OBJECT); - -#define GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(obj) \ - static_cast( \ - garrow_feather_file_writer_get_instance_private( \ - GARROW_FEATHER_FILE_WRITER(obj))) - -static void -garrow_feather_file_writer_finalize(GObject *object) -{ - auto priv = GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(object); - - delete priv->feather_table_writer; - - G_OBJECT_CLASS(garrow_feather_file_writer_parent_class)->finalize(object); -} - -static void -garrow_feather_file_writer_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GARROW_FEATHER_FILE_WRITER_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_FEATHER_TABLE_WRITER: - priv->feather_table_writer = - static_cast(g_value_get_pointer(value)); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_feather_file_writer_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_feather_file_writer_init(GArrowFeatherFileWriter *object) -{ -} - -static void -garrow_feather_file_writer_class_init(GArrowFeatherFileWriterClass *klass) -{ - GObjectClass *gobject_class; - GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = garrow_feather_file_writer_finalize; - gobject_class->set_property = garrow_feather_file_writer_set_property; - gobject_class->get_property = garrow_feather_file_writer_get_property; - - spec = g_param_spec_pointer("feather-table-writer", - "arrow::ipc::feather::TableWriter", - "The raw std::shared *", - static_cast(G_PARAM_WRITABLE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FEATHER_TABLE_WRITER, spec); -} - /** - * garrow_feather_file_writer_new: + * garrow_feather_write_file: + * @table: The table to be written. * @sink: The output of the writer. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable): A newly created #GArrowFeatherFileWriter - * or %NULL on error. - * - * Since: 0.4.0 - */ -GArrowFeatherFileWriter * -garrow_feather_file_writer_new(GArrowOutputStream *sink, - GError **error) -{ - auto arrow_sink = garrow_output_stream_get_raw(sink); - std::unique_ptr arrow_writer; - auto status = arrow::ipc::feather::TableWriter::Open(arrow_sink, - &arrow_writer); - if (garrow_error_check(error, status, "[feature-file-writer][new]")) { - return garrow_feather_file_writer_new_raw(arrow_writer.release()); - } else { - return NULL; - } -} - -/** - * garrow_feather_file_writer_set_description: - * @writer: A #GArrowFeatherFileWriter. - * @description: The description of the file. 
- * - * Since: 0.4.0 - */ -void -garrow_feather_file_writer_set_description(GArrowFeatherFileWriter *writer, - const gchar *description) -{ - auto arrow_writer = garrow_feather_file_writer_get_raw(writer); - arrow_writer->SetDescription(std::string(description)); -} - -/** - * garrow_feather_file_writer_set_n_rows: - * @writer: A #GArrowFeatherFileWriter. - * @n_rows: The number of rows in the file. - * - * Since: 0.4.0 - */ -void -garrow_feather_file_writer_set_n_rows(GArrowFeatherFileWriter *writer, - gint64 n_rows) -{ - auto arrow_writer = garrow_feather_file_writer_get_raw(writer); - arrow_writer->SetNumRows(n_rows); -} - -/** - * garrow_feather_file_writer_append: - * @writer: A #GArrowFeatherFileWriter. - * @name: The name of the array to be appended. - * @array: The array to be appended. - * @error: (nullable): Return location for a #GError or %NULL. - * * Returns: %TRUE on success, %FALSE if there was an error. * - * Since: 0.4.0 + * Since: 0.17.0 */ gboolean -garrow_feather_file_writer_append(GArrowFeatherFileWriter *writer, - const gchar *name, - GArrowArray *array, - GError **error) +garrow_feather_write_file(GArrowTable *table, + GArrowOutputStream *sink, + GError **error) { - auto arrow_writer = garrow_feather_file_writer_get_raw(writer); - auto arrow_array = garrow_array_get_raw(array); - - auto status = arrow_writer->Append(std::string(name), *arrow_array); - return garrow_error_check(error, status, "[feather-file-writer][append]"); -} - -/** - * garrow_feather_file_writer_writer: - * @writer: A #GArrowFeatherFileWriter. - * @array: The table to be written. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: %TRUE on success, %FALSE if there was an error. - * - * Since: 0.12.0 - */ -gboolean -garrow_feather_file_writer_write(GArrowFeatherFileWriter *writer, - GArrowTable *table, - GError **error) -{ - auto arrow_writer = garrow_feather_file_writer_get_raw(writer); + auto arrow_sink = garrow_output_stream_get_raw(sink); auto arrow_table = garrow_table_get_raw(table); - auto status = arrow_writer->Write(*arrow_table); - return garrow_error_check(error, status, "[feather-file-writer][write]"); -} - -/** - * garrow_feather_file_writer_close: - * @writer: A #GArrowFeatherFileWriter. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: %TRUE on success, %FALSE if there was an error. 
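All of the open/set/append/close choreography these removed entry points
required collapses into a single write call. A Python-level round trip
showing the one-shot shape (a sketch):

    import io

    import pandas as pd
    from pyarrow import feather

    df = pd.DataFrame({'message': ['Login'], 'is_critical': [True]})
    buf = io.BytesIO()
    feather.write_feather(df, buf)  # no writer object to manage
    buf.seek(0)
    assert feather.read_feather(buf).equals(df)
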
- * - * Since: 0.4.0 - */ -gboolean -garrow_feather_file_writer_close(GArrowFeatherFileWriter *writer, - GError **error) -{ - auto arrow_writer = garrow_feather_file_writer_get_raw(writer); - - auto status = arrow_writer->Finalize(); - return garrow_error_check(error, status, "[feather-file-writer][close]"); + // TODO: Write with options + auto status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get()); + return garrow_error_check(error, status, "[feather-write-file]"); } G_END_DECLS - GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw(std::shared_ptr *arrow_writer) { @@ -546,21 +359,3 @@ garrow_record_batch_file_writer_new_raw(std::shared_ptrfeather_table_writer; -} diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index 8950b213cf7..ee656c6bc4c 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -189,32 +189,8 @@ GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new( GArrowSchema *schema, GError **error); - -#define GARROW_TYPE_FEATHER_FILE_WRITER (garrow_feather_file_writer_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowFeatherFileWriter, - garrow_feather_file_writer, - GARROW, - FEATHER_FILE_WRITER, - GObject) -struct _GArrowFeatherFileWriterClass -{ - GObjectClass parent_class; -}; - -GArrowFeatherFileWriter *garrow_feather_file_writer_new(GArrowOutputStream *sink, - GError **error); -void garrow_feather_file_writer_set_description(GArrowFeatherFileWriter *writer, - const gchar *description); -void garrow_feather_file_writer_set_n_rows(GArrowFeatherFileWriter *writer, - gint64 n_rows); -gboolean garrow_feather_file_writer_append(GArrowFeatherFileWriter *writer, - const gchar *name, - GArrowArray *array, - GError **error); -gboolean garrow_feather_file_writer_write(GArrowFeatherFileWriter *writer, - GArrowTable *table, - GError **error); -gboolean garrow_feather_file_writer_close(GArrowFeatherFileWriter *writer, - GError **error); +gboolean garrow_feather_write_file(GArrowTable *table, + GArrowOutputStream *sink, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index 61d9d679dc3..1e188bd3c68 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -31,6 +31,3 @@ std::shared_ptr garrow_record_batch_writer_get_ra GArrowRecordBatchStreamWriter *garrow_record_batch_stream_writer_new_raw(std::shared_ptr *arrow_writer); GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new_raw(std::shared_ptr *arrow_writer); - -GArrowFeatherFileWriter *garrow_feather_file_writer_new_raw(arrow::ipc::feather::TableWriter *arrow_writer); -arrow::ipc::feather::TableWriter *garrow_feather_file_writer_get_raw(GArrowFeatherFileWriter *writer); diff --git a/c_glib/test/test-feather-file-reader.rb b/c_glib/test/test-feather-file-reader.rb index 48a4fc75488..9127323cda0 100644 --- a/c_glib/test/test-feather-file-reader.rb +++ b/c_glib/test/test-feather-file-reader.rb @@ -18,26 +18,11 @@ class TestFeatherFileReader < Test::Unit::TestCase include Helper::Buildable - def setup_file(data) + def setup_file(table) tempfile = Tempfile.open("arrow-feather-file-reader") output = Arrow::FileOutputStream.new(tempfile.path, false) begin - writer = Arrow::FeatherFileWriter.new(output) - begin - if data[:description] - writer.description = data[:description] - end - writer.n_rows = data[:n_rows] || 0 - if data[:table] - writer.write(data[:table]) - elsif data[:columns] - data[:columns].each do |name, array| - writer.append(name, array) - end - end - ensure - writer.close 
- end + Arrow::feather_write_file(table, output) ensure output.close end @@ -51,100 +36,13 @@ def setup_file(data) end end - sub_test_case("#description") do - test("exist") do - setup_file(:description => "Log") do |reader| - assert_equal("Log", reader.description) - end - end - - test("not exist") do - setup_file(:description => nil) do |reader| - assert_nil(reader.description) - end - end - end - - sub_test_case("#has_description?") do - test("exist") do - setup_file(:description => "Log") do |reader| - assert do - reader.has_description? - end - end - end - - test("not exist") do - setup_file(:description => nil) do |reader| - assert do - not reader.has_description? - end - end - end - end - - test("#version") do - setup_file({}) do |reader| - assert do - reader.version >= 2 - end - end - end - - test("#n_rows") do - setup_file(:n_rows => 3) do |reader| - assert_equal(3, reader.n_rows) - end - end - - test("#n_columns") do - columns = { - "message" => build_string_array([]), - "is_critical" => build_boolean_array([]), - } - setup_file(:columns => columns) do |reader| - assert_equal(2, reader.n_columns) - end - end - - test("#get_column_name") do - columns = { - "message" => build_string_array([]), - "is_critical" => build_boolean_array([]), - } - setup_file(:columns => columns) do |reader| - actual_column_names = reader.n_columns.times.collect do |i| - reader.get_column_name(i) - end - assert_equal([ - "message", - "is_critical", - ], - actual_column_names) - end - end - - test("#get_column_data") do - columns = { - "message" => build_string_array(["Hello"]), - "is_critical" => build_boolean_array([false]), - } - setup_file(:columns => columns) do |reader| - actual_columns = reader.n_columns.times.collect do |i| - reader.get_column_data(i).get_chunk(0) - end - assert_equal([ - columns["message"], - columns["is_critical"], - ], - actual_columns) - end - end - test("#read") do table = build_table("message" => build_string_array(["Login"]), "is_critical" => build_boolean_array([true])) - setup_file(:table => table) do |reader| + setup_file(table) do |reader| + assert do + reader.version >= 2 + end assert_equal(table, reader.read) end end @@ -153,7 +51,7 @@ def setup_file(data) table = build_table("message" => build_string_array(["Login"]), "is_critical" => build_boolean_array([true]), "host" => build_string_array(["www"])) - setup_file(:table => table) do |reader| + setup_file(table) do |reader| assert_equal(build_table("message" => build_string_array(["Login"]), "host" => build_string_array(["www"])), reader.read_indices([2, 0])) @@ -164,7 +62,7 @@ def setup_file(data) table = build_table("message" => build_string_array(["Login"]), "is_critical" => build_boolean_array([true]), "host" => build_string_array(["www"])) - setup_file(:table => table) do |reader| + setup_file(table) do |reader| assert_equal(build_table("message" => build_string_array(["Login"]), "host" => build_string_array(["www"])), reader.read_names(["host", "message"])) diff --git a/c_glib/test/test-feather-file-writer.rb b/c_glib/test/test-feather-file-writer.rb index 247d937e93e..eb7385d2b18 100644 --- a/c_glib/test/test-feather-file-writer.rb +++ b/c_glib/test/test-feather-file-writer.rb @@ -18,58 +18,6 @@ class TestFeatherFileWriter < Test::Unit::TestCase include Helper::Buildable - def test_append - tempfile = Tempfile.open("arrow-feather-file-writer") - output = Arrow::FileOutputStream.new(tempfile.path, false) - begin - writer = Arrow::FeatherFileWriter.new(output) - begin - writer.description = "Log" - writer.n_rows 
= 3 - writer.append("message", - build_string_array(["Crash", "Error", "Shutdown"])) - writer.append("is_critical", - build_boolean_array([true, true, false])) - ensure - writer.close - end - ensure - output.close - end - - input = Arrow::MemoryMappedInputStream.new(tempfile.path) - begin - reader = Arrow::FeatherFileReader.new(input) - columns = reader.n_columns.times.collect do |i| - [ - reader.get_column_name(i), - reader.get_column_data(i).get_chunk(0), - ] - end - assert_equal([ - true, - "Log", - [ - [ - "message", - build_string_array(["Crash", "Error", "Shutdown"]), - ], - [ - "is_critical", - build_boolean_array([true, true, false]), - ], - ], - ], - [ - reader.has_description?, - reader.description, - columns, - ]) - ensure - input.close - end - end - def test_write messages = build_string_array(["Crash", "Error", "Shutdown"]) is_criticals = build_boolean_array([true, true, false]) @@ -79,16 +27,12 @@ def test_write tempfile = Tempfile.open("arrow-feather-file-writer") output = Arrow::FileOutputStream.new(tempfile.path, false) - writer = Arrow::FeatherFileWriter.new(output) - writer.n_rows = table.n_rows - writer.write(table) - writer.close + Arrow::feather_write_file(table, output) output.close input = Arrow::MemoryMappedInputStream.new(tempfile.path) reader = Arrow::FeatherFileReader.new(input) - assert_equal([table.n_rows, table], - [reader.n_rows, reader.read]) + assert_equal(table, reader.read) input.close end end diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb index f098ab6cfb6..e4bff33a0e7 100644 --- a/ruby/red-arrow/lib/arrow/table-saver.rb +++ b/ruby/red-arrow/lib/arrow/table-saver.rb @@ -156,9 +156,7 @@ def save_as_tsv def save_as_feather open_output_stream do |output| - FeatherFileWriter.open(output) do |writer| - writer.write(@table) - end + write_feather_file(output, @table) end end end From 1a15fd82c1ad1164aba631f9ef73be7deba3b47b Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Mar 2020 15:30:43 -0700 Subject: [PATCH 06/15] Wire up Feather write options in R, add tests all around, and update docs --- r/R/arrowExports.R | 4 +- r/R/feather.R | 72 ++++++++++++++--- ...FeatherTableReader.Rd => FeatherReader.Rd} | 20 ++--- r/man/FeatherTableWriter.Rd | 34 -------- r/man/read_feather.Rd | 2 +- r/man/write_feather.Rd | 31 ++++++- r/src/arrowExports.cpp | 14 ++-- r/src/feather.cpp | 10 ++- r/tests/testthat/test-feather.R | 81 +++++++++++++++---- 9 files changed, 181 insertions(+), 87 deletions(-) rename r/man/{FeatherTableReader.Rd => FeatherReader.Rd} (64%) delete mode 100644 r/man/FeatherTableWriter.Rd diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index f38e73b17a8..2e5ed1ae4a5 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -708,8 +708,8 @@ dataset___expr__ToString <- function(x){ .Call(`_arrow_dataset___expr__ToString` , x) } -ipc___WriteFeather__RecordBatch <- function(stream, batch){ - invisible(.Call(`_arrow_ipc___WriteFeather__RecordBatch` , stream, batch)) +ipc___WriteFeather__RecordBatch <- function(stream, batch, version, chunk_size, compression, compression_level){ + invisible(.Call(`_arrow_ipc___WriteFeather__RecordBatch` , stream, batch, version, chunk_size, compression, compression_level)) } ipc___feather___Reader__version <- function(reader){ diff --git a/r/R/feather.R b/r/R/feather.R index 2af912e643d..feb35687819 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -17,11 +17,22 @@ #' Write data in the Feather format #' -#' @param x `data.frame` or RecordBatch -#' @param sink A 
file path or an OutputStream
-#'
-#' @return the input `x` invisibly.
+#' @param x `data.frame` or `RecordBatch`
+#' @param sink A file path or an `OutputStream`
+#' @param version integer Feather file version. Version 2 is the current.
+#' Version 1 is the more limited legacy format.
+#' @param chunk_size For V2 files, the number of rows that each chunk of data
+#' should have in the file. Use a smaller `chunk_size` when you need faster
+#' random row access. Default is 64K. This option is not supported for V1.
+#' @param compression Name of compression codec to use, if any. Default is
+#' uncompressed; "lz4" and "zstd" are valid options, but only if your version
+#' of the Arrow C++ library was built with support for them.
+#' See [codec_is_available()]. This option is not supported for V1.
+#' @param compression_level If `compression` is "lz4" or "zstd", you may
+#' specify an integer compression level. If omitted, the selected compression's
+#' default compression level is used.
 #'
+#' @return The input `x`, invisibly.
 #' @export
 #' @examples
 #' \donttest{
@@ -30,9 +41,42 @@
 #' write_feather(mtcars, tf)
 #' }
 #' @include arrow-package.R
-write_feather <- function(x, sink) {
-  x_out <- x
+write_feather <- function(x,
+                          sink,
+                          version = 2,
+                          chunk_size = 65536L,
+                          compression = c("uncompressed", "lz4", "zstd"),
+                          compression_level = NULL) {
+  # Handle and validate options before touching data
+  version <- as.integer(version)
+  assert_that(version %in% 1:2)
+  compression <- compression_from_name(match.arg(compression))
+  chunk_size <- as.integer(chunk_size)
+  assert_that(chunk_size > 0)
+  if (is.null(compression_level)) {
+    # Use -1 as sentinel for "default"
+    compression_level <- -1L
+  }
+  compression_level <- as.integer(compression_level)
+  # Now make sure that options make sense together
+  if (version == 1) {
+    if (chunk_size != 65536L) {
+      stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE)
+    }
+    if (compression > 0) {
+      stop("Feather version 1 does not support the 'compression' option", call. = FALSE)
+    }
+    if (compression_level != -1L) {
+      stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE)
+    }
+  }
+  if (compression == 0 && compression_level != -1L) {
+    stop("Cannot specify a 'compression_level' when 'compression' is 'uncompressed'", call. = FALSE)
+  }
+  # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug:
+  version <- version + 1L
+
   x_out <- x
   if (is.data.frame(x)) {
     x <- record_batch(x)
   }
@@ -43,8 +87,7 @@ write_feather <- function(x, sink) {
     on.exit(sink$close())
   }
   assert_is(sink, "OutputStream")
-
-  ipc___WriteFeather__RecordBatch(sink, x)
+  ipc___WriteFeather__RecordBatch(sink, x, version, chunk_size, compression, compression_level)
   invisible(x_out)
 }
@@ -108,21 +151,26 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) 
{ #' #' @section Methods: #' -#' - `$version()` -#' - `$Read(columns)` +#' - `$Read(columns)`: Returns a `Table` of the selected columns, a vector of +#' integer indices +#' - `$version`: Active binding, returns `1` or `2`, according to the Feather +#' file version #' #' @export #' @include arrow-package.R FeatherReader <- R6Class("FeatherReader", inherit = ArrowObject, public = list( - version = function() ipc___feather___Reader__version(self), Read = function(columns) { shared_ptr(Table, ipc___feather___Reader__Read(self, columns)) } + ), + active = list( + # versions are officially 2 for V1 and 3 for V2 :shrug: + version = function() ipc___feather___Reader__version(self) - 1L ) ) FeatherReader$create <- function(file, mmap = TRUE, ...) { file <- make_readable_file(file, mmap) shared_ptr(FeatherReader, ipc___feather___Reader__Open(file)) -} \ No newline at end of file +} diff --git a/r/man/FeatherTableReader.Rd b/r/man/FeatherReader.Rd similarity index 64% rename from r/man/FeatherTableReader.Rd rename to r/man/FeatherReader.Rd index f06197a0834..1a0dba194f1 100644 --- a/r/man/FeatherTableReader.Rd +++ b/r/man/FeatherReader.Rd @@ -1,9 +1,9 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/feather.R \docType{class} -\name{FeatherTableReader} -\alias{FeatherTableReader} -\title{FeatherTableReader class} +\name{FeatherReader} +\alias{FeatherReader} +\title{FeatherReader class} \description{ This class enables you to interact with Feather files. Create one to connect to a file or other InputStream, and call \code{Read()} on it to @@ -12,7 +12,7 @@ make an \code{arrow::Table}. See its usage in \code{\link[=read_feather]{read_fe \section{Factory}{ -The \code{FeatherTableReader$create()} factory method instantiates the object and +The \code{FeatherReader$create()} factory method instantiates the object and takes the following arguments: \itemize{ \item \code{file} A character file name, raw vector, or Arrow file connection object @@ -25,14 +25,10 @@ takes the following arguments: \section{Methods}{ \itemize{ -\item \verb{$GetDescription()} -\item \verb{$HasDescription()} -\item \verb{$version()} -\item \verb{$num_rows()} -\item \verb{$num_columns()} -\item \verb{$GetColumnName()} -\item \verb{$GetColumn()} -\item \verb{$Read(columns)} +\item \verb{$Read(columns)}: Returns a \code{Table} of the selected columns, a vector of +integer indices +\item \verb{$version}: Active binding, returns \code{1} or \code{2}, according to the Feather +file version } } diff --git a/r/man/FeatherTableWriter.Rd b/r/man/FeatherTableWriter.Rd deleted file mode 100644 index 3c6afaa134f..00000000000 --- a/r/man/FeatherTableWriter.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/feather.R -\docType{class} -\name{FeatherTableWriter} -\alias{FeatherTableWriter} -\title{FeatherTableWriter class} -\description{ -This class enables you to write Feather files. See its usage in -\code{\link[=write_feather]{write_feather()}}. 
-} -\section{Factory}{ - - -The \code{FeatherTableWriter$create()} factory method instantiates the object and -takes the following argument: -\itemize{ -\item \code{stream} An \code{OutputStream} -} -} - -\section{Methods}{ - -\itemize{ -\item \verb{$GetDescription()} -\item \verb{$HasDescription()} -\item \verb{$version()} -\item \verb{$num_rows()} -\item \verb{$num_columns()} -\item \verb{$GetColumnName()} -\item \verb{$GetColumn()} -\item \verb{$Read(columns)} -} -} - diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index f5d6c257070..eb842719673 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -8,7 +8,7 @@ read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) } \arguments{ \item{file}{A character file path, a raw vector, or \code{InputStream}, passed to -\code{FeatherTableReader$create()}.} +\code{FeatherReader$create()}.} \item{col_select}{A character vector of column names to keep, as in the "select" argument to \code{data.table::fread()}, or a diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 09c3d42333c..89ad8127ec1 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -4,15 +4,38 @@ \alias{write_feather} \title{Write data in the Feather format} \usage{ -write_feather(x, sink) +write_feather( + x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("uncompressed", "lz4", "zstd"), + compression_level = NULL +) } \arguments{ -\item{x}{\code{data.frame} or RecordBatch} +\item{x}{\code{data.frame} or \code{RecordBatch}} -\item{sink}{A file path or an OutputStream} +\item{sink}{A file path or an \code{OutputStream}} + +\item{version}{integer Feather file version. Version 2 is the current. +Version 1 is the more limited legacy format.} + +\item{chunk_size}{For V2 files, the number of rows that each chunk of data +should have in the file. Use a smaller \code{chunk_size} when you need faster +random row access. Default is 64K. This option is not supported for V1.} + +\item{compression}{Name of compression codec to use, if any. Default is +uncompressed; "lz4" and "zstd" are valid options, but only if your version +of the Arrow C++ library was built with support for them. +See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.} + +\item{compression_level}{If \code{compression} is "lz4" or "zstd", you may +specify an integer compression level. If omitted, the selected compression's +default compression level is used.} } \value{ -the input \code{x} invisibly. +The input \code{x}, invisibly. 
}
 \description{
 Write data in the Feather format
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index e53414d3a3c..c0d1bc7c2dc 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2739,17 +2739,21 @@ RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){
 // feather.cpp
 #if defined(ARROW_R_WITH_ARROW)
-void ipc___WriteFeather__RecordBatch(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::RecordBatch>& batch);
-RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp){
+void ipc___WriteFeather__RecordBatch(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::RecordBatch>& batch, int version, int chunk_size, arrow::Compression::type compression, int compression_level);
+RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
 BEGIN_RCPP
 	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
 	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
-	ipc___WriteFeather__RecordBatch(stream, batch);
+	Rcpp::traits::input_parameter<int>::type version(version_sexp);
+	Rcpp::traits::input_parameter<int>::type chunk_size(chunk_size_sexp);
+	Rcpp::traits::input_parameter<arrow::Compression::type>::type compression(compression_sexp);
+	Rcpp::traits::input_parameter<int>::type compression_level(compression_level_sexp);
+	ipc___WriteFeather__RecordBatch(stream, batch, version, chunk_size, compression, compression_level);
 	return R_NilValue;
 END_RCPP
}
#else
RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
	Rf_error("Cannot call ipc___WriteFeather__RecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -5929,7 +5933,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___expr__is_valid", (DL_FUNC) &_arrow_dataset___expr__is_valid, 1}, { "_arrow_dataset___expr__scalar", (DL_FUNC) &_arrow_dataset___expr__scalar, 1}, { "_arrow_dataset___expr__ToString", (DL_FUNC) &_arrow_dataset___expr__ToString, 1}, - { "_arrow_ipc___WriteFeather__RecordBatch", (DL_FUNC) &_arrow_ipc___WriteFeather__RecordBatch, 2}, + { "_arrow_ipc___WriteFeather__RecordBatch", (DL_FUNC) &_arrow_ipc___WriteFeather__RecordBatch, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index c6965035ee4..4c40d3592c7 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -24,8 +24,16 @@ // [[arrow::export]] void ipc___WriteFeather__RecordBatch( const std::shared_ptr& stream, - const std::shared_ptr& batch) { + const std::shared_ptr& batch, + int version, int chunk_size, arrow::Compression::type compression, + int compression_level) { auto properties = arrow::ipc::feather::WriteProperties::Defaults(); + properties.version = version; + properties.chunksize = chunk_size; + properties.compression = compression; + if (compression_level != -1) { + properties.compression_level = compression_level; + } std::shared_ptr table; STOP_IF_NOT_OK(arrow::Table::FromRecordBatches({batch}, &table)); STOP_IF_NOT_OK(arrow::ipc::feather::WriteTable(*table, stream.get(), properties)); diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index c5153a9891f..9d472a4cd58 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -21,29 +21,30 @@ feather_file <- tempfile() tib <- tibble::tibble(x = 1:10, y = rnorm(10), z = letters[1:10]) test_that("Write a feather file", { - write_feather(tib, feather_file) - expect_true(file.exists(feather_file)) -}) - -test_that("write_feather() returns its input", { tib_out <- write_feather(tib, feather_file) + expect_true(file.exists(feather_file)) + # Input is returned unmodified expect_identical(tib_out, tib) }) -test_that("feather read/write round trip", { +expect_feather_roundtrip <- function(write_fun) { tf2 <- normalizePath(tempfile(), mustWork = FALSE) - write_feather(tib, tf2) + tf3 <- tempfile() + on.exit({ + unlink(tf2) + unlink(tf3) + }) + + # Write two ways. 
These are what varies with each run + write_fun(tib, tf2) expect_true(file.exists(tf2)) - tf3 <- tempfile() stream <- FileOutputStream$create(tf3) - write_feather(tib, stream) + write_fun(tib, stream) stream$close() expect_true(file.exists(tf3)) - tab1 <- read_feather(feather_file) - expect_is(tab1, "data.frame") - + # Read both back tab2 <- read_feather(tf2) expect_is(tab2, "data.frame") @@ -58,17 +59,50 @@ test_that("feather read/write round trip", { tab5 <- read_feather(ReadableFile$create(tf3)) expect_is(tab5, "data.frame") - expect_equal(tib, tab1) expect_equal(tib, tab2) expect_equal(tib, tab3) expect_equal(tib, tab4) expect_equal(tib, tab5) +} - unlink(tf2) - unlink(tf3) +test_that("feather read/write round trip", { + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 1)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, version = 2)) + expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32)) + if (codec_is_available("lz4")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4")) + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4", compression_level = 3)) + } + if (codec_is_available("zstd")) { + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd")) + expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3)) + } }) -test_that("feather handles col_select = ", { +test_that("write_feather option error handling", { + tf <- tempfile() + expect_false(file.exists(tf)) + expect_error( + write_feather(tib, tf, version = 1, chunk_size = 1024), + "Feather version 1 does not support the 'chunk_size' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression = "lz4"), + "Feather version 1 does not support the 'compression' option" + ) + expect_error( + write_feather(tib, tf, version = 1, compression_level = 1024), + "Feather version 1 does not support the 'compression_level' option" + ) + expect_error( + write_feather(tib, tf, compression_level = 1024), + "Cannot specify a 'compression_level' when 'compression' is 'uncompressed'" + ) + expect_match_arg_error(write_feather(tib, tf, compression = "bz2")) + expect_false(file.exists(tf)) +}) + +test_that("read_feather supports col_select = ", { tab1 <- read_feather(feather_file, col_select = c("x", "y")) expect_is(tab1, "data.frame") @@ -111,4 +145,19 @@ test_that("Read feather from raw vector", { expect_is(df, "data.frame") }) +test_that("FeatherReader", { + v1 <- tempfile() + v2 <- tempfile() + on.exit({ + unlink(v1) + unlink(v2) + }) + write_feather(tib, v1, version = 1) + write_feather(tib, v2) + reader1 <- FeatherReader$create(v1) + expect_identical(reader1$version, 1L) + reader2 <- FeatherReader$create(v2) + expect_identical(reader2$version, 2L) +}) + unlink(feather_file) From b8f942db5d4ee25a878d16942cba46e0aef1d83f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Mar 2020 15:37:41 -0700 Subject: [PATCH 07/15] Have I mentioned lately how I feel about automating lint and style fixing? 
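
No functional change: clang-format reflowed the parameter list of
ipc___WriteFeather__RecordBatch() that the previous commit introduced.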
--- r/src/feather.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 4c40d3592c7..35b8037ed6f 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -24,9 +24,8 @@ // [[arrow::export]] void ipc___WriteFeather__RecordBatch( const std::shared_ptr& stream, - const std::shared_ptr& batch, - int version, int chunk_size, arrow::Compression::type compression, - int compression_level) { + const std::shared_ptr& batch, int version, int chunk_size, + arrow::Compression::type compression, int compression_level) { auto properties = arrow::ipc::feather::WriteProperties::Defaults(); properties.version = version; properties.chunksize = chunk_size; From 75260305937e38f6f83048ce0cd415e93ac2038a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 27 Mar 2020 16:19:35 -0700 Subject: [PATCH 08/15] Don't allow compression_level unless using zstd --- r/R/feather.R | 13 +++++++------ r/man/write_feather.Rd | 4 ++-- r/tests/testthat/test-feather.R | 3 +-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/r/R/feather.R b/r/R/feather.R index feb35687819..08ecf9466c1 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -28,8 +28,8 @@ #' uncompressed; "lz4" and "zstd" are valid options, but only if your version #' of the Arrow C++ library was built with support for them. #' See [codec_is_available()]. This option is not supported for V1. -#' @param compression_level If `compression` is "lz4" or "zstd", you may -#' specify an integer compression level. If omitted, the selected compression's +#' @param compression_level If `compression` is "zstd", you may +#' specify an integer compression level. If omitted, the compression codec's #' default compression level is used. #' #' @return The input `x`, invisibly. @@ -50,7 +50,7 @@ write_feather <- function(x, # Handle and validate options before touching data version <- as.integer(version) assert_that(version %in% 1:2) - compression <- compression_from_name(match.arg(compression)) + compression <- match.arg(compression) chunk_size <- as.integer(chunk_size) assert_that(chunk_size > 0) if (is.null(compression_level)) { @@ -63,18 +63,19 @@ write_feather <- function(x, if (chunk_size != 65536L) { stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE) } - if (compression > 0) { + if (compression != "uncompressed") { stop("Feather version 1 does not support the 'compression' option", call. = FALSE) } if (compression_level != -1L) { stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE) } } - if (compression == 0 && compression_level != -1L) { - stop("Cannot specify a 'compression_level' when 'compression' is 'uncompressed'", call. = FALSE) + if (compression != "zstd" && compression_level != -1L) { + stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE) } # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug: version <- version + 1L + compression <- compression_from_name(compression) x_out <- x if (is.data.frame(x)) { diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index 89ad8127ec1..6b57df20b02 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -30,8 +30,8 @@ uncompressed; "lz4" and "zstd" are valid options, but only if your version of the Arrow C++ library was built with support for them. See \code{\link[=codec_is_available]{codec_is_available()}}. 
This option is not supported for V1.} -\item{compression_level}{If \code{compression} is "lz4" or "zstd", you may -specify an integer compression level. If omitted, the selected compression's +\item{compression_level}{If \code{compression} is "zstd", you may +specify an integer compression level. If omitted, the compression codec's default compression level is used.} } \value{ diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 9d472a4cd58..fad0e47cd16 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -71,7 +71,6 @@ test_that("feather read/write round trip", { expect_feather_roundtrip(function(x, f) write_feather(x, f, chunk_size = 32)) if (codec_is_available("lz4")) { expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4")) - expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "lz4", compression_level = 3)) } if (codec_is_available("zstd")) { expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd")) @@ -96,7 +95,7 @@ test_that("write_feather option error handling", { ) expect_error( write_feather(tib, tf, compression_level = 1024), - "Cannot specify a 'compression_level' when 'compression' is 'uncompressed'" + "Can only specify a 'compression_level' when 'compression' is 'zstd'" ) expect_match_arg_error(write_feather(tib, tf, compression = "bz2")) expect_false(file.exists(tf)) From 6212d8891406346dcb68bc87a8cf09e31855a42d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 28 Mar 2020 11:54:13 +0900 Subject: [PATCH 09/15] [GLib][Ruby] Improve writer API --- c_glib/arrow-glib/reader.cpp | 17 +-- c_glib/arrow-glib/table.cpp | 183 +++++++++++++++++++++++- c_glib/arrow-glib/table.h | 25 ++++ c_glib/arrow-glib/table.hpp | 4 + c_glib/arrow-glib/writer.cpp | 23 --- c_glib/arrow-glib/writer.h | 4 - c_glib/test/test-feather-file-reader.rb | 2 +- c_glib/test/test-feather-file-writer.rb | 38 ----- c_glib/test/test-table.rb | 44 ++++++ ruby/red-arrow/lib/arrow/table-saver.rb | 10 +- ruby/red-arrow/test/test-feather.rb | 27 +++- 11 files changed, 287 insertions(+), 90 deletions(-) delete mode 100644 c_glib/test/test-feather-file-writer.rb diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 513d1a7ea29..6315e51cb78 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -525,8 +525,7 @@ typedef struct GArrowFeatherFileReaderPrivate_ { } GArrowFeatherFileReaderPrivate; enum { - PROP_0__, - PROP_FEATHER_READER + PROP_FEATHER_READER = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherFileReader, @@ -588,15 +587,13 @@ garrow_feather_file_reader_init(GArrowFeatherFileReader *object) static void garrow_feather_file_reader_class_init(GArrowFeatherFileReaderClass *klass) { - GObjectClass *gobject_class; - GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); + auto gobject_class = G_OBJECT_CLASS(klass); gobject_class->finalize = garrow_feather_file_reader_finalize; gobject_class->set_property = garrow_feather_file_reader_set_property; gobject_class->get_property = garrow_feather_file_reader_get_property; + GParamSpec *spec; spec = g_param_spec_pointer("feather-reader", "arrow::ipc::feather::Reader", "The raw std::shared *", @@ -621,13 +618,9 @@ garrow_feather_file_reader_new(GArrowSeekableInputStream *file, GError **error) { auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(file); - std::unique_ptr arrow_reader; - - auto reader = - arrow::ipc::feather::Reader::Open(arrow_random_access_file); - + 
auto reader = arrow::ipc::feather::Reader::Open(arrow_random_access_file); if (garrow::check(error, reader, "[feather-file-reader][new]")) { - return garrow_feather_file_reader_new_raw(&(reader.ValueOrDie())); + return garrow_feather_file_reader_new_raw(&(*reader)); } else { return NULL; } diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index e23cc0434b0..2bc4f48a8a2 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -23,9 +23,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -36,10 +38,16 @@ G_BEGIN_DECLS /** * SECTION: table + * @section_id: GArrowTable + * @title: GArrowTable * @short_description: Table class + * @include: arrow-glib/arrow-glib.h * * #GArrowTable is a class for table. Table has zero or more * #GArrowChunkedArrays and zero or more records. + * + * #GArrowFeatherWriteProperties is a class to customize how to write + * Feather data. */ typedef struct GArrowTablePrivate_ { @@ -61,13 +69,13 @@ G_DEFINE_TYPE_WITH_PRIVATE(GArrowTable, GARROW_TABLE(obj))) static void -garrow_table_dispose(GObject *object) +garrow_table_finalize(GObject *object) { auto priv = GARROW_TABLE_GET_PRIVATE(object); priv->table.~shared_ptr(); - G_OBJECT_CLASS(garrow_table_parent_class)->dispose(object); + G_OBJECT_CLASS(garrow_table_parent_class)->finalize(object); } static void @@ -117,13 +125,13 @@ garrow_table_class_init(GArrowTableClass *klass) gobject_class = G_OBJECT_CLASS(klass); - gobject_class->dispose = garrow_table_dispose; + gobject_class->finalize = garrow_table_finalize; gobject_class->set_property = garrow_table_set_property; gobject_class->get_property = garrow_table_get_property; spec = g_param_spec_pointer("table", "Table", - "The raw std::shared *", + "The raw std::shared_ptr *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_TABLE, spec); @@ -632,6 +640,167 @@ garrow_table_combine_chunks(GArrowTable *table, } } + +typedef struct GArrowFeatherWritePropertiesPrivate_ { + arrow::ipc::feather::WriteProperties properties; +} GArrowFeatherWritePropertiesPrivate; + +enum { + PROP_VERSION = 1, + PROP_CHUNK_SIZE, + PROP_COMPRESSION, + PROP_COMPRESSION_LEVEL, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowFeatherWriteProperties, + garrow_feather_write_properties, + G_TYPE_OBJECT) + +#define GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(obj) \ + static_cast( \ + garrow_feather_write_properties_get_instance_private( \ + GARROW_FEATHER_WRITE_PROPERTIES(obj))) + +static void +garrow_feather_write_properties_finalize(GObject *object) +{ + auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object); + + priv->properties.~WriteProperties(); + + G_OBJECT_CLASS(garrow_feather_write_properties_parent_class)->finalize(object); +} + +static void +garrow_feather_write_properties_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_COMPRESSION: + priv->properties.compression = + static_cast(g_value_get_enum(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_feather_write_properties_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_COMPRESSION: + g_value_set_enum(value, 
priv->properties.compression); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_feather_write_properties_init(GArrowFeatherWriteProperties *object) +{ + auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(object); + new(&priv->properties) arrow::ipc::feather::WriteProperties; + priv->properties = arrow::ipc::feather::WriteProperties::Defaults(); +} + +static void +garrow_feather_write_properties_class_init(GArrowFeatherWritePropertiesClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = garrow_feather_write_properties_finalize; + gobject_class->set_property = garrow_feather_write_properties_set_property; + gobject_class->get_property = garrow_feather_write_properties_get_property; + + auto properties = arrow::ipc::feather::WriteProperties::Defaults(); + GParamSpec *spec; + // TODO: version + // TODO: chunk_size + + /** + * GArrowFeatherWriteProperties:compression: + * + * Compression type to use. Only + * %GARROW_COMPRESSION_TYPE_UNCOMPRESSED, + * %GARROW_COMPRESSION_TYPE_LZ4 and %GARROW_COMPRESSION_TYPE_ZSTD + * are supported. The default compression is + * %GARROW_COMPRESSION_TYPE_LZ4 if Apache Arrow C++ is built with + * support for it, otherwise %GARROW_COMPRESSION_TYPE_UNCOMPRESSED. + * %GARROW_COMPRESSION_TYPE_UNCOMPRESSED is set as the object + * default here. + * + * Since: 0.17.0 + */ + spec = g_param_spec_enum("compression", + "Compression", + "The compression type to use", + GARROW_TYPE_COMPRESSION_TYPE, + properties.compression, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_COMPRESSION, spec); + // TODO: compression_level +} + +/** + * garrow_feather_write_properties_new: + * + * Returns: A newly created #GArrowFeatherWriteProperties. + * + * Since: 0.17.0 + */ +GArrowFeatherWriteProperties * +garrow_feather_write_properties_new(void) +{ + auto properties = g_object_new(GARROW_TYPE_FEATHER_WRITE_PROPERTIES, NULL); + return GARROW_FEATHER_WRITE_PROPERTIES(properties); +} + +/** + * garrow_table_write_as_feather: + * @table: A #GArrowTable. + * @sink: The output. + * @properties: (nullable): The properties for this write. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Writes the @table as Feather format data to the @sink. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
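+ *
+ * A minimal usage sketch (assuming the caller already has a @table and an
+ * open @sink; the return value and @error should be checked in real code):
+ *
+ * |[<!-- language="C" -->
+ * GError *error = NULL;
+ * GArrowFeatherWriteProperties *properties =
+ *   garrow_feather_write_properties_new();
+ * g_object_set(properties, "compression",
+ *              GARROW_COMPRESSION_TYPE_ZSTD, NULL);
+ * garrow_table_write_as_feather(table, sink, properties, &error);
+ * g_object_unref(properties);
+ * ]|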
+ * + * Since: 0.17.0 + */ +gboolean +garrow_table_write_as_feather(GArrowTable *table, + GArrowOutputStream *sink, + GArrowFeatherWriteProperties *properties, + GError **error) +{ + auto arrow_table = garrow_table_get_raw(table); + auto arrow_sink = garrow_output_stream_get_raw(sink); + arrow::Status status; + if (properties) { + auto arrow_properties = garrow_feather_write_properties_get_raw(properties); + status = arrow::ipc::feather::WriteTable(*arrow_table, + arrow_sink.get(), + arrow_properties); + } else { + status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get()); + } + return garrow::check(error, status, "[feather-write-file]"); +} + G_END_DECLS GArrowTable * @@ -649,3 +818,9 @@ garrow_table_get_raw(GArrowTable *table) auto priv = GARROW_TABLE_GET_PRIVATE(table); return priv->table; } + +arrow::ipc::feather::WriteProperties & +garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties) +{ + return GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(properties)->properties; +} diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h index 5c00acf11d0..97f4544ef2c 100644 --- a/c_glib/arrow-glib/table.h +++ b/c_glib/arrow-glib/table.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include @@ -110,4 +111,28 @@ GArrowTable* garrow_table_combine_chunks(GArrowTable *table, GError **error); + +#define GARROW_TYPE_FEATHER_WRITE_PROPERTIES \ + (garrow_feather_write_properties_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFeatherWriteProperties, + garrow_feather_write_properties, + GARROW, + FEATHER_WRITE_PROPERTIES, + GObject) +struct _GArrowFeatherWritePropertiesClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_0_17 +GArrowFeatherWriteProperties * +garrow_feather_write_properties_new(void); + +GARROW_AVAILABLE_IN_0_17 +gboolean +garrow_table_write_as_feather(GArrowTable *table, + GArrowOutputStream *sink, + GArrowFeatherWriteProperties *properties, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-glib/table.hpp b/c_glib/arrow-glib/table.hpp index 22b0fad5024..111725e1b8d 100644 --- a/c_glib/arrow-glib/table.hpp +++ b/c_glib/arrow-glib/table.hpp @@ -20,8 +20,12 @@ #pragma once #include +#include #include GArrowTable *garrow_table_new_raw(std::shared_ptr *arrow_table); std::shared_ptr garrow_table_get_raw(GArrowTable *table); + +arrow::ipc::feather::WriteProperties & +garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties); diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index caeee2086d0..074c83af120 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -295,29 +295,6 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink, } } -/** - * garrow_feather_write_file: - * @table: The table to be written. - * @sink: The output of the writer. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: %TRUE on success, %FALSE if there was an error. 
- * - * Since: 0.17.0 - */ -gboolean -garrow_feather_write_file(GArrowTable *table, - GArrowOutputStream *sink, - GError **error) -{ - auto arrow_sink = garrow_output_stream_get_raw(sink); - auto arrow_table = garrow_table_get_raw(table); - - // TODO: Write with options - auto status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get()); - return garrow_error_check(error, status, "[feather-write-file]"); -} - G_END_DECLS GArrowRecordBatchWriter * diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index ee656c6bc4c..a0d22fe626c 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -189,8 +189,4 @@ GArrowRecordBatchFileWriter *garrow_record_batch_file_writer_new( GArrowSchema *schema, GError **error); -gboolean garrow_feather_write_file(GArrowTable *table, - GArrowOutputStream *sink, - GError **error); - G_END_DECLS diff --git a/c_glib/test/test-feather-file-reader.rb b/c_glib/test/test-feather-file-reader.rb index 9127323cda0..33ff6a46b67 100644 --- a/c_glib/test/test-feather-file-reader.rb +++ b/c_glib/test/test-feather-file-reader.rb @@ -22,7 +22,7 @@ def setup_file(table) tempfile = Tempfile.open("arrow-feather-file-reader") output = Arrow::FileOutputStream.new(tempfile.path, false) begin - Arrow::feather_write_file(table, output) + table.write_as_feather(output) ensure output.close end diff --git a/c_glib/test/test-feather-file-writer.rb b/c_glib/test/test-feather-file-writer.rb deleted file mode 100644 index eb7385d2b18..00000000000 --- a/c_glib/test/test-feather-file-writer.rb +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestFeatherFileWriter < Test::Unit::TestCase - include Helper::Buildable - - def test_write - messages = build_string_array(["Crash", "Error", "Shutdown"]) - is_criticals = build_boolean_array([true, true, false]) - table = build_table("message" => messages, - "is_critical" => is_criticals) - - tempfile = Tempfile.open("arrow-feather-file-writer") - - output = Arrow::FileOutputStream.new(tempfile.path, false) - Arrow::feather_write_file(table, output) - output.close - - input = Arrow::MemoryMappedInputStream.new(tempfile.path) - reader = Arrow::FeatherFileReader.new(input) - assert_equal(table, reader.read) - input.close - end -end diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb index acf97fd4c0f..bef1cffd57b 100644 --- a/c_glib/test/test-table.rb +++ b/c_glib/test/test-table.rb @@ -225,5 +225,49 @@ def test_combine_chunks assert_equal([[[true, false, true, false, true, false]]], all_values) end + + sub_test_case("#write_as_feather") do + def setup + super + @tempfile = Tempfile.open("arrow-table-write-as-feather") + begin + yield + ensure + @tempfile.close! 
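+        # Tempfile#close! unlinks the temporary file in addition to closing it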
+      end
+    end
+
+    def read_feather
+      input = Arrow::MemoryMappedInputStream.new(@tempfile.path)
+      reader = Arrow::FeatherFileReader.new(input)
+      begin
+        yield(reader.read)
+      ensure
+        input.close
+      end
+    end
+
+    test("default") do
+      output = Arrow::FileOutputStream.new(@tempfile.path, false)
+      @table.write_as_feather(output)
+      output.close
+
+      read_feather do |read_table|
+        assert_equal(@table, read_table)
+      end
+    end
+
+    test("compression") do
+      output = Arrow::FileOutputStream.new(@tempfile.path, false)
+      properties = Arrow::FeatherWriteProperties.new
+      properties.compression = :zstd
+      @table.write_as_feather(output, properties)
+      output.close
+
+      read_feather do |read_table|
+        assert_equal(@table, read_table)
+      end
+    end
+  end
 end
 end
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb
index e4bff33a0e7..6c44b99ed30 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -155,8 +155,14 @@ def save_as_tsv
   end
 
   def save_as_feather
-    open_output_stream do |output|
-      write_feather_file(output, @table)
+    open_raw_output_stream do |output|
+      properties = FeatherWriteProperties.new
+      properties.class.properties.each do |name|
+        value = @options[name.to_sym]
+        next if value.nil?
+        properties.__send__("#{name}=", value)
+      end
+      @table.write_as_feather(output, properties)
     end
   end
 end
diff --git a/ruby/red-arrow/test/test-feather.rb b/ruby/red-arrow/test/test-feather.rb
index 75113e96fae..d36df5ed6fa 100644
--- a/ruby/red-arrow/test/test-feather.rb
+++ b/ruby/red-arrow/test/test-feather.rb
@@ -18,17 +18,32 @@
 class FeatherTest < Test::Unit::TestCase
   include Helper::Fixture
 
-  def test_save_load
+  def setup
     columns = {
       "message" => Arrow::StringArray.new(["Start", "Crash", "Shutdown"]),
       "is_critical" => Arrow::BooleanArray.new([false, true, false]),
     }
-    table = Arrow::Table.new(columns)
+    @table = Arrow::Table.new(columns)
 
-    output = Tempfile.new(["red-arrow", ".feather"])
-    table.save(output.path)
-    output.close
+    @output = Tempfile.new(["red-arrow", ".feather"])
+    begin
+      yield(@output)
+    ensure
+      @output.close!
+ end + end + + def test_default + @table.save(@output.path) + @output.close + + assert_equal(@table, Arrow::Table.load(@output.path)) + end + + def test_compression + @table.save(@output.path, compression: :zstd) + @output.close - assert_equal(table, Arrow::Table.load(output.path)) + assert_equal(@table, Arrow::Table.load(@output.path)) end end From b312d1729e3f6bc76ee3601b9d45c3bdf53d97e5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sat, 28 Mar 2020 16:23:29 +0900 Subject: [PATCH 10/15] Enable zstd --- .github/workflows/ruby.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index e03615b97da..2877736c70d 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -97,6 +97,7 @@ jobs: ARROW_WITH_LZ4: ON ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON + ARROW_WITH_ZSTD: ON XML_CATALOG_FILES: /usr/local/etc/xml/catalog steps: - name: Checkout Arrow From a72a9a5729060d2740c8fbc1998d1f06a0012b64 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 29 Mar 2020 06:19:47 +0900 Subject: [PATCH 11/15] Use pointer instead of reference for consistency --- c_glib/arrow-glib/file-system.cpp | 49 ++++++++++++++++--------------- c_glib/arrow-glib/file-system.hpp | 2 +- c_glib/arrow-glib/table.cpp | 7 +++-- c_glib/arrow-glib/table.hpp | 2 +- 4 files changed, 31 insertions(+), 29 deletions(-) diff --git a/c_glib/arrow-glib/file-system.cpp b/c_glib/arrow-glib/file-system.cpp index 779d19a4afb..5336f07d3a5 100644 --- a/c_glib/arrow-glib/file-system.cpp +++ b/c_glib/arrow-glib/file-system.cpp @@ -85,27 +85,27 @@ garrow_file_info_set_property(GObject *object, const GValue *value, GParamSpec *pspec) { - auto &arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object)); + auto arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object)); switch (prop_id) { case PROP_FILE_INFO_TYPE: { auto arrow_file_type = static_cast(g_value_get_enum(value)); - arrow_file_info.set_type(arrow_file_type); + arrow_file_info->set_type(arrow_file_type); } break; case PROP_FILE_INFO_PATH: - arrow_file_info.set_path(g_value_get_string(value)); + arrow_file_info->set_path(g_value_get_string(value)); break; case PROP_FILE_INFO_SIZE: - arrow_file_info.set_size(g_value_get_int64(value)); + arrow_file_info->set_size(g_value_get_int64(value)); break; case PROP_FILE_INFO_MTIME: { const gint64 mtime = g_value_get_int64(value); const arrow::fs::TimePoint::duration duration(mtime); - arrow_file_info.set_mtime(arrow::fs::TimePoint(duration)); + arrow_file_info->set_mtime(arrow::fs::TimePoint(duration)); } break; default: @@ -120,35 +120,35 @@ garrow_file_info_get_property(GObject *object, GValue *value, GParamSpec *pspec) { - const auto &arrow_file_info = + const auto arrow_file_info = garrow_file_info_get_raw(GARROW_FILE_INFO(object)); switch (prop_id) { case PROP_FILE_INFO_TYPE: { - const auto arrow_file_type = arrow_file_info.type(); + const auto arrow_file_type = arrow_file_info->type(); const auto file_type = static_cast(arrow_file_type); g_value_set_enum(value, file_type); } break; case PROP_FILE_INFO_PATH: - g_value_set_string(value, arrow_file_info.path().c_str()); + g_value_set_string(value, arrow_file_info->path().c_str()); break; case PROP_FILE_INFO_BASE_NAME: - g_value_set_string(value, arrow_file_info.base_name().c_str()); + g_value_set_string(value, arrow_file_info->base_name().c_str()); break; case PROP_FILE_INFO_DIR_NAME: - g_value_set_string(value, arrow_file_info.dir_name().c_str()); + g_value_set_string(value, 
arrow_file_info->dir_name().c_str()); break; case PROP_FILE_INFO_EXTENSION: - g_value_set_string(value, arrow_file_info.extension().c_str()); + g_value_set_string(value, arrow_file_info->extension().c_str()); break; case PROP_FILE_INFO_SIZE: - g_value_set_int64(value, arrow_file_info.size()); + g_value_set_int64(value, arrow_file_info->size()); break; case PROP_FILE_INFO_MTIME: { - const auto arrow_mtime = arrow_file_info.mtime(); + const auto arrow_mtime = arrow_file_info->mtime(); const auto mtime = arrow_mtime.time_since_epoch().count(); g_value_set_int64(value, mtime); } @@ -317,9 +317,9 @@ gboolean garrow_file_info_equal(GArrowFileInfo *file_info, GArrowFileInfo *other_file_info) { - const auto &arrow_file_info = garrow_file_info_get_raw(file_info); - const auto &arrow_other_file_info = garrow_file_info_get_raw(other_file_info); - return arrow_file_info.Equals(arrow_other_file_info); + const auto arrow_file_info = garrow_file_info_get_raw(file_info); + const auto arrow_other_file_info = garrow_file_info_get_raw(other_file_info); + return arrow_file_info->Equals(*arrow_other_file_info); } /** @@ -333,8 +333,8 @@ garrow_file_info_equal(GArrowFileInfo *file_info, gboolean garrow_file_info_is_file(GArrowFileInfo *file_info) { - const auto &arrow_file_info = garrow_file_info_get_raw(file_info); - return arrow_file_info.IsFile(); + const auto arrow_file_info = garrow_file_info_get_raw(file_info); + return arrow_file_info->IsFile(); } /** @@ -348,8 +348,8 @@ garrow_file_info_is_file(GArrowFileInfo *file_info) gboolean garrow_file_info_is_dir(GArrowFileInfo *file_info) { - const auto &arrow_file_info = garrow_file_info_get_raw(file_info); - return arrow_file_info.IsDirectory(); + const auto arrow_file_info = garrow_file_info_get_raw(file_info); + return arrow_file_info->IsDirectory(); } /** @@ -365,8 +365,8 @@ garrow_file_info_is_dir(GArrowFileInfo *file_info) gchar * garrow_file_info_to_string(GArrowFileInfo *file_info) { - const auto &arrow_file_info = garrow_file_info_get_raw(file_info); - auto string = arrow_file_info.ToString(); + const auto arrow_file_info = garrow_file_info_get_raw(file_info); + auto string = arrow_file_info->ToString(); return g_strndup(string.data(), string.size()); } @@ -1308,10 +1308,11 @@ garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info) return file_info; } -arrow::fs::FileInfo & +arrow::fs::FileInfo * garrow_file_info_get_raw(GArrowFileInfo *file_info) { - return GARROW_FILE_INFO_GET_PRIVATE(file_info)->file_info; + auto priv = GARROW_FILE_INFO_GET_PRIVATE(file_info); + return &(priv->file_info); } std::shared_ptr diff --git a/c_glib/arrow-glib/file-system.hpp b/c_glib/arrow-glib/file-system.hpp index db0e0a45680..175ac093ab2 100644 --- a/c_glib/arrow-glib/file-system.hpp +++ b/c_glib/arrow-glib/file-system.hpp @@ -26,7 +26,7 @@ GArrowFileInfo * garrow_file_info_new_raw(const arrow::fs::FileInfo &arrow_file_info); -arrow::fs::FileInfo & +arrow::fs::FileInfo * garrow_file_info_get_raw(GArrowFileInfo *file_info); std::shared_ptr diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index 2bc4f48a8a2..7b05e252e00 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -794,7 +794,7 @@ garrow_table_write_as_feather(GArrowTable *table, auto arrow_properties = garrow_feather_write_properties_get_raw(properties); status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get(), - arrow_properties); + *arrow_properties); } else { status = arrow::ipc::feather::WriteTable(*arrow_table, arrow_sink.get()); } @@ 
-819,8 +819,9 @@ garrow_table_get_raw(GArrowTable *table) return priv->table; } -arrow::ipc::feather::WriteProperties & +arrow::ipc::feather::WriteProperties * garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties) { - return GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(properties)->properties; + auto priv = GARROW_FEATHER_WRITE_PROPERTIES_GET_PRIVATE(properties); + return &(priv->properties); } diff --git a/c_glib/arrow-glib/table.hpp b/c_glib/arrow-glib/table.hpp index 111725e1b8d..dc972d80cff 100644 --- a/c_glib/arrow-glib/table.hpp +++ b/c_glib/arrow-glib/table.hpp @@ -27,5 +27,5 @@ GArrowTable *garrow_table_new_raw(std::shared_ptr *arrow_table); std::shared_ptr garrow_table_get_raw(GArrowTable *table); -arrow::ipc::feather::WriteProperties & +arrow::ipc::feather::WriteProperties * garrow_feather_write_properties_get_raw(GArrowFeatherWriteProperties *properties); From d41807fa5ee4d88094fd069c6d51ab70182c579b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 29 Mar 2020 16:36:37 -0500 Subject: [PATCH 12/15] Code review comments, check unsupported V1 options more thoroughly in Python --- python/pyarrow/feather.py | 75 ++++++++++++++++------------ python/pyarrow/tests/test_feather.py | 49 ++++++++++++++++-- 2 files changed, 88 insertions(+), 36 deletions(-) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index c8ff237510d..172a9312b44 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -124,13 +124,15 @@ def write_feather(df, dest, compression=None, compression_level=None, dest : str Local destination path. compression : string, default None - {"zstd", "lz4", None} + Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses + LZ4 for V2 files if it is available, otherwise uncompressed. compression_level : int, default None Use a compression level particular to the chosen compressor. If None use the default compression level chunksize : int, default None - For V2 files, the size of chunks to split the data into. None means use - the default, which is currently 64K + For V2 files, the internal maximum size of Arrow RecordBatch chunks + when writing the Arrow IPC file format. None means use the default, + which is currently 64K version : int, default 2 Feather file version. Version 2 is the current. 
Version 1 is the more limited legacy format
@@ -151,8 +153,22 @@ def write_feather(df, dest, compression=None, compression_level=None,
     else:
         table = df
 
-    if version == 1 and len(table.column_names) > len(set(table.column_names)):
-        raise ValueError("cannot serialize duplicate column names")
+    if version == 1:
+        if len(table.column_names) > len(set(table.column_names)):
+            raise ValueError("cannot serialize duplicate column names")
+
+        if compression is not None:
+            raise ValueError("Feather V1 files do not support compression "
+                             "option")
+
+        if chunksize is not None:
+            raise ValueError("Feather V1 files do not support chunksize "
+                             "option")
+
+    supported_compression_options = (None, 'lz4', 'zstd', 'uncompressed')
+    if compression not in supported_compression_options:
+        raise ValueError('compression="{}" not supported, must be one of {}'
+                         .format(compression, supported_compression_options))
 
     try:
         ext.write_feather(table, dest, compression=compression,
@@ -167,19 +183,17 @@ def write_feather(df, dest, compression=None, compression_level=None,
         raise
 
 
-def read_feather(source, columns=None, as_pandas=True, use_threads=True):
+def read_feather(source, columns=None, use_threads=True):
     """
-    Read a pandas.DataFrame from Feather format.
+    Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
+    feather.read_table.
 
     Parameters
     ----------
     source : str file path, or file-like object
-
     columns : sequence, optional
         Only read a specific set of columns. If not provided, all columns are
         read.
-    as_pandas : bool, default True
-        Convert result to pandas.DataFrame
     use_threads: bool, default True
         Whether to parallelize reading using multiple threads.
 
     Returns
     -------
     df : pandas.DataFrame
     """
-    _check_pandas_version()
-    reader = ext.FeatherReader()
-    reader.open(source)
-
-    def _handle_result(result):
-        if as_pandas:
-            result = result.to_pandas(use_threads=use_threads)
-        return result
-
-    if columns is None:
-        return _handle_result(reader.read())
-
-    column_types = [type(column) for column in columns]
-    if all(map(lambda t: t == int, column_types)):
-        return _handle_result(reader.read_indices(columns))
-    elif all(map(lambda t: t == str, column_types)):
-        return _handle_result(reader.read_names(columns))
-
-    column_type_names = [t.__name__ for t in column_types]
-    raise TypeError("Columns must be indices or names. "
-                    "Got columns {} of types {}"
-                    .format(columns, column_type_names))
+    return read_table(source, columns=columns).to_pandas(use_threads=use_threads)
 
 
 def read_table(source, columns=None):
@@ -226,4 +219,20 @@ def read_table(source, columns=None):
     -------
     table : pyarrow.Table
     """
-    return read_feather(source, columns=columns, as_pandas=False)
+    _check_pandas_version()
+    reader = ext.FeatherReader()
+    reader.open(source)
+
+    if columns is None:
+        return reader.read()
+
+    column_types = [type(column) for column in columns]
+    if all(map(lambda t: t == int, column_types)):
+        return reader.read_indices(columns)
+    elif all(map(lambda t: t == str, column_types)):
+        return reader.read_names(columns)
+
+    column_type_names = [t.__name__ for t in column_types]
+    raise TypeError("Columns must be indices or names. 
" + "Got columns {} of types {}" + .format(columns, column_type_names)) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index ff35be5d748..7b74182bde5 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -74,12 +74,14 @@ def test_file_not_exist(): def _check_pandas_roundtrip(df, expected=None, path=None, columns=None, use_threads=False, - version=None, compression=None): + version=None, compression=None, + compression_level=None): if path is None: path = random_path() TEST_FILES.append(path) - write_feather(df, path, compression=compression, version=version) + write_feather(df, path, compression=compression, + compression_level=compression_level, version=version) if not os.path.exists(path): raise Exception('file not written') @@ -526,9 +528,10 @@ class A: def test_v2_set_chunksize(): df = pd.DataFrame({'A': np.arange(1000)}) + table = pa.table(df) buf = io.BytesIO() - write_feather(df, buf, chunksize=250, version=2) + write_feather(table, buf, chunksize=250, version=2) result = buf.getvalue() @@ -537,6 +540,46 @@ def test_v2_set_chunksize(): assert len(ipc_file.get_batch(0)) == 250 +def test_v2_compression_options(): + df = pd.DataFrame({'A': np.arange(1000)}) + + cases = [ + # compression, compression_level + ('uncompressed', None), + ('lz4', None), + ('zstd', 1), + ('zstd', 10) + ] + + for compression, compression_level in cases: + _check_pandas_roundtrip(df, compression=compression, + compression_level=compression_level) + + buf = io.BytesIO() + + # LZ4 doesn't support compression_level + with pytest.raises(pa.ArrowInvalid, + match="doesn't support setting a compression level"): + write_feather(df, buf, compression='lz4', compression_level=10) + + # Trying to compress with V1 + with pytest.raises( + ValueError, + match="Feather V1 files do not support compression option"): + write_feather(df, buf, compression='lz4', version=1) + + # Trying to set chunksize with V1 + with pytest.raises( + ValueError, + match="Feather V1 files do not support chunksize option"): + write_feather(df, buf, chunksize=4096, version=1) + + # Unsupported compressor + with pytest.raises(ValueError, + match='compression="snappy" not supported'): + write_feather(df, buf, compression='snappy') + + @pytest.mark.slow def test_large_dataframe(version): df = pd.DataFrame({'A': np.arange(400000000)}) From 5ce53cb19cb2f202216938e58f7227c682e63674 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 29 Mar 2020 16:43:04 -0500 Subject: [PATCH 13/15] Better error message when trying to write unsupported Arrow type with Feather V1 format --- cpp/src/arrow/ipc/feather.cc | 3 ++- python/pyarrow/tests/test_feather.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 5f3e4af94d1..fc0a78a7b75 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -433,7 +433,8 @@ Result ToFlatbufferType(const DataType& type) { case Type::TIME64: return fbs::Type::INT64; default: - return Status::TypeError("Unsupported Feather V1 type: ", type.ToString()); + return Status::TypeError("Unsupported Feather V1 type: ", type.ToString(), + ". 
Use V2 format to serialize all Arrow types."); } } diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 7b74182bde5..a5aba443836 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -580,6 +580,17 @@ def test_v2_compression_options(): write_feather(df, buf, compression='snappy') +def test_v1_unsupported_types(): + table = pa.table([pa.array([[1, 2, 3], [], None])], names=['f0']) + + buf = io.BytesIO() + with pytest.raises(TypeError, + match=("Unsupported Feather V1 type: " + "list. " + "Use V2 format to serialize all Arrow types.")): + write_feather(table, buf, version=1) + + @pytest.mark.slow def test_large_dataframe(version): df = pd.DataFrame({'A': np.arange(400000000)}) From 5b16bb3206961abcf30067e74c7767c3235ba903 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 29 Mar 2020 17:01:03 -0500 Subject: [PATCH 14/15] [R] Accept either RecordBatch or Table in arrow::write_feather. Use LZ4 as default compression from R if it is available --- r/R/arrowExports.R | 4 ++-- r/R/feather.R | 24 ++++++++++++++++++------ r/man/write_feather.Rd | 9 +++++---- r/src/arrowExports.cpp | 14 +++++++------- r/src/feather.cpp | 10 ++++------ r/tests/testthat/test-feather.R | 4 ++++ 6 files changed, 40 insertions(+), 25 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 2e5ed1ae4a5..dd1f6d620cc 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -708,8 +708,8 @@ dataset___expr__ToString <- function(x){ .Call(`_arrow_dataset___expr__ToString` , x) } -ipc___WriteFeather__RecordBatch <- function(stream, batch, version, chunk_size, compression, compression_level){ - invisible(.Call(`_arrow_ipc___WriteFeather__RecordBatch` , stream, batch, version, chunk_size, compression, compression_level)) +ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level){ + invisible(.Call(`_arrow_ipc___WriteFeather__Table` , stream, table, version, chunk_size, compression, compression_level)) } ipc___feather___Reader__version <- function(reader){ diff --git a/r/R/feather.R b/r/R/feather.R index 08ecf9466c1..b2262630d7a 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -17,7 +17,7 @@ #' Write data in the Feather format #' -#' @param x `data.frame` or `RecordBatch` +#' @param x `data.frame`, `RecordBatch`, or `Table` #' @param sink A file path or an `OutputStream` #' @param version integer Feather file version. Version 2 is the current. #' Version 1 is the more limited legacy format. @@ -25,8 +25,9 @@ #' should have in the file. Use a smaller `chunk_size` when you need faster #' random row access. Default is 64K. This option is not supported for V1. #' @param compression Name of compression codec to use, if any. Default is -#' uncompressed; "lz4" and "zstd" are valid options, but only if your version -#' of the Arrow C++ library was built with support for them. +#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +#' "uncompressed". "zstd" is the other available codec and generally has better +#' compression ratios in exchange for slower read and write performance #' See [codec_is_available()]. This option is not supported for V1. #' @param compression_level If `compression` is "zstd", you may #' specify an integer compression level. 
From 5b16bb3206961abcf30067e74c7767c3235ba903 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 29 Mar 2020 17:01:03 -0500
Subject: [PATCH 14/15] [R] Accept either RecordBatch or Table in
 arrow::write_feather. Use LZ4 as default compression from R if it is
 available

---
 r/R/arrowExports.R              |  4 ++--
 r/R/feather.R                   | 24 ++++++++++++++++++------
 r/man/write_feather.Rd          |  9 +++++----
 r/src/arrowExports.cpp          | 14 +++++++-------
 r/src/feather.cpp               | 10 ++++------
 r/tests/testthat/test-feather.R |  4 ++++
 6 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 2e5ed1ae4a5..dd1f6d620cc 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -708,8 +708,8 @@ dataset___expr__ToString <- function(x){
     .Call(`_arrow_dataset___expr__ToString` , x)
 }
 
-ipc___WriteFeather__RecordBatch <- function(stream, batch, version, chunk_size, compression, compression_level){
-    invisible(.Call(`_arrow_ipc___WriteFeather__RecordBatch` , stream, batch, version, chunk_size, compression, compression_level))
+ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level){
+    invisible(.Call(`_arrow_ipc___WriteFeather__Table` , stream, table, version, chunk_size, compression, compression_level))
 }
 
 ipc___feather___Reader__version <- function(reader){
diff --git a/r/R/feather.R b/r/R/feather.R
index 08ecf9466c1..b2262630d7a 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -17,7 +17,7 @@
 
 #' Write data in the Feather format
 #'
-#' @param x `data.frame` or `RecordBatch`
+#' @param x `data.frame`, `RecordBatch`, or `Table`
 #' @param sink A file path or an `OutputStream`
 #' @param version integer Feather file version. Version 2 is the current.
 #'   Version 1 is the more limited legacy format.
@@ -25,8 +25,9 @@
 #'   should have in the file. Use a smaller `chunk_size` when you need faster
 #'   random row access. Default is 64K. This option is not supported for V1.
 #' @param compression Name of compression codec to use, if any. Default is
-#'   uncompressed; "lz4" and "zstd" are valid options, but only if your version
-#'   of the Arrow C++ library was built with support for them.
+#'   "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+#'   "uncompressed". "zstd" is the other available codec and generally has better
+#'   compression ratios in exchange for slower read and write performance.
 #'   See [codec_is_available()]. This option is not supported for V1.
 #' @param compression_level If `compression` is "zstd", you may
 #'   specify an integer compression level. If omitted, the compression codec's
@@ -45,7 +46,7 @@ write_feather <- function(x,
                           sink,
                           version = 2,
                           chunk_size = 65536L,
-                          compression = c("uncompressed", "lz4", "zstd"),
+                          compression = c("default", "lz4", "uncompressed", "zstd"),
                           compression_level = NULL) {
   # Handle and validate options before touching data
   version <- as.integer(version)
@@ -53,6 +54,13 @@ write_feather <- function(x,
   compression <- match.arg(compression)
   chunk_size <- as.integer(chunk_size)
   assert_that(chunk_size > 0)
+  if (compression == "default") {
+    if (version == 2 && codec_is_available("lz4")) {
+      compression <- "lz4"
+    } else {
+      compression <- "uncompressed"
+    }
+  }
   if (is.null(compression_level)) {
     # Use -1 as sentinal for "default"
     compression_level <- -1L
@@ -81,14 +89,18 @@ write_feather <- function(x,
   if (is.data.frame(x)) {
     x <- record_batch(x)
   }
-  assert_is(x, "RecordBatch")
+
+  if (inherits(x, "RecordBatch")) {
+    x <- Table$create(x)
+  }
+  assert_is(x, "Table")
 
   if (is.character(sink)) {
     sink <- FileOutputStream$create(sink)
     on.exit(sink$close())
   }
   assert_is(sink, "OutputStream")
-  ipc___WriteFeather__RecordBatch(sink, x, version, chunk_size, compression, compression_level)
+  ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level)
 
   invisible(x_out)
 }
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd
index 6b57df20b02..daeb66b6fec 100644
--- a/r/man/write_feather.Rd
+++ b/r/man/write_feather.Rd
@@ -9,12 +9,12 @@ write_feather(
   sink,
   version = 2,
   chunk_size = 65536L,
-  compression = c("uncompressed", "lz4", "zstd"),
+  compression = c("default", "lz4", "uncompressed", "zstd"),
   compression_level = NULL
 )
 }
 \arguments{
-\item{x}{\code{data.frame} or \code{RecordBatch}}
+\item{x}{\code{data.frame}, \code{RecordBatch}, or \code{Table}}
 
 \item{sink}{A file path or an \code{OutputStream}}
 
@@ -26,8 +26,9 @@ should have in the file. Use a smaller \code{chunk_size} when you need faster
 random row access. Default is 64K. This option is not supported for V1.}
 
 \item{compression}{Name of compression codec to use, if any. Default is
-uncompressed; "lz4" and "zstd" are valid options, but only if your version
-of the Arrow C++ library was built with support for them.
+"lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
+"uncompressed". "zstd" is the other available codec and generally has better
+compression ratios in exchange for slower read and write performance.
 See \code{\link[=codec_is_available]{codec_is_available()}}. This option is not supported for V1.}
 
 \item{compression_level}{If \code{compression} is "zstd", you may
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index c0d1bc7c2dc..051d5a6f948 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2739,22 +2739,22 @@ RcppExport SEXP _arrow_dataset___expr__ToString(SEXP x_sexp){
 
 // feather.cpp
 #if defined(ARROW_R_WITH_ARROW)
-void ipc___WriteFeather__RecordBatch(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::RecordBatch>& batch, int version, int chunk_size, arrow::Compression::type compression, int compression_level);
-RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
+void ipc___WriteFeather__Table(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::Table>& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level);
+RcppExport SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
 BEGIN_RCPP
 	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::io::OutputStream>&>::type stream(stream_sexp);
-	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::RecordBatch>&>::type batch(batch_sexp);
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
 	Rcpp::traits::input_parameter<int>::type version(version_sexp);
 	Rcpp::traits::input_parameter<int>::type chunk_size(chunk_size_sexp);
 	Rcpp::traits::input_parameter<arrow::Compression::type>::type compression(compression_sexp);
 	Rcpp::traits::input_parameter<int>::type compression_level(compression_level_sexp);
-	ipc___WriteFeather__RecordBatch(stream, batch, version, chunk_size, compression, compression_level);
+	ipc___WriteFeather__Table(stream, table, version, chunk_size, compression, compression_level);
 	return R_NilValue;
 END_RCPP
 }
 #else
-RcppExport SEXP _arrow_ipc___WriteFeather__RecordBatch(SEXP stream_sexp, SEXP batch_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
-	Rf_error("Cannot call ipc___WriteFeather__RecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. ");
+RcppExport SEXP _arrow_ipc___WriteFeather__Table(SEXP stream_sexp, SEXP table_sexp, SEXP version_sexp, SEXP chunk_size_sexp, SEXP compression_sexp, SEXP compression_level_sexp){
+	Rf_error("Cannot call ipc___WriteFeather__Table(). Please use arrow::install_arrow() to install required runtime libraries. ");
"); } #endif @@ -5933,7 +5933,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_dataset___expr__is_valid", (DL_FUNC) &_arrow_dataset___expr__is_valid, 1}, { "_arrow_dataset___expr__scalar", (DL_FUNC) &_arrow_dataset___expr__scalar, 1}, { "_arrow_dataset___expr__ToString", (DL_FUNC) &_arrow_dataset___expr__ToString, 1}, - { "_arrow_ipc___WriteFeather__RecordBatch", (DL_FUNC) &_arrow_ipc___WriteFeather__RecordBatch, 6}, + { "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6}, { "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1}, { "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2}, { "_arrow_ipc___feather___Reader__Open", (DL_FUNC) &_arrow_ipc___feather___Reader__Open, 1}, diff --git a/r/src/feather.cpp b/r/src/feather.cpp index 35b8037ed6f..7ab73606f8b 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -22,10 +22,10 @@ // ---------- WriteFeather // [[arrow::export]] -void ipc___WriteFeather__RecordBatch( - const std::shared_ptr& stream, - const std::shared_ptr& batch, int version, int chunk_size, - arrow::Compression::type compression, int compression_level) { +void ipc___WriteFeather__Table(const std::shared_ptr& stream, + const std::shared_ptr& table, int version, + int chunk_size, arrow::Compression::type compression, + int compression_level) { auto properties = arrow::ipc::feather::WriteProperties::Defaults(); properties.version = version; properties.chunksize = chunk_size; @@ -33,8 +33,6 @@ void ipc___WriteFeather__RecordBatch( if (compression_level != -1) { properties.compression_level = compression_level; } - std::shared_ptr table; - STOP_IF_NOT_OK(arrow::Table::FromRecordBatches({batch}, &table)); STOP_IF_NOT_OK(arrow::ipc::feather::WriteTable(*table, stream.get(), properties)); } diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index fad0e47cd16..7ae52eb2118 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -76,6 +76,10 @@ test_that("feather read/write round trip", { expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd")) expect_feather_roundtrip(function(x, f) write_feather(x, f, compression = "zstd", compression_level = 3)) } + + # Write from Arrow data structures + expect_feather_roundtrip(function(x, f) write_feather(RecordBatch$create(x), f)) + expect_feather_roundtrip(function(x, f) write_feather(Table$create(x), f)) }) test_that("write_feather option error handling", { From 4ebac36e7bef3170be05f4278e2f662071111bf3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 29 Mar 2020 17:05:29 -0500 Subject: [PATCH 15/15] [R] more concise conversion to Table --- r/R/feather.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/r/R/feather.R b/r/R/feather.R index b2262630d7a..9d5e0375949 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -86,11 +86,7 @@ write_feather <- function(x, compression <- compression_from_name(compression) x_out <- x - if (is.data.frame(x)) { - x <- record_batch(x) - } - - if (inherits(x, "RecordBatch")) { + if (is.data.frame(x) || inherits(x, "RecordBatch")) { x <- Table$create(x) } assert_is(x, "Table")