From 55011be634e39fcfad83b91b027f99c16001b596 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 21 Jun 2020 18:22:33 +0000 Subject: [PATCH 01/63] Add ByteSwap for 8-bit and floating point --- cpp/src/arrow/util/bit_util.h | 21 +++++++++++++++++---- cpp/src/arrow/util/bit_util_test.cc | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 74f7e61e9cc..89e2e8b43d6 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -322,6 +322,15 @@ static inline uint16_t ByteSwap(uint16_t value) { return static_cast(ByteSwap(static_cast(value))); } static inline uint8_t ByteSwap(uint8_t value) { return value; } +static inline int8_t ByteSwap(int8_t value) { return value; } +static inline double ByteSwap(double value) { + auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); + return *reinterpret_cast(&swapped); +} +static inline float ByteSwap(float value) { + auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); + return *reinterpret_cast(&swapped); +} // Write the swapped bytes into dst. Src and dst cannot overlap. static inline void ByteSwap(void* dst, const void* src, int len) { @@ -353,28 +362,32 @@ static inline void ByteSwap(void* dst, const void* src, int len) { #if ARROW_LITTLE_ENDIAN template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T ToBigEndian(T value) { return ByteSwap(value); } template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T ToLittleEndian(T value) { return value; } #else template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T ToBigEndian(T value) { return value; } template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T ToLittleEndian(T value) { return ByteSwap(value); } diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index c71abde9409..9ddd3de10ca 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1786,6 +1786,20 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11)), 0x11); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11)), 0x11); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + uint32_t srci32 = 0xaabbccdd, expectedi32 = 0xddccbbaa; + EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci32)), + *reinterpret_cast(expectedi32)); + uint64_t srci64 = 0xaabb11223344ccdd, expectedi64 = 0xddcc44332211bbaa; + EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci64)), + *reinterpret_cast(expectedi64)); } TEST(BitUtil, Log2) { From 2a9178b468936ed5b56c6be3035c1957888c4b06 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 21 Jun 2020 18:24:05 +0000 Subject: [PATCH 02/63] add endianness flag to arrow::schema --- cpp/src/arrow/type.cc | 24 +++++++++++++++++++++++- cpp/src/arrow/type.h | 16 ++++++++++++++++ cpp/src/arrow/type_fwd.h | 12 ++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 12d3951865f..0bbc83120e6 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1301,26 +1301,37 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } class Schema::Impl { public: Impl(std::vector> fields, + Endianness endianness, std::shared_ptr metadata) : fields_(std::move(fields)), + endianness_(endianness), name_to_index_(CreateNameToIndexMap(fields_)), metadata_(std::move(metadata)) {} std::vector> fields_; + Endianness endianness_; std::unordered_multimap name_to_index_; std::shared_ptr metadata_; }; Schema::Schema(std::vector> fields, + Endianness endianness, std::shared_ptr metadata) : detail::Fingerprintable(), - impl_(new Impl(std::move(fields), std::move(metadata))) {} + impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {} + +Schema::Schema(std::vector> fields, + std::shared_ptr metadata) + : detail::Fingerprintable(), + impl_(new Impl(std::move(fields), NATIVE_ENDIANNESS, std::move(metadata))) {} Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} Schema::~Schema() = default; +Endianness Schema::endianness() const { return impl_->endianness_; } + int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } const std::shared_ptr& Schema::field(int i) const { @@ -1338,6 +1349,11 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const { return true; } + // checks endianness equality + if (endianness() != other.endianness()) { + return false; + } + // checks field equality if (num_fields() != other.num_fields()) { return false; @@ -1661,6 +1677,12 @@ std::shared_ptr schema(std::vector> fields, return std::make_shared(std::move(fields), std::move(metadata)); } +std::shared_ptr schema(std::vector> fields, + Endianness endianness, + std::shared_ptr metadata) { + return std::make_shared(std::move(fields), endianness, std::move(metadata)); +} + Result> UnifySchemas( const std::vector>& schemas, const Field::MergeOptions field_merge_options) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 56718b7c512..92330d09306 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1604,6 +1604,15 @@ class ARROW_EXPORT FieldRef { // ---------------------------------------------------------------------- // Schema +enum class Endianness { LITTLE, BIG }; +#if ARROW_LITTLE_ENDIAN +#define NATIVE_ENDIANNESS Endianness::LITTLE +#define NONNATIVE_ENDIANNESS Endianness::BIG +#else +#define NATIVE_ENDIANNESS Endianness::BIG +#define NONNATIVE_ENDIANNESS Endianness::LITTLE +#endif + /// \class Schema /// \brief Sequence of arrow::Field objects describing the columns of a record /// batch or table data structure @@ -1611,6 +1620,10 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, public util::EqualityComparable, public util::ToStringOstreamable { public: + explicit Schema(std::vector> fields, + Endianness endianness, + std::shared_ptr metadata = NULLPTR); + explicit Schema(std::vector> fields, std::shared_ptr metadata = NULLPTR); @@ -1622,6 +1635,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, bool Equals(const Schema& other, bool check_metadata = false) const; bool Equals(const std::shared_ptr& other, bool check_metadata = false) const; + /// Return endianness in the schema + Endianness endianness() const; + /// \brief Return the number of fields (columns) in the schema int num_fields() const; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index f1000d1fe7f..14329675c8f 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -52,6 +52,7 @@ class DataType; class Field; class FieldRef; class KeyValueMetadata; +enum class Endianness; class Schema; using DataTypeVector = std::vector>; @@ -635,6 +636,17 @@ std::shared_ptr schema( std::vector> fields, std::shared_ptr metadata = NULLPTR); +/// \brief Create a Schema instance +/// +/// \param fields the schema's fields +/// \param endianness the endianness of the data +/// \param metadata any custom key-value metadata, default null +/// \return schema shared_ptr to Schema +ARROW_EXPORT +std::shared_ptr schema( + std::vector> fields, Endianness endianness, + std::shared_ptr metadata = NULLPTR); + /// @} /// Return the process-wide default memory pool. From 3c15729728a6c618fc65822be311b4ad52b1b25d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 13 Jul 2020 03:59:11 +0000 Subject: [PATCH 03/63] fix test failure --- cpp/src/arrow/util/bit_util.h | 2 +- cpp/src/arrow/util/bit_util_test.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 89e2e8b43d6..daa91a20e5d 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -328,7 +328,7 @@ static inline double ByteSwap(double value) { return *reinterpret_cast(&swapped); } static inline float ByteSwap(float value) { - auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); + auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(&value)); return *reinterpret_cast(&swapped); } diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index 9ddd3de10ca..f9f6fdf1ada 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1796,10 +1796,10 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); uint32_t srci32 = 0xaabbccdd, expectedi32 = 0xddccbbaa; EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci32)), - *reinterpret_cast(expectedi32)); + *reinterpret_cast(&expectedi32)); uint64_t srci64 = 0xaabb11223344ccdd, expectedi64 = 0xddcc44332211bbaa; EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci64)), - *reinterpret_cast(expectedi64)); + *reinterpret_cast(&expectedi64)); } TEST(BitUtil, Log2) { From e3902e62155fbd296d2f29505f9bac4a6b2b50cb Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 14:23:04 +0000 Subject: [PATCH 04/63] add missing types of functions to convert endianness --- cpp/src/arrow/util/bit_util.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index daa91a20e5d..77cb1a992fa 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -397,28 +397,32 @@ static inline T ToLittleEndian(T value) { #if ARROW_LITTLE_ENDIAN template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T FromBigEndian(T value) { return ByteSwap(value); } template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T FromLittleEndian(T value) { return value; } #else template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T FromBigEndian(T value) { return value; } template > + int16_t, uint16_t, uint8_t, int8_t, + float, double>> static inline T FromLittleEndian(T value) { return ByteSwap(value); } From e78bbb970af1f6d41c8068b146287920633ed0aa Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 14:26:01 +0000 Subject: [PATCH 05/63] support to convert endian of each element of primitive types if the endianness in the received schema is dirrent from the native endianness --- cpp/src/arrow/array/util.cc | 84 ++++++++++++++++++++++++++ cpp/src/arrow/array/util.h | 5 ++ cpp/src/arrow/ipc/metadata_internal.cc | 5 +- cpp/src/arrow/ipc/reader.cc | 39 +++++++++--- cpp/src/arrow/ipc/reader.h | 6 ++ cpp/src/arrow/type.cc | 4 ++ cpp/src/arrow/type.h | 7 ++- 7 files changed, 136 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 0d498931d42..8a53575644b 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -74,6 +74,83 @@ class ArrayDataWrapper { std::shared_ptr* out_; }; +class ArrayDataEndianSwapper { + public: + ArrayDataEndianSwapper(std::shared_ptr& data) : data_(data) {} + + template + Status Visit(const T&) { + using value_type = typename T::c_type; + auto buffer = const_cast( + reinterpret_cast(data_->buffers[1]->data())); + int64_t length = data_->length; + for (int64_t i = 0; i < length; i++) { +#if ARROW_LITTLE_ENDIAN + buffer[i] = BitUtil::FromBigEndian(buffer[i]); +#else + buffer[i] = BitUtil::FromLittleEndian(buffer[i]); +#endif + } + return Status::OK(); + } + + Status Visit(const NullType& type) { return Status::OK(); } + Status Visit(const BooleanType& type) { return Status::OK(); } + Status Visit(const StringType& type) { return Status::OK(); } + Status Visit(const LargeStringType& type) { return Status::OK(); } + Status Visit(const BinaryType& type) { return Status::OK(); } + Status Visit(const LargeBinaryType& type) { return Status::OK(); } + Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } + + Status Visit(const Decimal128Type& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const ListType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const LargeListType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const FixedSizeListType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const MapType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const StructType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const SparseUnionType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const DenseUnionType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + Status Visit(const DictionaryType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + + Status Visit(const ExtensionType& type) { + assert(false && "not supported yet"); + return Status::OK(); + } + + std::shared_ptr& data_; +}; + } // namespace internal std::shared_ptr MakeArray(const std::shared_ptr& data) { @@ -84,6 +161,13 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { return out; } +void SwapEndianArrayData(std::shared_ptr& data) { + if (data->buffers[1] != NULLPTR) { + internal::ArrayDataEndianSwapper swapper_visitor(data); + DCHECK_OK(VisitTypeInline(*data->type, &swapper_visitor)); + } +} + // ---------------------------------------------------------------------- // Misc APIs diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index b400255c18e..77ef124e226 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -37,6 +37,11 @@ namespace arrow { ARROW_EXPORT std::shared_ptr MakeArray(const std::shared_ptr& data); +/// \brief Swap endian of each element in a generic ArrayData +/// \param[in] data the array contents to be swapped +ARROW_EXPORT +void SwapEndianArrayData(std::shared_ptr& data); + /// \brief Create a strongly-typed Array instance with all elements null /// \param[in] type the array type /// \param[in] length the array length diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index f818aebab24..f0d2555bf01 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1335,7 +1335,10 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr metadata; RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata)); - *out = ::arrow::schema(std::move(fields), metadata); + // set endianess using the value in flatbuf schema + auto endianness = schema->endianness() == flatbuf::Endianness::Little ? + Endianness::LITTLE : Endianness::BIG; + *out = ::arrow::schema(std::move(fields), endianness, metadata); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 82fb4c743a4..6d843d4648b 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -441,7 +441,7 @@ Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector* inclusion_mask, const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file) { + Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) { ArrayLoader loader(metadata, metadata_version, options, file); ArrayDataVector columns(schema->num_fields()); @@ -485,6 +485,12 @@ Result> LoadRecordBatchSubset( RETURN_NOT_OK(DecompressBuffers(compression, options, &filtered_columns)); } + // swap endian in a set of ArrayData if necessary (swap_endian == true) + if (swap_endian) { + for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { + SwapEndianArrayData(filtered_columns[i]); + } + } return RecordBatch::Make(filtered_schema, metadata->length(), std::move(filtered_columns)); } @@ -493,13 +499,13 @@ Result> LoadRecordBatch( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file) { + Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) { if (inclusion_mask.size() > 0) { return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, dictionary_memo, - options, metadata_version, compression, file); + options, metadata_version, compression, file, swap_endian); } else { return LoadRecordBatchSubset(metadata, schema, nullptr, dictionary_memo, options, - metadata_version, compression, file); + metadata_version, compression, file, swap_endian); } } @@ -578,7 +584,7 @@ Result> ReadRecordBatch( Result> ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, io::RandomAccessFile* file) { + const IpcReadOptions& options, io::RandomAccessFile* file, bool swap_endian = false) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); auto batch = message->header_as_RecordBatch(); @@ -597,7 +603,7 @@ Result> ReadRecordBatchInternal( } return LoadRecordBatch(batch, schema, inclusion_mask, dictionary_memo, options, internal::GetMetadataVersion(message->version()), compression, - file); + file, swap_endian); } // If we are selecting only certain fields, populate an inclusion mask for fast lookups. @@ -755,8 +761,14 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { return Status::Invalid("Tried reading schema message, was null or length 0"); } - return UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_); + auto status = UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, + &out_schema_, &field_inclusion_mask_); + swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + if (swap_endian_) { + // set native endianness to the schemas before actually swapping endian in ArrayData + out_schema_->setNativeEndianness(); + } + return status; } Status ReadNext(std::shared_ptr* batch) override { @@ -789,7 +801,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get()) + &dictionary_memo_, options_, reader.get(), + swap_endian_) .Value(batch); } @@ -944,7 +957,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { ARROW_ASSIGN_OR_RAISE( auto batch, ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get())); + &dictionary_memo_, options_, reader.get(), + swap_endian_)); ++stats_.num_record_batches; return batch; } @@ -965,6 +979,11 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); + swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + if (swap_endian_) { + // set native endianness to the schemas before actually swapping endian in ArrayData + out_schema_->setNativeEndianness(); + } ++stats_.num_messages; return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index fe9a3b72e16..3d4f0a31a47 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -96,6 +96,9 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// \brief Return current read statistics virtual ReadStats stats() const = 0; + + protected: + bool swap_endian_; }; /// \brief Reads the record batch file format @@ -169,6 +172,9 @@ class ARROW_EXPORT RecordBatchFileReader { /// \brief Return current read statistics virtual ReadStats stats() const = 0; + + protected: + bool swap_endian_; }; /// \brief A general listener class to receive events. diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0bbc83120e6..3ee459008b6 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1330,6 +1330,8 @@ Schema::Schema(const Schema& schema) Schema::~Schema() = default; +void Schema::setNativeEndianness() { impl_->endianness_ = NATIVE_ENDIANNESS; } + Endianness Schema::endianness() const { return impl_->endianness_; } int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } @@ -1489,6 +1491,8 @@ std::shared_ptr Schema::RemoveMetadata() const { std::string Schema::ToString(bool show_metadata) const { std::stringstream buffer; + buffer << "Endianess: " << (impl_->endianness_ == Endianness::LITTLE ? "little" : "big") + << std::endl; int i = 0; for (const auto& field : impl_->fields_) { if (i > 0) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 92330d09306..bcad10c95b8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1607,10 +1607,8 @@ class ARROW_EXPORT FieldRef { enum class Endianness { LITTLE, BIG }; #if ARROW_LITTLE_ENDIAN #define NATIVE_ENDIANNESS Endianness::LITTLE -#define NONNATIVE_ENDIANNESS Endianness::BIG #else #define NATIVE_ENDIANNESS Endianness::BIG -#define NONNATIVE_ENDIANNESS Endianness::LITTLE #endif /// \class Schema @@ -1635,7 +1633,10 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, bool Equals(const Schema& other, bool check_metadata = false) const; bool Equals(const std::shared_ptr& other, bool check_metadata = false) const; - /// Return endianness in the schema + /// \brief Set platform-native endianness in the schema + void setNativeEndianness(); + + /// \brief Return endianness in the schema Endianness endianness() const; /// \brief Return the number of fields (columns) in the schema From db60ec5b19ce44874f6b1cd1e57e445c489b81f6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 15:07:17 +0000 Subject: [PATCH 06/63] fix lint errors --- cpp/src/arrow/array/util.cc | 4 +- cpp/src/arrow/ipc/metadata_internal.cc | 5 ++- cpp/src/arrow/type.cc | 6 +-- cpp/src/arrow/type.h | 3 +- cpp/src/arrow/util/bit_util.h | 56 +++++++++++--------------- cpp/src/arrow/util/bit_util_test.cc | 2 +- 6 files changed, 33 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 8a53575644b..6ca365afaf8 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -81,8 +81,8 @@ class ArrayDataEndianSwapper { template Status Visit(const T&) { using value_type = typename T::c_type; - auto buffer = const_cast( - reinterpret_cast(data_->buffers[1]->data())); + auto buffer = const_cast( + reinterpret_cast(data_->buffers[1]->data())); int64_t length = data_->length; for (int64_t i = 0; i < length; i++) { #if ARROW_LITTLE_ENDIAN diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index f0d2555bf01..b4520121cd8 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1336,8 +1336,9 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr metadata; RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata)); // set endianess using the value in flatbuf schema - auto endianness = schema->endianness() == flatbuf::Endianness::Little ? - Endianness::LITTLE : Endianness::BIG; + auto endianness = schema->endianness() == flatbuf::Endianness::Little + ? Endianness::LITTLE + : Endianness::BIG; *out = ::arrow::schema(std::move(fields), endianness, metadata); return Status::OK(); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 3ee459008b6..ed164e80804 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1300,8 +1300,7 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } class Schema::Impl { public: - Impl(std::vector> fields, - Endianness endianness, + Impl(std::vector> fields, Endianness endianness, std::shared_ptr metadata) : fields_(std::move(fields)), endianness_(endianness), @@ -1314,8 +1313,7 @@ class Schema::Impl { std::shared_ptr metadata_; }; -Schema::Schema(std::vector> fields, - Endianness endianness, +Schema::Schema(std::vector> fields, Endianness endianness, std::shared_ptr metadata) : detail::Fingerprintable(), impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {} diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index bcad10c95b8..0fa86a2a1fe 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1618,8 +1618,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, public util::EqualityComparable, public util::ToStringOstreamable { public: - explicit Schema(std::vector> fields, - Endianness endianness, + explicit Schema(std::vector> fields, Endianness endianness, std::shared_ptr metadata = NULLPTR); explicit Schema(std::vector> fields, diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 77cb1a992fa..6ccda476016 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -360,34 +360,30 @@ static inline void ByteSwap(void* dst, const void* src, int len) { // Convert to little/big endian format from the machine's native endian format. #if ARROW_LITTLE_ENDIAN -template > +template > static inline T ToBigEndian(T value) { return ByteSwap(value); } -template > +template > static inline T ToLittleEndian(T value) { return value; } #else -template > +template > static inline T ToBigEndian(T value) { return value; } -template > +template > static inline T ToLittleEndian(T value) { return ByteSwap(value); } @@ -395,34 +391,30 @@ static inline T ToLittleEndian(T value) { // Convert from big/little endian format to the machine's native endian format. #if ARROW_LITTLE_ENDIAN -template > +template > static inline T FromBigEndian(T value) { return ByteSwap(value); } -template > +template > static inline T FromLittleEndian(T value) { return value; } #else -template > +template > static inline T FromBigEndian(T value) { return value; } -template > +template > static inline T FromLittleEndian(T value) { return ByteSwap(value); } diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index f9f6fdf1ada..cd9086d3ee8 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -1796,7 +1796,7 @@ TEST(BitUtil, ByteSwap) { EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); uint32_t srci32 = 0xaabbccdd, expectedi32 = 0xddccbbaa; EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci32)), - *reinterpret_cast(&expectedi32)); + *reinterpret_cast(&expectedi32)); uint64_t srci64 = 0xaabb11223344ccdd, expectedi64 = 0xddcc44332211bbaa; EXPECT_EQ(BitUtil::ByteSwap(*reinterpret_cast(&srci64)), *reinterpret_cast(&expectedi64)); From 4d5e4c2577f5b9da0a5ec7e96e8b4b3175f8bfda Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 15:46:35 +0000 Subject: [PATCH 07/63] fix test failures --- cpp/src/arrow/ipc/reader.cc | 6 +++--- cpp/src/arrow/type.h | 1 + cpp/src/arrow/type_test.cc | 24 ++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6d843d4648b..6b22f644a08 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -761,14 +761,14 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { return Status::Invalid("Tried reading schema message, was null or length 0"); } - auto status = UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_); + RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, + &out_schema_, &field_inclusion_mask_)); swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; if (swap_endian_) { // set native endianness to the schemas before actually swapping endian in ArrayData out_schema_->setNativeEndianness(); } - return status; + return Status::OK(); } Status ReadNext(std::shared_ptr* batch) override { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 0fa86a2a1fe..f1a69973ac7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -29,6 +29,7 @@ #include "arrow/result.h" #include "arrow/type_fwd.h" // IWYU pragma: export +#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" #include "arrow/util/variant.h" diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 81a0315d6d1..b6cfdb5ebba 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -487,20 +487,40 @@ TEST_F(TestSchema, ToString) { auto schema = ::arrow::schema({f0, f1, f2, f3}, metadata); std::string result = schema->ToString(); - std::string expected = R"(f0: int32 +#if ARROW_LITTLE_ENDIAN + std::string expected = R"(Endianess: little +f0: int32 f1: uint8 not null f2: string f3: list)"; +#else + std::string expected = R"(Endianess: big +f0: int32 +f1: uint8 not null +f2: string +f3: list)"; +#endif ASSERT_EQ(expected, result); result = schema->ToString(/*print_metadata=*/true); - expected = R"(f0: int32 +#if ARROW_LITTLE_ENDIAN + expected = R"(Endianess: little +f0: int32 +f1: uint8 not null +f2: string +f3: list +-- metadata -- +foo: bar)"; +#else + expected = R"(Endianess: big +f0: int32 f1: uint8 not null f2: string f3: list -- metadata -- foo: bar)"; +#endif ASSERT_EQ(expected, result); } From 8735963efa254938e39bc9bcd68bf9b78fc28ee9 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 17:31:13 +0000 Subject: [PATCH 08/63] fix lint error --- cpp/src/arrow/array/util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 6ca365afaf8..f033b1ca420 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -76,7 +76,7 @@ class ArrayDataWrapper { class ArrayDataEndianSwapper { public: - ArrayDataEndianSwapper(std::shared_ptr& data) : data_(data) {} + explicit ArrayDataEndianSwapper(std::shared_ptr& data) : data_(data) {} template Status Visit(const T&) { From 37518a1121b29aed674fc4e044e7f22a84270cf5 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 17:38:41 +0000 Subject: [PATCH 09/63] fix compilation error with gcc --- cpp/src/arrow/util/bit_util.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 6ccda476016..f793811e3d0 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -324,12 +324,28 @@ static inline uint16_t ByteSwap(uint16_t value) { static inline uint8_t ByteSwap(uint8_t value) { return value; } static inline int8_t ByteSwap(int8_t value) { return value; } static inline double ByteSwap(double value) { +#if defined(__GNUC__) + // avoid dereferencing type-punned pointer will break strict-aliasing rules + int8_t *in_ptr = reinterpret_cast(&value); + auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(in_ptr)); + int8_t *out_ptr = reinterpret_cast(&swapped); + return *reinterpret_cast(out_ptr); +#else auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); return *reinterpret_cast(&swapped); +#endif } static inline float ByteSwap(float value) { +#if defined(__GNUC__) + // avoid dereferencing type-punned pointer will break strict-aliasing rules + int8_t *in_ptr = reinterpret_cast(&value); + auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(in_ptr)); + int8_t *out_ptr = reinterpret_cast(&swapped); + return *reinterpret_cast(out_ptr); +#else auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(&value)); return *reinterpret_cast(&swapped); +#endif } // Write the swapped bytes into dst. Src and dst cannot overlap. From 1d6e78e137cda5f2ec49d0627ff720843dc52709 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 17:45:26 +0000 Subject: [PATCH 10/63] fix lint errors --- cpp/src/arrow/util/bit_util.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index f793811e3d0..775e08272b2 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -326,9 +326,9 @@ static inline int8_t ByteSwap(int8_t value) { return value; } static inline double ByteSwap(double value) { #if defined(__GNUC__) // avoid dereferencing type-punned pointer will break strict-aliasing rules - int8_t *in_ptr = reinterpret_cast(&value); + int8_t* in_ptr = reinterpret_cast(&value); auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(in_ptr)); - int8_t *out_ptr = reinterpret_cast(&swapped); + int8_t* out_ptr = reinterpret_cast(&swapped); return *reinterpret_cast(out_ptr); #else auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); @@ -338,9 +338,9 @@ static inline double ByteSwap(double value) { static inline float ByteSwap(float value) { #if defined(__GNUC__) // avoid dereferencing type-punned pointer will break strict-aliasing rules - int8_t *in_ptr = reinterpret_cast(&value); + int8_t* in_ptr = reinterpret_cast(&value); auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(in_ptr)); - int8_t *out_ptr = reinterpret_cast(&swapped); + int8_t* out_ptr = reinterpret_cast(&swapped); return *reinterpret_cast(out_ptr); #else auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(&value)); From af79bbfb208b98c7b4a625afab205e3120f93330 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 19 Jul 2020 18:40:07 +0000 Subject: [PATCH 11/63] revert change in Schema.ToString --- cpp/src/arrow/type.cc | 2 -- cpp/src/arrow/type_test.cc | 24 ++---------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ed164e80804..617b7055e7e 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1489,8 +1489,6 @@ std::shared_ptr Schema::RemoveMetadata() const { std::string Schema::ToString(bool show_metadata) const { std::stringstream buffer; - buffer << "Endianess: " << (impl_->endianness_ == Endianness::LITTLE ? "little" : "big") - << std::endl; int i = 0; for (const auto& field : impl_->fields_) { if (i > 0) { diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index b6cfdb5ebba..81a0315d6d1 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -487,40 +487,20 @@ TEST_F(TestSchema, ToString) { auto schema = ::arrow::schema({f0, f1, f2, f3}, metadata); std::string result = schema->ToString(); -#if ARROW_LITTLE_ENDIAN - std::string expected = R"(Endianess: little -f0: int32 + std::string expected = R"(f0: int32 f1: uint8 not null f2: string f3: list)"; -#else - std::string expected = R"(Endianess: big -f0: int32 -f1: uint8 not null -f2: string -f3: list)"; -#endif ASSERT_EQ(expected, result); result = schema->ToString(/*print_metadata=*/true); -#if ARROW_LITTLE_ENDIAN - expected = R"(Endianess: little -f0: int32 -f1: uint8 not null -f2: string -f3: list --- metadata -- -foo: bar)"; -#else - expected = R"(Endianess: big -f0: int32 + expected = R"(f0: int32 f1: uint8 not null f2: string f3: list -- metadata -- foo: bar)"; -#endif ASSERT_EQ(expected, result); } From 2c20ef74291b5026a787e2c13346ff917be220d7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jul 2020 04:58:01 +0000 Subject: [PATCH 12/63] support to convert endian of each element for all types --- cpp/src/arrow/array/util.cc | 146 ++++++++++++++++++++++++++++++------ cpp/src/arrow/ipc/reader.cc | 25 ++++-- 2 files changed, 140 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index f033b1ca420..170137419b6 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -76,14 +76,61 @@ class ArrayDataWrapper { class ArrayDataEndianSwapper { public: - explicit ArrayDataEndianSwapper(std::shared_ptr& data) : data_(data) {} + ArrayDataEndianSwapper(std::shared_ptr& data, int64_t length) + : data_(data), length_(length) {} + + Status SwapType(const DataType& type) { return VisitTypeInline(type, this); } + + Status SwapChildren(std::vector> child_fields) { + int i = 0; + for (const auto& child_field : child_fields) { + auto orig_data = data_; + auto orig_length = length_; + data_ = data_->child_data[i++]; + length_ = data_->length; + RETURN_NOT_OK(SwapType(*child_field.get()->type())); + length_ = orig_length; + data_ = orig_data; + } + return Status::OK(); + } + + template + Status SwapOffset(const T&, int index) { + if (data_->buffers[index] == nullptr) { return Status::OK(); } + using value_type = typename T::c_type; + auto buffer = const_cast( + reinterpret_cast(data_->buffers[index]->data())); + // offset has one more element rather than data->length + int64_t length = length_ + 1; + for (int64_t i = 0; i < length; i++) { +#if ARROW_LITTLE_ENDIAN + buffer[i] = BitUtil::FromBigEndian(buffer[i]); +#else + buffer[i] = BitUtil::FromLittleEndian(buffer[i]); +#endif + } + return Status::OK(); + } + + Status SwapSmallOffset(int index = 1) { + Int32Type i32; + RETURN_NOT_OK(SwapOffset(i32, index)); + return Status::OK(); + } + + Status SwapLargeOffset() { + Int64Type i64; + RETURN_NOT_OK(SwapOffset(i64, 1)); + return Status::OK(); + } template Status Visit(const T&) { using value_type = typename T::c_type; auto buffer = const_cast( reinterpret_cast(data_->buffers[1]->data())); - int64_t length = data_->length; + int64_t length = length_; for (int64_t i = 0; i < length; i++) { #if ARROW_LITTLE_ENDIAN buffer[i] = BitUtil::FromBigEndian(buffer[i]); @@ -94,61 +141,116 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const Decimal128Type& type) { + auto buffer = const_cast( + reinterpret_cast(data_->buffers[1]->data())); + int64_t length = length_; + for (int64_t i = 0; i < length; i++) { + uint64_t tmp; + auto idx = i * 2; +#if ARROW_LITTLE_ENDIAN + tmp = BitUtil::FromBigEndian(buffer[idx]); + buffer[idx] = BitUtil::FromBigEndian(buffer[idx + 1]); + buffer[idx + 1] = tmp; +#else + tmp = BitUtil::FromLittleEndian(buffer[idx]); + buffer[idx] = BitUtil::FromLittleEndian(buffer[idx + 1]); + buffer[idx + 1] = tmp; +#endif + } + return Status::OK(); + } + + Status Visit(const DayTimeIntervalType& type) { + auto buffer = const_cast( + reinterpret_cast(data_->buffers[1]->data())); + int64_t length = length_; + for (int64_t i = 0; i < length; i++) { + uint32_t tmp; + auto idx = i * 2; +#if ARROW_LITTLE_ENDIAN + buffer[idx]= BitUtil::FromBigEndian(buffer[idx]); + buffer[idx + 1] = BitUtil::FromBigEndian(buffer[idx + 1]); +#else + buffer[idx] = BitUtil::FromLittleEndian(buffer[idx]); + buffer[idx + 1] = BitUtil::FromLittleEndian(buffer[idx + 1]); +#endif + } + return Status::OK(); + } + Status Visit(const NullType& type) { return Status::OK(); } Status Visit(const BooleanType& type) { return Status::OK(); } - Status Visit(const StringType& type) { return Status::OK(); } - Status Visit(const LargeStringType& type) { return Status::OK(); } - Status Visit(const BinaryType& type) { return Status::OK(); } - Status Visit(const LargeBinaryType& type) { return Status::OK(); } + Status Visit(const Int8Type& type) { return Status::OK(); } + Status Visit(const UInt8Type& type) { return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } - Status Visit(const Decimal128Type& type) { - assert(false && "not supported yet"); + Status Visit(const StringType& type) { + SwapSmallOffset(); return Status::OK(); } - Status Visit(const DayTimeIntervalType& type) { - assert(false && "not supported yet"); + Status Visit(const LargeStringType& type) { + SwapLargeOffset(); + return Status::OK(); + } + Status Visit(const BinaryType& type) { + SwapSmallOffset(); return Status::OK(); } + Status Visit(const LargeBinaryType& type) { + SwapLargeOffset(); + return Status::OK(); + } + Status Visit(const ListType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapSmallOffset()); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } Status Visit(const LargeListType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapLargeOffset()); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } Status Visit(const FixedSizeListType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } + Status Visit(const MapType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapSmallOffset()); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } + Status Visit(const StructType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } + Status Visit(const SparseUnionType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } + Status Visit(const DenseUnionType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapSmallOffset(2)); + RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } + Status Visit(const DictionaryType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapType(*type.index_type())); return Status::OK(); } Status Visit(const ExtensionType& type) { - assert(false && "not supported yet"); + RETURN_NOT_OK(SwapType(*type.storage_type())); return Status::OK(); } std::shared_ptr& data_; + int64_t length_; }; } // namespace internal @@ -162,10 +264,8 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { } void SwapEndianArrayData(std::shared_ptr& data) { - if (data->buffers[1] != NULLPTR) { - internal::ArrayDataEndianSwapper swapper_visitor(data); - DCHECK_OK(VisitTypeInline(*data->type, &swapper_visitor)); - } + internal::ArrayDataEndianSwapper swapper_visitor(data, data->length); + DCHECK_OK(VisitTypeInline(*data->type, &swapper_visitor)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6b22f644a08..4c8e8ab73af 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -680,7 +680,7 @@ Result> ReadRecordBatch( Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, const IpcReadOptions& options, DictionaryKind* kind, - io::RandomAccessFile* file) { + io::RandomAccessFile* file, bool swap_endian) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); const auto dictionary_batch = message->header_as_DictionaryBatch(); @@ -736,13 +736,14 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, } Status ReadDictionary(const Message& message, DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, DictionaryKind* kind) { + const IpcReadOptions& options, DictionaryKind* kind, + bool swap_endian) { // Only invoke this method if we already know we have a dictionary message DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH); CHECK_HAS_BODY(message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); return ReadDictionary(*message.metadata(), dictionary_memo, options, kind, - reader.get()); + reader.get(), swap_endian); } // ---------------------------------------------------------------------- @@ -833,7 +834,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { Status ReadDictionary(const Message& message) { DictionaryKind kind; RETURN_NOT_OK( - ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind)); + ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind, + swap_endian_)); switch (kind) { case DictionaryKind::New: break; @@ -1028,7 +1030,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; RETURN_NOT_OK(ReadDictionary(*message->metadata(), &dictionary_memo_, options_, - &kind, reader.get())); + &kind, reader.get(), swap_endian_)); ++stats_.num_dictionary_batches; if (kind != DictionaryKind::New) { return Status::Invalid( @@ -1212,6 +1214,11 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); + swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + if (swap_endian_) { + // set native endianness to the schemas before actually swapping endian in ArrayData + out_schema_->setNativeEndianness(); + } n_required_dictionaries_ = dictionary_memo_.fields().num_fields(); if (n_required_dictionaries_ == 0) { @@ -1247,7 +1254,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { ARROW_ASSIGN_OR_RAISE( auto batch, ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get())); + &dictionary_memo_, options_, reader.get(), + swap_endian_)); ++stats_.num_record_batches; return listener_->OnRecordBatchDecoded(std::move(batch)); } @@ -1256,8 +1264,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - RETURN_NOT_OK( - ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind)); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, + &kind, swap_endian_)); ++stats_.num_dictionary_batches; switch (kind) { case DictionaryKind::New: @@ -1281,6 +1289,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { DictionaryMemo dictionary_memo_; std::shared_ptr schema_, out_schema_; ReadStats stats_; + bool swap_endian_; }; StreamDecoder::StreamDecoder(std::shared_ptr listener, IpcReadOptions options) { From 469f6836de141aa67f38ca56cbba9b84a2b87eab Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 26 Jul 2020 05:31:06 +0000 Subject: [PATCH 13/63] fix build errors --- cpp/src/arrow/array/util.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 170137419b6..e6a2a2ac32c 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -97,7 +97,9 @@ class ArrayDataEndianSwapper { template Status SwapOffset(const T&, int index) { - if (data_->buffers[index] == nullptr) { return Status::OK(); } + if (data_->buffers[index] == nullptr) { + return Status::OK(); + } using value_type = typename T::c_type; auto buffer = const_cast( reinterpret_cast(data_->buffers[index]->data())); @@ -166,10 +168,9 @@ class ArrayDataEndianSwapper { reinterpret_cast(data_->buffers[1]->data())); int64_t length = length_; for (int64_t i = 0; i < length; i++) { - uint32_t tmp; auto idx = i * 2; #if ARROW_LITTLE_ENDIAN - buffer[idx]= BitUtil::FromBigEndian(buffer[idx]); + buffer[idx] = BitUtil::FromBigEndian(buffer[idx]); buffer[idx + 1] = BitUtil::FromBigEndian(buffer[idx + 1]); #else buffer[idx] = BitUtil::FromLittleEndian(buffer[idx]); @@ -186,19 +187,19 @@ class ArrayDataEndianSwapper { Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } Status Visit(const StringType& type) { - SwapSmallOffset(); + RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); } Status Visit(const LargeStringType& type) { - SwapLargeOffset(); + RETURN_NOT_OK(SwapLargeOffset()); return Status::OK(); } Status Visit(const BinaryType& type) { - SwapSmallOffset(); + RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); } Status Visit(const LargeBinaryType& type) { - SwapLargeOffset(); + RETURN_NOT_OK(SwapLargeOffset()); return Status::OK(); } From 094626dd72754681bc68c72df3e4e543318e5a1a Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 29 Jul 2020 02:14:12 +0000 Subject: [PATCH 14/63] add be/le tests into integration --- ci/scripts/integration_arrow.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index abd53759d83..6cbbedfcda3 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -30,4 +30,6 @@ pip install -e $arrow_dir/dev/archery archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/0.14.1 \ --gold-dirs=$gold_dir/0.17.1 \ + --gold-dirs=$gold_dir/1.0.0_bigendian \ + --gold-dirs=$gold_dir/1.0.0_littleendian \ --gold-dirs=$gold_dir/2.0.0-compression \ From 98d5d25272dee7100f3adb2cc39aaf02e9af4f28 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 1 Aug 2020 02:51:43 +0000 Subject: [PATCH 15/63] do integration test for endianness with only cpp --- ci/scripts/integration_arrow.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 6cbbedfcda3..54e170deba4 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -30,6 +30,9 @@ pip install -e $arrow_dir/dev/archery archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/0.14.1 \ --gold-dirs=$gold_dir/0.17.1 \ + --gold-dirs=$gold_dir/2.0.0-compression \ + +# TODO: support other languages +archery integration --with-cpp=1 --run-flight \ --gold-dirs=$gold_dir/1.0.0_bigendian \ --gold-dirs=$gold_dir/1.0.0_littleendian \ - --gold-dirs=$gold_dir/2.0.0-compression \ From dc2c5789a7bedeb73b615da7e40b18456caea290 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 2 Aug 2020 06:10:36 +0000 Subject: [PATCH 16/63] add a script to generate test files for endian test --- dev/archery/tests/generate_files_for_endian_test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 dev/archery/tests/generate_files_for_endian_test.sh diff --git a/dev/archery/tests/generate_files_for_endian_test.sh b/dev/archery/tests/generate_files_for_endian_test.sh new file mode 100755 index 00000000000..35b183aaf0e --- /dev/null +++ b/dev/archery/tests/generate_files_for_endian_test.sh @@ -0,0 +1,9 @@ +#!/bin/bash +arrow_dir= + +json_file=`archery integration --with-cpp=1 | grep "Testing file" | tail -1 | awk '{ print $3 }'` +json_dir=`dirname $json_file` +for f in $json_dir/*.json ; do $arrow_dir/cpp/build/debug/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; done +for f in $json_dir/*.arrow_file ; do $arrow_dir/cpp/build/debug/arrow-file-to-stream $f > ${f%.*}.stream; done +for f in $json_dir/*.json ; do gzip $f ; done +echo "The files are under $json_dir" From 492a261abac2c55e23b7c2b4b47a4d9eb548aaf9 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 2 Aug 2020 06:31:08 +0000 Subject: [PATCH 17/63] fix lint error --- .../tests/generate_files_for_endian_test.sh | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/dev/archery/tests/generate_files_for_endian_test.sh b/dev/archery/tests/generate_files_for_endian_test.sh index 35b183aaf0e..4f8a3abb28a 100755 --- a/dev/archery/tests/generate_files_for_endian_test.sh +++ b/dev/archery/tests/generate_files_for_endian_test.sh @@ -1,9 +1,26 @@ #!/bin/bash -arrow_dir= +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +: ${ARROW_DIR:=/arrow} json_file=`archery integration --with-cpp=1 | grep "Testing file" | tail -1 | awk '{ print $3 }'` json_dir=`dirname $json_file` -for f in $json_dir/*.json ; do $arrow_dir/cpp/build/debug/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; done -for f in $json_dir/*.arrow_file ; do $arrow_dir/cpp/build/debug/arrow-file-to-stream $f > ${f%.*}.stream; done +for f in $json_dir/*.json ; do $ARROW_DIR/cpp/build/debug/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; done +for f in $json_dir/*.arrow_file ; do $ARROW_DIR/cpp/build/debug/arrow-file-to-stream $f > ${f%.*}.stream; done for f in $json_dir/*.json ; do gzip $f ; done echo "The files are under $json_dir" From e40796b2625ba65ea4a7a0209c4a1b34c7efcbe9 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 4 Aug 2020 19:11:00 +0000 Subject: [PATCH 18/63] address review comments --- cpp/src/arrow/array/util.cc | 17 ++++++----------- cpp/src/arrow/ipc/reader.cc | 18 +++++++++--------- cpp/src/arrow/type.cc | 18 ++++++++++++++++-- cpp/src/arrow/type.h | 14 +++++++------- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index e6a2a2ac32c..b9ffc324eb5 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -95,14 +95,13 @@ class ArrayDataEndianSwapper { return Status::OK(); } - template - Status SwapOffset(const T&, int index) { + template + Status SwapOffset(int index) { if (data_->buffers[index] == nullptr) { return Status::OK(); } - using value_type = typename T::c_type; - auto buffer = const_cast( - reinterpret_cast(data_->buffers[index]->data())); + auto buffer = const_cast( + reinterpret_cast(data_->buffers[index]->data())); // offset has one more element rather than data->length int64_t length = length_ + 1; for (int64_t i = 0; i < length; i++) { @@ -116,15 +115,11 @@ class ArrayDataEndianSwapper { } Status SwapSmallOffset(int index = 1) { - Int32Type i32; - RETURN_NOT_OK(SwapOffset(i32, index)); - return Status::OK(); + return SwapOffset(index); } Status SwapLargeOffset() { - Int64Type i64; - RETURN_NOT_OK(SwapOffset(i64, 1)); - return Status::OK(); + return SwapOffset(1); } template diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 4c8e8ab73af..05a8bd8a338 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -764,10 +764,10 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + swap_endian_ = !out_schema_->IsNativeEndianness(); if (swap_endian_) { - // set native endianness to the schemas before actually swapping endian in ArrayData - out_schema_->setNativeEndianness(); + // create a new schema with native endianness before swapping endian in ArrayData + out_schema_ = out_schema_->WithNativeEndianness(); } return Status::OK(); } @@ -981,10 +981,10 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + swap_endian_ = !out_schema_->IsNativeEndianness(); if (swap_endian_) { - // set native endianness to the schemas before actually swapping endian in ArrayData - out_schema_->setNativeEndianness(); + // create a new schema with native endianness before swapping endian in ArrayData + out_schema_ = out_schema_->WithNativeEndianness(); } ++stats_.num_messages; return Status::OK(); @@ -1214,10 +1214,10 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = out_schema_->endianness() != NATIVE_ENDIANNESS; + swap_endian_ = !out_schema_->IsNativeEndianness(); if (swap_endian_) { - // set native endianness to the schemas before actually swapping endian in ArrayData - out_schema_->setNativeEndianness(); + // create a new schema with native endianness before swapping endian in ArrayData + out_schema_ = out_schema_->WithNativeEndianness(); } n_required_dictionaries_ = dictionary_memo_.fields().num_fields(); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 617b7055e7e..5c6b82d51c9 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1321,17 +1321,31 @@ Schema::Schema(std::vector> fields, Endianness endianness Schema::Schema(std::vector> fields, std::shared_ptr metadata) : detail::Fingerprintable(), - impl_(new Impl(std::move(fields), NATIVE_ENDIANNESS, std::move(metadata))) {} +#if ARROW_LITTLE_ENDIAN + impl_(new Impl(std::move(fields), Endianness::LITTLE, std::move(metadata))) {} +#else + impl_(new Impl(std::move(fields), Endianness::BIG, std::move(metadata))) {} +#endif Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} Schema::~Schema() = default; -void Schema::setNativeEndianness() { impl_->endianness_ = NATIVE_ENDIANNESS; } +std::shared_ptr Schema::WithNativeEndianness() const { + return std::make_shared(impl_->fields_, impl_->metadata_); +} Endianness Schema::endianness() const { return impl_->endianness_; } +bool Schema::IsNativeEndianness() const { +#if ARROW_LITTLE_ENDIAN + return impl_->endianness_ == Endianness::LITTLE; +#else + return impl_->endianness_ == Endianness::BIG; +#endif +} + int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } const std::shared_ptr& Schema::field(int i) const { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f1a69973ac7..312a6c69ea5 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1606,11 +1606,6 @@ class ARROW_EXPORT FieldRef { // Schema enum class Endianness { LITTLE, BIG }; -#if ARROW_LITTLE_ENDIAN -#define NATIVE_ENDIANNESS Endianness::LITTLE -#else -#define NATIVE_ENDIANNESS Endianness::BIG -#endif /// \class Schema /// \brief Sequence of arrow::Field objects describing the columns of a record @@ -1633,12 +1628,17 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, bool Equals(const Schema& other, bool check_metadata = false) const; bool Equals(const std::shared_ptr& other, bool check_metadata = false) const; - /// \brief Set platform-native endianness in the schema - void setNativeEndianness(); + /// \brief Replace endianness with platform-native endianness in the schema + /// + /// \return new Schema + std::shared_ptr WithNativeEndianness() const; /// \brief Return endianness in the schema Endianness endianness() const; + /// \brief Indicate if endianness is equal to platform-native endianness + bool IsNativeEndianness() const; + /// \brief Return the number of fields (columns) in the schema int num_fields() const; From f2cfe02ffb736e7fd593524c9869feff7a2f1890 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 4 Aug 2020 19:27:36 +0000 Subject: [PATCH 19/63] fix lint errors --- cpp/src/arrow/array/util.cc | 8 ++------ cpp/src/arrow/type.cc | 6 ++++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b9ffc324eb5..da092e867a4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -114,13 +114,9 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status SwapSmallOffset(int index = 1) { - return SwapOffset(index); - } + Status SwapSmallOffset(int index = 1) { return SwapOffset(index); } - Status SwapLargeOffset() { - return SwapOffset(1); - } + Status SwapLargeOffset() { return SwapOffset(1); } template Status Visit(const T&) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5c6b82d51c9..e7c109148d8 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1322,9 +1322,11 @@ Schema::Schema(std::vector> fields, std::shared_ptr metadata) : detail::Fingerprintable(), #if ARROW_LITTLE_ENDIAN - impl_(new Impl(std::move(fields), Endianness::LITTLE, std::move(metadata))) {} + impl_(new Impl(std::move(fields), Endianness::LITTLE, std::move(metadata))) { +} #else - impl_(new Impl(std::move(fields), Endianness::BIG, std::move(metadata))) {} + impl_(new Impl(std::move(fields), Endianness::BIG, std::move(metadata))) { +} #endif Schema::Schema(const Schema& schema) From 7609cd838b7804ca3174243aa1ed9c03aedf35db Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 5 Aug 2020 16:43:55 +0000 Subject: [PATCH 20/63] define Endianness::NATIVE --- cpp/src/arrow/type.cc | 14 ++------------ cpp/src/arrow/type.h | 10 +++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index e7c109148d8..d5e5bd7a65d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1321,13 +1321,7 @@ Schema::Schema(std::vector> fields, Endianness endianness Schema::Schema(std::vector> fields, std::shared_ptr metadata) : detail::Fingerprintable(), -#if ARROW_LITTLE_ENDIAN - impl_(new Impl(std::move(fields), Endianness::LITTLE, std::move(metadata))) { -} -#else - impl_(new Impl(std::move(fields), Endianness::BIG, std::move(metadata))) { -} -#endif + impl_(new Impl(std::move(fields), Endianness::NATIVE, std::move(metadata))) {} Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} @@ -1341,11 +1335,7 @@ std::shared_ptr Schema::WithNativeEndianness() const { Endianness Schema::endianness() const { return impl_->endianness_; } bool Schema::IsNativeEndianness() const { -#if ARROW_LITTLE_ENDIAN - return impl_->endianness_ == Endianness::LITTLE; -#else - return impl_->endianness_ == Endianness::BIG; -#endif + return impl_->endianness_ == Endianness::NATIVE; } int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 312a6c69ea5..2ba28ba91b1 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1605,7 +1605,15 @@ class ARROW_EXPORT FieldRef { // ---------------------------------------------------------------------- // Schema -enum class Endianness { LITTLE, BIG }; +enum class Endianness { + LITTLE = 0, + BIG = 1, +#if ARROW_LITTLE_ENDIAN + NATIVE = LITTLE +#else + NATIVE = BIG +#endif +}; /// \class Schema /// \brief Sequence of arrow::Field objects describing the columns of a record From 5857780e70c7598a64bfe5ba71f6c3fa0712d180 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 6 Aug 2020 06:11:42 +0000 Subject: [PATCH 21/63] fix failure when execute arrow-stream-to-file add use_native_endian to IpcReadOption --- cpp/src/arrow/ipc/options.h | 5 +++++ cpp/src/arrow/ipc/reader.cc | 9 +++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index aa939e24378..8f6b29fbc18 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -137,6 +137,11 @@ struct ARROW_EXPORT IpcReadOptions { /// like decompression bool use_threads = true; + /// \brief Convert endian of data element to platform-native endianness + /// if the endianness of the received schema is not equal to + /// platform-native endianness + bool use_native_endian = true; + static IpcReadOptions Defaults(); }; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 05a8bd8a338..6492935aa29 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -636,7 +636,8 @@ Status GetInclusionMaskAndOutSchema(const std::shared_ptr& full_schema, included_fields.push_back(full_schema->field(i)); } - *out_schema = schema(std::move(included_fields), full_schema->metadata()); + *out_schema = schema(std::move(included_fields), full_schema->endianness(), + full_schema->metadata()); return Status::OK(); } @@ -764,7 +765,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = !out_schema_->IsNativeEndianness(); + swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData out_schema_ = out_schema_->WithNativeEndianness(); @@ -981,7 +982,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = !out_schema_->IsNativeEndianness(); + swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData out_schema_ = out_schema_->WithNativeEndianness(); @@ -1214,7 +1215,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = !out_schema_->IsNativeEndianness(); + swap_endian_ = options_.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData out_schema_ = out_schema_->WithNativeEndianness(); From d1989682939dfb05316d3a959c627b16be51bbb6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 6 Aug 2020 09:46:35 +0000 Subject: [PATCH 22/63] fix failure when execute arrow-stream-to-file --- cpp/src/arrow/ipc/reader.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6492935aa29..eeca0a4e312 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -768,6 +768,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData + schema_ = schema_->WithNativeEndianness(); out_schema_ = out_schema_->WithNativeEndianness(); } return Status::OK(); @@ -985,6 +986,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData + schema_ = schema_->WithNativeEndianness(); out_schema_ = out_schema_->WithNativeEndianness(); } ++stats_.num_messages; @@ -1218,6 +1220,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { swap_endian_ = options_.use_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData + schema_ = schema_->WithNativeEndianness(); out_schema_ = out_schema_->WithNativeEndianness(); } From aa26fdcbaf606b3f4d01ba8a7f2ddbce6260d2f1 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 10 Aug 2020 16:56:52 +0000 Subject: [PATCH 23/63] address review comments --- cpp/src/arrow/array/util.cc | 41 +++++++------------ cpp/src/arrow/util/bit_util.h | 25 +++-------- .../tests/generate_files_for_endian_test.sh | 6 ++- 3 files changed, 24 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index da092e867a4..d0ebf1d3e5a 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -79,18 +79,21 @@ class ArrayDataEndianSwapper { ArrayDataEndianSwapper(std::shared_ptr& data, int64_t length) : data_(data), length_(length) {} - Status SwapType(const DataType& type) { return VisitTypeInline(type, this); } + Status SwapType(const DataType& type) { + RETURN_NOT_OK(VisitTypeInline(type, this)); + RETURN_NOT_OK(SwapChildren(type.fields())); + return Status::OK(); + } Status SwapChildren(std::vector> child_fields) { int i = 0; for (const auto& child_field : child_fields) { - auto orig_data = data_; - auto orig_length = length_; - data_ = data_->child_data[i++]; - length_ = data_->length; - RETURN_NOT_OK(SwapType(*child_field.get()->type())); - length_ = orig_length; - data_ = orig_data; + ArrayDataEndianSwapper + swapper_child_visitor(data_->child_data[i], data_->child_data[i]->length); + RETURN_NOT_OK(VisitTypeInline(*child_field.get()->type(), &swapper_child_visitor)); + RETURN_NOT_OK( + swapper_child_visitor.SwapChildren((*child_field.get()->type()).fields())); + i++; } return Status::OK(); } @@ -176,6 +179,9 @@ class ArrayDataEndianSwapper { Status Visit(const Int8Type& type) { return Status::OK(); } Status Visit(const UInt8Type& type) { return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } + Status Visit(const FixedSizeListType& type) { return Status::OK(); } + Status Visit(const StructType& type) { return Status::OK(); } + Status Visit(const SparseUnionType& type) { return Status::OK(); } Status Visit(const StringType& type) { RETURN_NOT_OK(SwapSmallOffset()); @@ -196,38 +202,20 @@ class ArrayDataEndianSwapper { Status Visit(const ListType& type) { RETURN_NOT_OK(SwapSmallOffset()); - RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } Status Visit(const LargeListType& type) { RETURN_NOT_OK(SwapLargeOffset()); - RETURN_NOT_OK(SwapChildren(type.fields())); - return Status::OK(); - } - Status Visit(const FixedSizeListType& type) { - RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } Status Visit(const MapType& type) { RETURN_NOT_OK(SwapSmallOffset()); - RETURN_NOT_OK(SwapChildren(type.fields())); - return Status::OK(); - } - - Status Visit(const StructType& type) { - RETURN_NOT_OK(SwapChildren(type.fields())); - return Status::OK(); - } - - Status Visit(const SparseUnionType& type) { - RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } Status Visit(const DenseUnionType& type) { RETURN_NOT_OK(SwapSmallOffset(2)); - RETURN_NOT_OK(SwapChildren(type.fields())); return Status::OK(); } @@ -258,6 +246,7 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { void SwapEndianArrayData(std::shared_ptr& data) { internal::ArrayDataEndianSwapper swapper_visitor(data, data->length); DCHECK_OK(VisitTypeInline(*data->type, &swapper_visitor)); + DCHECK_OK(swapper_visitor.SwapChildren((*data->type).fields())); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 775e08272b2..093f36c698e 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -62,6 +62,7 @@ #include "arrow/util/macros.h" #include "arrow/util/type_traits.h" +#include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" namespace arrow { @@ -324,28 +325,12 @@ static inline uint16_t ByteSwap(uint16_t value) { static inline uint8_t ByteSwap(uint8_t value) { return value; } static inline int8_t ByteSwap(int8_t value) { return value; } static inline double ByteSwap(double value) { -#if defined(__GNUC__) - // avoid dereferencing type-punned pointer will break strict-aliasing rules - int8_t* in_ptr = reinterpret_cast(&value); - auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(in_ptr)); - int8_t* out_ptr = reinterpret_cast(&swapped); - return *reinterpret_cast(out_ptr); -#else - auto swapped = ARROW_BYTE_SWAP64(*reinterpret_cast(&value)); - return *reinterpret_cast(&swapped); -#endif + const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy(value)); + return util::SafeCopy(swapped); } static inline float ByteSwap(float value) { -#if defined(__GNUC__) - // avoid dereferencing type-punned pointer will break strict-aliasing rules - int8_t* in_ptr = reinterpret_cast(&value); - auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(in_ptr)); - int8_t* out_ptr = reinterpret_cast(&swapped); - return *reinterpret_cast(out_ptr); -#else - auto swapped = ARROW_BYTE_SWAP32(*reinterpret_cast(&value)); - return *reinterpret_cast(&swapped); -#endif + const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy(value)); + return util::SafeCopy(swapped); } // Write the swapped bytes into dst. Src and dst cannot overlap. diff --git a/dev/archery/tests/generate_files_for_endian_test.sh b/dev/archery/tests/generate_files_for_endian_test.sh index 4f8a3abb28a..78ca8c659af 100755 --- a/dev/archery/tests/generate_files_for_endian_test.sh +++ b/dev/archery/tests/generate_files_for_endian_test.sh @@ -17,9 +17,11 @@ # under the License. : ${ARROW_DIR:=/arrow} +: ${TMP_DIR:=/tmp/arrow} -json_file=`archery integration --with-cpp=1 | grep "Testing file" | tail -1 | awk '{ print $3 }'` -json_dir=`dirname $json_file` +json_dir=$TMP_DIR/arrow.$$ +mkdir -p $json_dir +archery integration --with-cpp=1 --tempdir=$json_dir for f in $json_dir/*.json ; do $ARROW_DIR/cpp/build/debug/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; done for f in $json_dir/*.arrow_file ; do $ARROW_DIR/cpp/build/debug/arrow-file-to-stream $f > ${f%.*}.stream; done for f in $json_dir/*.json ; do gzip $f ; done From 0995889614ee13fb19a4fe442fbe33f58ac7160d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 10 Aug 2020 17:31:38 +0000 Subject: [PATCH 24/63] add endian.h --- cpp/src/arrow/array/util.cc | 1 + cpp/src/arrow/ipc/message.cc | 1 + cpp/src/arrow/ipc/reader.cc | 1 + cpp/src/arrow/ipc/writer.cc | 1 + cpp/src/arrow/type.h | 2 +- cpp/src/arrow/util/bit_block_counter.cc | 3 + cpp/src/arrow/util/bit_util.h | 149 -------------------- cpp/src/arrow/util/bitmap.h | 1 + cpp/src/arrow/util/bitmap_writer.h | 1 + cpp/src/arrow/util/bpacking.h | 1 + cpp/src/arrow/util/decimal.cc | 2 +- cpp/src/arrow/util/endian.h | 179 ++++++++++++++++++++++++ cpp/src/arrow/util/hashing.h | 1 + 13 files changed, 192 insertions(+), 151 deletions(-) create mode 100644 cpp/src/arrow/util/endian.h diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index d0ebf1d3e5a..2cc667b827b 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -41,6 +41,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 6569e71b454..906cb00ef07 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -32,6 +32,7 @@ #include "arrow/ipc/options.h" #include "arrow/ipc/util.h" #include "arrow/status.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index eeca0a4e312..7d52fcef5f8 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -46,6 +46,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index ac866daa8d2..c14ff5ec9bc 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -49,6 +49,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2ba28ba91b1..2bc37b4b116 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -29,8 +29,8 @@ #include "arrow/result.h" #include "arrow/type_fwd.h" // IWYU pragma: export -#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/variant.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/util/bit_block_counter.cc b/cpp/src/arrow/util/bit_block_counter.cc index c67cedc4a06..57b2a408db4 100644 --- a/cpp/src/arrow/util/bit_block_counter.cc +++ b/cpp/src/arrow/util/bit_block_counter.cc @@ -22,7 +22,10 @@ #include #include "arrow/buffer.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/endian.h" +#include "arrow/util/ubsan.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 093f36c698e..01845791faa 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -17,42 +17,14 @@ #pragma once -#ifdef _WIN32 -#define ARROW_LITTLE_ENDIAN 1 -#else -#if defined(__APPLE__) || defined(__FreeBSD__) -#include // IWYU pragma: keep -#else -#include // IWYU pragma: keep -#endif -# -#ifndef __BYTE_ORDER__ -#error "__BYTE_ORDER__ not defined" -#endif -# -#ifndef __ORDER_LITTLE_ENDIAN__ -#error "__ORDER_LITTLE_ENDIAN__ not defined" -#endif -# -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define ARROW_LITTLE_ENDIAN 1 -#else -#define ARROW_LITTLE_ENDIAN 0 -#endif -#endif - #if defined(_MSC_VER) #include // IWYU pragma: keep #include #pragma intrinsic(_BitScanReverse) #pragma intrinsic(_BitScanForward) -#define ARROW_BYTE_SWAP64 _byteswap_uint64 -#define ARROW_BYTE_SWAP32 _byteswap_ulong #define ARROW_POPCOUNT64 __popcnt64 #define ARROW_POPCOUNT32 __popcnt #else -#define ARROW_BYTE_SWAP64 __builtin_bswap64 -#define ARROW_BYTE_SWAP32 __builtin_bswap32 #define ARROW_POPCOUNT64 __builtin_popcountll #define ARROW_POPCOUNT32 __builtin_popcount #endif @@ -61,8 +33,6 @@ #include #include "arrow/util/macros.h" -#include "arrow/util/type_traits.h" -#include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" namespace arrow { @@ -302,125 +272,6 @@ static inline int Log2(uint64_t x) { return NumRequiredBits(x - 1); } -// -// Byte-swap 16-bit, 32-bit and 64-bit values -// - -// Swap the byte order (i.e. endianness) -static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } -static inline uint64_t ByteSwap(uint64_t value) { - return static_cast(ARROW_BYTE_SWAP64(value)); -} -static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); } -static inline uint32_t ByteSwap(uint32_t value) { - return static_cast(ARROW_BYTE_SWAP32(value)); -} -static inline int16_t ByteSwap(int16_t value) { - constexpr auto m = static_cast(0xff); - return static_cast(((value >> 8) & m) | ((value & m) << 8)); -} -static inline uint16_t ByteSwap(uint16_t value) { - return static_cast(ByteSwap(static_cast(value))); -} -static inline uint8_t ByteSwap(uint8_t value) { return value; } -static inline int8_t ByteSwap(int8_t value) { return value; } -static inline double ByteSwap(double value) { - const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy(value)); - return util::SafeCopy(swapped); -} -static inline float ByteSwap(float value) { - const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy(value)); - return util::SafeCopy(swapped); -} - -// Write the swapped bytes into dst. Src and dst cannot overlap. -static inline void ByteSwap(void* dst, const void* src, int len) { - switch (len) { - case 1: - *reinterpret_cast(dst) = *reinterpret_cast(src); - return; - case 2: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 4: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - case 8: - *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); - return; - default: - break; - } - - auto d = reinterpret_cast(dst); - auto s = reinterpret_cast(src); - for (int i = 0; i < len; ++i) { - d[i] = s[len - i - 1]; - } -} - -// Convert to little/big endian format from the machine's native endian format. -#if ARROW_LITTLE_ENDIAN -template > -static inline T ToBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T ToLittleEndian(T value) { - return value; -} -#else -template > -static inline T ToBigEndian(T value) { - return value; -} - -template > -static inline T ToLittleEndian(T value) { - return ByteSwap(value); -} -#endif - -// Convert from big/little endian format to the machine's native endian format. -#if ARROW_LITTLE_ENDIAN -template > -static inline T FromBigEndian(T value) { - return ByteSwap(value); -} - -template > -static inline T FromLittleEndian(T value) { - return value; -} -#else -template > -static inline T FromBigEndian(T value) { - return value; -} - -template > -static inline T FromLittleEndian(T value) { - return ByteSwap(value); -} -#endif - // // Utilities for reading and writing individual bits by their index // in a memory area. diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h index c26d75f0b53..8562c55e3d5 100644 --- a/cpp/src/arrow/util/bitmap.h +++ b/cpp/src/arrow/util/bitmap.h @@ -30,6 +30,7 @@ #include "arrow/buffer.h" #include "arrow/util/bit_util.h" #include "arrow/util/compare.h" +#include "arrow/util/endian.h" #include "arrow/util/functional.h" #include "arrow/util/string_builder.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h index 7cb2fc6a98f..d4f02f37a41 100644 --- a/cpp/src/arrow/util/bitmap_writer.h +++ b/cpp/src/arrow/util/bitmap_writer.h @@ -21,6 +21,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" namespace arrow { diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h index 71714c4c7d8..e5a4dbbed89 100644 --- a/cpp/src/arrow/util/bpacking.h +++ b/cpp/src/arrow/util/bpacking.h @@ -17,6 +17,7 @@ #pragma once +#include "arrow/util/endian.h" #include "arrow/util/visibility.h" #include diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index dcb2023616a..c683e198cd6 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -29,8 +29,8 @@ #include #include "arrow/status.h" -#include "arrow/util/bit_util.h" #include "arrow/util/decimal.h" +#include "arrow/util/endian.h" #include "arrow/util/formatting.h" #include "arrow/util/int128_internal.h" #include "arrow/util/int_util_internal.h" diff --git a/cpp/src/arrow/util/endian.h b/cpp/src/arrow/util/endian.h new file mode 100644 index 00000000000..81577e9091f --- /dev/null +++ b/cpp/src/arrow/util/endian.h @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#ifdef _WIN32 +#define ARROW_LITTLE_ENDIAN 1 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) +#include // IWYU pragma: keep +#else +#include // IWYU pragma: keep +#endif +# +#ifndef __BYTE_ORDER__ +#error "__BYTE_ORDER__ not defined" +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +#error "__ORDER_LITTLE_ENDIAN__ not defined" +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define ARROW_LITTLE_ENDIAN 1 +#else +#define ARROW_LITTLE_ENDIAN 0 +#endif +#endif + +#if defined(_MSC_VER) +#include // IWYU pragma: keep +#define ARROW_BYTE_SWAP64 _byteswap_uint64 +#define ARROW_BYTE_SWAP32 _byteswap_ulong +#else +#define ARROW_BYTE_SWAP64 __builtin_bswap64 +#define ARROW_BYTE_SWAP32 __builtin_bswap32 +#endif + +#include "arrow/util/type_traits.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace BitUtil { + +// +// Byte-swap 16-bit, 32-bit and 64-bit values +// + +// Swap the byte order (i.e. endianness) +static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); } +static inline uint64_t ByteSwap(uint64_t value) { + return static_cast(ARROW_BYTE_SWAP64(value)); +} +static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); } +static inline uint32_t ByteSwap(uint32_t value) { + return static_cast(ARROW_BYTE_SWAP32(value)); +} +static inline int16_t ByteSwap(int16_t value) { + constexpr auto m = static_cast(0xff); + return static_cast(((value >> 8) & m) | ((value & m) << 8)); +} +static inline uint16_t ByteSwap(uint16_t value) { + return static_cast(ByteSwap(static_cast(value))); +} +static inline uint8_t ByteSwap(uint8_t value) { return value; } +static inline int8_t ByteSwap(int8_t value) { return value; } +static inline double ByteSwap(double value) { + const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy(value)); + return util::SafeCopy(swapped); +} +static inline float ByteSwap(float value) { + const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy(value)); + return util::SafeCopy(swapped); +} + +// Write the swapped bytes into dst. Src and dst cannot overlap. +static inline void ByteSwap(void* dst, const void* src, int len) { + switch (len) { + case 1: + *reinterpret_cast(dst) = *reinterpret_cast(src); + return; + case 2: + *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); + return; + case 4: + *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); + return; + case 8: + *reinterpret_cast(dst) = ByteSwap(*reinterpret_cast(src)); + return; + default: + break; + } + + auto d = reinterpret_cast(dst); + auto s = reinterpret_cast(src); + for (int i = 0; i < len; ++i) { + d[i] = s[len - i - 1]; + } +} + +// Convert to little/big endian format from the machine's native endian format. +#if ARROW_LITTLE_ENDIAN +template > +static inline T ToBigEndian(T value) { + return ByteSwap(value); +} + +template > +static inline T ToLittleEndian(T value) { + return value; +} +#else +template > +static inline T ToBigEndian(T value) { + return value; +} + +template > +static inline T ToLittleEndian(T value) { + return ByteSwap(value); +} +#endif + +// Convert from big/little endian format to the machine's native endian format. +#if ARROW_LITTLE_ENDIAN +template > +static inline T FromBigEndian(T value) { + return ByteSwap(value); +} + +template > +static inline T FromLittleEndian(T value) { + return value; +} +#else +template > +static inline T FromBigEndian(T value) { + return value; +} + +template > +static inline T FromLittleEndian(T value) { + return ByteSwap(value); +} +#endif + +} // namespace BitUtil +} // namespace arrow diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 28c273fea99..f55ac88fb91 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -39,6 +39,7 @@ #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" From 220cee418a4327101f9c320e970bbdc5569b5e17 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 10 Aug 2020 17:50:31 +0000 Subject: [PATCH 25/63] fix compilation error when adding endian.h --- cpp/src/gandiva/selection_vector.cc | 1 + cpp/src/parquet/arrow/reader_internal.cc | 1 + cpp/src/parquet/column_writer.cc | 1 + cpp/src/parquet/encoding_test.cc | 1 + cpp/src/plasma/io.cc | 2 +- 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/selection_vector.cc b/cpp/src/gandiva/selection_vector.cc index 47e45d3359b..a30bba6864e 100644 --- a/cpp/src/gandiva/selection_vector.cc +++ b/cpp/src/gandiva/selection_vector.cc @@ -23,6 +23,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "gandiva/selection_vector_impl.h" diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index d7cbfdf1f9e..360078f254c 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -40,6 +40,7 @@ #include "arrow/util/base64.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 3ca5a80f675..48219ce2f7d 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -38,6 +38,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/rle_encoding.h" #include "arrow/visitor_inline.h" diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index e9fce9de838..02e81becd47 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -33,6 +33,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" #include "parquet/encoding.h" #include "parquet/platform.h" diff --git a/cpp/src/plasma/io.cc b/cpp/src/plasma/io.cc index f119f8f81a9..002f4e9991f 100644 --- a/cpp/src/plasma/io.cc +++ b/cpp/src/plasma/io.cc @@ -22,7 +22,7 @@ #include #include "arrow/status.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "plasma/common.h" From 64f0eeda39edae249c9d7b3355f6c47390038701 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 10 Aug 2020 17:50:42 +0000 Subject: [PATCH 26/63] fix lint error --- cpp/src/arrow/array/util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 2cc667b827b..8482c884992 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -89,8 +89,8 @@ class ArrayDataEndianSwapper { Status SwapChildren(std::vector> child_fields) { int i = 0; for (const auto& child_field : child_fields) { - ArrayDataEndianSwapper - swapper_child_visitor(data_->child_data[i], data_->child_data[i]->length); + ArrayDataEndianSwapper swapper_child_visitor(data_->child_data[i], + data_->child_data[i]->length); RETURN_NOT_OK(VisitTypeInline(*child_field.get()->type(), &swapper_child_visitor)); RETURN_NOT_OK( swapper_child_visitor.SwapChildren((*child_field.get()->type()).fields())); From 794cfee357025174a45922dc89e34bc2e181f978 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 10 Aug 2020 18:26:30 +0000 Subject: [PATCH 27/63] add endian.h to files where ARROW_..._ENDIAN macro is used --- cpp/src/arrow/array/array_view_test.cc | 2 +- cpp/src/arrow/chunked_array_test.cc | 2 +- cpp/src/arrow/util/basic_decimal.cc | 1 + cpp/src/arrow/util/bit_run_reader.h | 1 + cpp/src/arrow/util/bit_util_test.cc | 2 +- cpp/src/arrow/util/bitmap_ops.cc | 1 + cpp/src/arrow/util/decimal_test.cc | 1 + cpp/src/parquet/level_conversion.h | 1 + cpp/src/parquet/types_test.cc | 2 +- 9 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index e73bbda7abc..07dc3014e40 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -29,7 +29,7 @@ #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 3144f5786d7..c5907549fe4 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -27,7 +27,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/key_value_metadata.h" namespace arrow { diff --git a/cpp/src/arrow/util/basic_decimal.cc b/cpp/src/arrow/util/basic_decimal.cc index 78d5b15d1c0..d9d6f4f42fa 100644 --- a/cpp/src/arrow/util/basic_decimal.cc +++ b/cpp/src/arrow/util/basic_decimal.cc @@ -28,6 +28,7 @@ #include #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/int128_internal.h" #include "arrow/util/int_util_internal.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h index 39ff049428d..b24632a6e5e 100644 --- a/cpp/src/arrow/util/bit_run_reader.h +++ b/cpp/src/arrow/util/bit_run_reader.h @@ -24,6 +24,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc index cd9086d3ee8..e5a5e4c39be 100644 --- a/cpp/src/arrow/util/bit_util_test.cc +++ b/cpp/src/arrow/util/bit_util_test.cc @@ -44,7 +44,6 @@ #include "arrow/type_fwd.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_stream_utils.h" -#include "arrow/util/bit_util.h" #include "arrow/util/bitmap.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" @@ -52,6 +51,7 @@ #include "arrow/util/bitmap_visit.h" #include "arrow/util/bitmap_writer.h" #include "arrow/util/bitset_stack.h" +#include "arrow/util/endian.h" namespace arrow { diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc index 9f1c63653d6..1f9cf19bbd0 100644 --- a/cpp/src/arrow/util/bitmap_ops.cc +++ b/cpp/src/arrow/util/bitmap_ops.cc @@ -28,6 +28,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/bitmap_writer.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/util/decimal_test.cc b/cpp/src/arrow/util/decimal_test.cc index 40ae49da2ce..0bc838d0c29 100644 --- a/cpp/src/arrow/util/decimal_test.cc +++ b/cpp/src/arrow/util/decimal_test.cc @@ -32,6 +32,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/decimal.h" +#include "arrow/util/endian.h" #include "arrow/util/int128_internal.h" #include "arrow/util/macros.h" diff --git a/cpp/src/parquet/level_conversion.h b/cpp/src/parquet/level_conversion.h index d4d68457a13..e45a288e8c0 100644 --- a/cpp/src/parquet/level_conversion.h +++ b/cpp/src/parquet/level_conversion.h @@ -19,6 +19,7 @@ #include +#include "arrow/util/endian.h" #include "parquet/platform.h" #include "parquet/schema.h" diff --git a/cpp/src/parquet/types_test.cc b/cpp/src/parquet/types_test.cc index 13c3ffab729..e0ca7d63566 100644 --- a/cpp/src/parquet/types_test.cc +++ b/cpp/src/parquet/types_test.cc @@ -19,7 +19,7 @@ #include -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "parquet/types.h" namespace parquet { From d8e0c1caa755ffade5b6f6e4c9f5e0df5942fc0c Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 11 Aug 2020 01:51:58 +0000 Subject: [PATCH 28/63] add endian.h to files where ARROW_..._ENDIAN macro is used --- cpp/src/arrow/c/bridge_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index fc11f126e72..81b818b72dc 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" From e1db0962604651319acbdcb2b414ea4f79d9dd00 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 11 Aug 2020 01:53:30 +0000 Subject: [PATCH 29/63] rename use_native_endian to ensure_native_endian --- cpp/src/arrow/ipc/options.h | 2 +- cpp/src/arrow/ipc/reader.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index 8f6b29fbc18..fdfd19dec78 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -140,7 +140,7 @@ struct ARROW_EXPORT IpcReadOptions { /// \brief Convert endian of data element to platform-native endianness /// if the endianness of the received schema is not equal to /// platform-native endianness - bool use_native_endian = true; + bool ensure_native_endian = true; static IpcReadOptions Defaults(); }; diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 7d52fcef5f8..ce060cef2e4 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -766,7 +766,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); + swap_endian_ = options.ensure_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData schema_ = schema_->WithNativeEndianness(); @@ -984,7 +984,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options.use_native_endian && !out_schema_->IsNativeEndianness(); + swap_endian_ = options.ensure_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData schema_ = schema_->WithNativeEndianness(); @@ -1218,7 +1218,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options_.use_native_endian && !out_schema_->IsNativeEndianness(); + swap_endian_ = options_.ensure_native_endian && !out_schema_->IsNativeEndianness(); if (swap_endian_) { // create a new schema with native endianness before swapping endian in ArrayData schema_ = schema_->WithNativeEndianness(); From ce4bcee5bdd0f6a18f7c71a3aeca7e8d597f59fc Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 11 Aug 2020 05:30:57 +0000 Subject: [PATCH 30/63] add comment to ensure_use_native --- cpp/src/arrow/ipc/options.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index fdfd19dec78..057d17780a3 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -139,7 +139,8 @@ struct ARROW_EXPORT IpcReadOptions { /// \brief Convert endian of data element to platform-native endianness /// if the endianness of the received schema is not equal to - /// platform-native endianness + /// platform-native endianness. This is effective at RecordBatchFileReader.Open(), + /// RecordBatchStreamReader(), or StreamDecoder(). bool ensure_native_endian = true; static IpcReadOptions Defaults(); From ba2adc0fbfcf7026b02a77c1a4ceb1cbc8bdae5d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 11 Aug 2020 05:31:11 +0000 Subject: [PATCH 31/63] address review comment --- cpp/src/arrow/ipc/reader.cc | 118 +++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index ce060cef2e4..9643c0838c2 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -109,6 +109,23 @@ Status InvalidMessageType(MessageType expected, MessageType actual) { // ---------------------------------------------------------------------- // Record batch read path +/// \brief Structure to keep common arguments to be passed +struct IpcReadContext { + IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, DictionaryKind* kind, + bool swap) + : dictionary_memo(memo), options(option), kind(kind), swap_endian(swap) {} + + DictionaryMemo* dictionary_memo; + + const IpcReadOptions& options; + + DictionaryKind* kind; + + /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements + /// if this flag is true + const bool swap_endian; +}; + /// The field_index and buffer_index are incremented based on how much of the /// batch is "consumed" (through nested data reconstruction, for example) class ArrayLoader { @@ -440,10 +457,10 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, - const std::vector* inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) { - ArrayLoader loader(metadata, metadata_version, options, file); + const std::vector* inclusion_mask, const IpcReadContext& context, + MetadataVersion metadata_version, Compression::type compression, + io::RandomAccessFile* file) { + ArrayLoader loader(metadata, metadata_version, context.options, file); ArrayDataVector columns(schema->num_fields()); ArrayDataVector filtered_columns; @@ -473,7 +490,8 @@ Result> LoadRecordBatchSubset( // Dictionary resolution needs to happen on the unfiltered columns, // because fields are mapped structurally (by path in the original schema). - RETURN_NOT_OK(ResolveDictionaries(columns, *dictionary_memo, options.memory_pool)); + RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo, + context.options.memory_pool)); if (inclusion_mask) { filtered_schema = ::arrow::schema(std::move(filtered_fields), schema->metadata()); @@ -483,11 +501,11 @@ Result> LoadRecordBatchSubset( filtered_columns = std::move(columns); } if (compression != Compression::UNCOMPRESSED) { - RETURN_NOT_OK(DecompressBuffers(compression, options, &filtered_columns)); + RETURN_NOT_OK(DecompressBuffers(compression, context.options, &filtered_columns)); } // swap endian in a set of ArrayData if necessary (swap_endian == true) - if (swap_endian) { + if (context.swap_endian) { for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { SwapEndianArrayData(filtered_columns[i]); } @@ -498,15 +516,15 @@ Result> LoadRecordBatchSubset( Result> LoadRecordBatch( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, - const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, MetadataVersion metadata_version, - Compression::type compression, io::RandomAccessFile* file, bool swap_endian = false) { + const std::vector& inclusion_mask, const IpcReadContext& context, + MetadataVersion metadata_version, Compression::type compression, + io::RandomAccessFile* file) { if (inclusion_mask.size() > 0) { - return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, dictionary_memo, - options, metadata_version, compression, file, swap_endian); + return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, + metadata_version, compression, file); } else { - return LoadRecordBatchSubset(metadata, schema, nullptr, dictionary_memo, options, - metadata_version, compression, file, swap_endian); + return LoadRecordBatchSubset(metadata, schema, nullptr, context, metadata_version, + compression, file); } } @@ -584,8 +602,8 @@ Result> ReadRecordBatch( Result> ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, - const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, io::RandomAccessFile* file, bool swap_endian = false) { + const std::vector& inclusion_mask, const IpcReadContext& context, + io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); auto batch = message->header_as_RecordBatch(); @@ -602,9 +620,9 @@ Result> ReadRecordBatchInternal( // in 0.17.x RETURN_NOT_OK(GetCompressionExperimental(message, &compression)); } - return LoadRecordBatch(batch, schema, inclusion_mask, dictionary_memo, options, + return LoadRecordBatch(batch, schema, inclusion_mask, context, internal::GetMetadataVersion(message->version()), compression, - file, swap_endian); + file); } // If we are selecting only certain fields, populate an inclusion mask for fast lookups. @@ -674,15 +692,16 @@ Result> ReadRecordBatch( std::shared_ptr out_schema; // Empty means do not use std::vector inclusion_mask; - RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, options.included_fields, + DictionaryKind kind; + IpcReadContext context(const_cast(dictionary_memo), options, &kind, + false); + RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields, &inclusion_mask, &out_schema)); - return ReadRecordBatchInternal(metadata, schema, inclusion_mask, dictionary_memo, - options, file); + return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file); } -Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, DictionaryKind* kind, - io::RandomAccessFile* file, bool swap_endian) { +Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, + io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); const auto dictionary_batch = message->header_as_DictionaryBatch(); @@ -709,43 +728,40 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // Look up the dictionary value type, which must have been added to the // DictionaryMemo already prior to invoking this function - ARROW_ASSIGN_OR_RAISE(auto value_type, dictionary_memo->GetDictionaryType(id)); + ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id)); // Load the dictionary data from the dictionary batch ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()), - options, file); + context.options, file); const auto dict_data = std::make_shared(); const Field dummy_field("", value_type); RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); if (compression != Compression::UNCOMPRESSED) { ArrayDataVector dict_fields{dict_data}; - RETURN_NOT_OK(DecompressBuffers(compression, options, &dict_fields)); + RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields)); } if (dictionary_batch->isDelta()) { - if (kind != nullptr) { - *kind = DictionaryKind::Delta; + if (context.kind != nullptr) { + *context.kind = DictionaryKind::Delta; } - return dictionary_memo->AddDictionaryDelta(id, dict_data); + return context.dictionary_memo->AddDictionaryDelta(id, dict_data); } ARROW_ASSIGN_OR_RAISE(bool inserted, - dictionary_memo->AddOrReplaceDictionary(id, dict_data)); - if (kind != nullptr) { - *kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement; + context.dictionary_memo->AddOrReplaceDictionary(id, dict_data)); + if (context.kind != nullptr) { + *context.kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement; } return Status::OK(); } -Status ReadDictionary(const Message& message, DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, DictionaryKind* kind, - bool swap_endian) { +Status ReadDictionary(const Message& message, const IpcReadContext& context) { // Only invoke this method if we already know we have a dictionary message DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH); CHECK_HAS_BODY(message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadDictionary(*message.metadata(), dictionary_memo, options, kind, - reader.get(), swap_endian); + return ReadDictionary(*message.metadata(), context, reader.get()); } // ---------------------------------------------------------------------- @@ -804,9 +820,10 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + DictionaryKind kind; + IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get(), - swap_endian_) + context, reader.get()) .Value(batch); } @@ -836,9 +853,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - RETURN_NOT_OK( - ::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, &kind, - swap_endian_)); + IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context)); switch (kind) { case DictionaryKind::New: break; @@ -959,11 +975,11 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - ARROW_ASSIGN_OR_RAISE( - auto batch, - ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get(), - swap_endian_)); + DictionaryKind kind; + IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal( + *message->metadata(), schema_, + field_inclusion_mask_, context, reader.get())); ++stats_.num_record_batches; return batch; } @@ -1033,8 +1049,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; - RETURN_NOT_OK(ReadDictionary(*message->metadata(), &dictionary_memo_, options_, - &kind, reader.get(), swap_endian_)); + IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, reader.get())); ++stats_.num_dictionary_batches; if (kind != DictionaryKind::New) { return Status::Invalid( @@ -1251,6 +1267,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { } Status OnRecordBatchMessageDecoded(std::unique_ptr message) { + DictionaryKind kind; + IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); if (message->type() == MessageType::DICTIONARY_BATCH) { return ReadDictionary(*message); } else { From 37591409f49bceb9f277eb2bd46bd4675cf87e9f Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 11 Aug 2020 05:38:31 +0000 Subject: [PATCH 32/63] fix lint error --- cpp/src/arrow/c/bridge_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 81b818b72dc..317fd01f17c 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,8 +33,8 @@ #include "arrow/memory_pool.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" -#include "arrow/util/key_value_metadata.h" #include "arrow/util/endian.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" From 61ef7705b3d8f0c58d0f7ccb97ee95d67eb09c27 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 19 Aug 2020 09:40:47 +0000 Subject: [PATCH 33/63] reallocate buffer when endian is swapped --- cpp/src/arrow/array/util.cc | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 8482c884992..f4fda31b9a9 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -125,53 +125,59 @@ class ArrayDataEndianSwapper { template Status Visit(const T&) { using value_type = typename T::c_type; - auto buffer = const_cast( - reinterpret_cast(data_->buffers[1]->data())); + auto data = reinterpret_cast(data_->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; for (int64_t i = 0; i < length; i++) { #if ARROW_LITTLE_ENDIAN - buffer[i] = BitUtil::FromBigEndian(buffer[i]); + new_data[i] = BitUtil::FromBigEndian(data[i]); #else - buffer[i] = BitUtil::FromLittleEndian(buffer[i]); + new_data[i] = BitUtil::FromLittleEndian(data[i]); #endif } + data_->buffers[1] = std::move(new_buffer); return Status::OK(); } Status Visit(const Decimal128Type& type) { - auto buffer = const_cast( - reinterpret_cast(data_->buffers[1]->data())); + auto data = reinterpret_cast(data_->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; for (int64_t i = 0; i < length; i++) { uint64_t tmp; auto idx = i * 2; #if ARROW_LITTLE_ENDIAN - tmp = BitUtil::FromBigEndian(buffer[idx]); - buffer[idx] = BitUtil::FromBigEndian(buffer[idx + 1]); - buffer[idx + 1] = tmp; + tmp = BitUtil::FromBigEndian(data[idx]); + new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]); + new_data[idx + 1] = tmp; #else - tmp = BitUtil::FromLittleEndian(buffer[idx]); - buffer[idx] = BitUtil::FromLittleEndian(buffer[idx + 1]); - buffer[idx + 1] = tmp; + tmp = BitUtil::FromLittleEndian(data[idx]); + new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]); + new_data[idx + 1] = tmp; #endif } + data_->buffers[1] = std::move(new_buffer); return Status::OK(); } Status Visit(const DayTimeIntervalType& type) { - auto buffer = const_cast( - reinterpret_cast(data_->buffers[1]->data())); + auto data = reinterpret_cast(data_->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; for (int64_t i = 0; i < length; i++) { auto idx = i * 2; #if ARROW_LITTLE_ENDIAN - buffer[idx] = BitUtil::FromBigEndian(buffer[idx]); - buffer[idx + 1] = BitUtil::FromBigEndian(buffer[idx + 1]); + new_data[idx] = BitUtil::FromBigEndian(data[idx]); + new_data[idx + 1] = BitUtil::FromBigEndian(data[idx + 1]); #else - buffer[idx] = BitUtil::FromLittleEndian(buffer[idx]); - buffer[idx + 1] = BitUtil::FromLittleEndian(buffer[idx + 1]); + new_data[idx] = BitUtil::FromLittleEndian(data[idx]); + new_data[idx + 1] = BitUtil::FromLittleEndian(data[idx + 1]); #endif } + data_->buffers[1] = std::move(new_buffer); return Status::OK(); } From 550a13eea1b65275196bea781150afed1d33ad7c Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 25 Aug 2020 17:43:45 +0000 Subject: [PATCH 34/63] address review comment --- cpp/src/arrow/array/util.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index f4fda31b9a9..c342995dea8 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -104,17 +104,20 @@ class ArrayDataEndianSwapper { if (data_->buffers[index] == nullptr) { return Status::OK(); } - auto buffer = const_cast( - reinterpret_cast(data_->buffers[index]->data())); + auto data = reinterpret_cast(data_->buffers[index]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, + AllocateBuffer(data_->buffers[index]->size() + 1)); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); // offset has one more element rather than data->length int64_t length = length_ + 1; for (int64_t i = 0; i < length; i++) { #if ARROW_LITTLE_ENDIAN - buffer[i] = BitUtil::FromBigEndian(buffer[i]); + new_data[i] = BitUtil::FromBigEndian(data[i]); #else - buffer[i] = BitUtil::FromLittleEndian(buffer[i]); + new_data[i] = BitUtil::FromLittleEndian(data[i]); #endif } + data_->buffers[index] = std::move(new_buffer); return Status::OK(); } From 5b1cc4b16cca16bf94e1e02a40caa4167ad3a9b4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 26 Aug 2020 19:24:40 +0000 Subject: [PATCH 35/63] fix typo --- cpp/src/arrow/array/util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index c342995dea8..8f31ed94e51 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -166,9 +166,9 @@ class ArrayDataEndianSwapper { } Status Visit(const DayTimeIntervalType& type) { - auto data = reinterpret_cast(data_->buffers[1]->data()); + auto data = reinterpret_cast(data_->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); - auto new_data = reinterpret_cast(new_buffer->mutable_data()); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; for (int64_t i = 0; i < length; i++) { auto idx = i * 2; From a7ed3c6eada0d9defd27871859844fc7225560a0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 10 Sep 2020 02:37:39 +0000 Subject: [PATCH 36/63] swap endian in dictionary if necessary --- cpp/src/arrow/ipc/reader.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 9643c0838c2..a17fac9396a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -742,6 +742,11 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields)); } + // swap endian in dict_data if necessary (swap_endian == true) + if (context.swap_endian) { + SwapEndianArrayData(dict_data); + } + if (dictionary_batch->isDelta()) { if (context.kind != nullptr) { *context.kind = DictionaryKind::Delta; From 9dc999b7bdafd558306930b1ef912cf0063ea4ca Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 23 Sep 2020 17:00:18 +0000 Subject: [PATCH 37/63] fix build failure --- cpp/src/arrow/util/compression_lz4.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index bb0295e6858..9314dfd7faf 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -27,6 +27,7 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" From 62bc52df8ddebdb50e2cb3d0da7594952e97772d Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 6 Oct 2020 19:05:56 +0000 Subject: [PATCH 38/63] fix compilation error --- cpp/src/arrow/ipc/reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index a17fac9396a..78a66650ae0 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -733,7 +733,7 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, // Load the dictionary data from the dictionary batch ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()), context.options, file); - const auto dict_data = std::make_shared(); + auto dict_data = std::make_shared(); const Field dummy_field("", value_type); RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); From 6bce03ea86efc24279c38984211f04e7663b210e Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 24 Oct 2020 17:48:42 +0000 Subject: [PATCH 39/63] initial support Decimal256 --- cpp/src/arrow/array/util.cc | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 8f31ed94e51..1330cd5f1d7 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -165,6 +165,36 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const Decimal256Type& type) { + auto data = reinterpret_cast(data_->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); + auto new_data = reinterpret_cast(new_buffer->mutable_data()); + int64_t length = length_; + for (int64_t i = 0; i < length; i++) { + uint64_t tmp0, tmp1, tmp2; + auto idx = i * 4; +#if ARROW_LITTLE_ENDIAN + tmp0 = BitUtil::FromBigEndian(data[idx]); + tmp1 = BitUtil::FromBigEndian(data[idx + 1]); + tmp2 = BitUtil::FromBigEndian(data[idx + 2]); + new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]); + new_data[idx + 1] = tmp2; + new_data[idx + 2] = tmp1; + new_data[idx + 3] = tmp0; +#else + tmp0 = BitUtil::FromLittleEndian(data[idx]); + tmp1 = BitUtil::FromLittleEndian(data[idx + 1]); + tmp2 = BitUtil::FromLittleEndian(data[idx + 2]); + new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]); + new_data[idx + 1] = tmp2; + new_data[idx + 2] = tmp1; + new_data[idx + 3] = tmp0; +#endif + } + data_->buffers[1] = std::move(new_buffer); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { auto data = reinterpret_cast(data_->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); From c57dc2280a65584b17bc2ccd96500b7f5d9168f6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 11 Nov 2020 18:08:32 +0000 Subject: [PATCH 40/63] refactor updating schema --- cpp/src/arrow/array/util.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 1330cd5f1d7..b79cc6cdd94 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -223,6 +223,13 @@ class ArrayDataEndianSwapper { Status Visit(const StructType& type) { return Status::OK(); } Status Visit(const SparseUnionType& type) { return Status::OK(); } +#if 0 + template + enable_if_binary_like Visit(const T& type) { + RETURN_NOT_OK(SwapSmallOffset()); + return Status::OK(); + } +#else Status Visit(const StringType& type) { RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); @@ -239,7 +246,8 @@ class ArrayDataEndianSwapper { RETURN_NOT_OK(SwapLargeOffset()); return Status::OK(); } - +#endif + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); From 689e1b39e54a58b73d564ec0f5e11e142880d6a4 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 8 Dec 2020 18:33:52 +0000 Subject: [PATCH 41/63] address review comments --- cpp/src/arrow/array/util.cc | 121 +++++++++++++++--------------------- cpp/src/arrow/array/util.h | 13 ++-- cpp/src/arrow/ipc/reader.cc | 99 +++++++++++++---------------- 3 files changed, 103 insertions(+), 130 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b79cc6cdd94..5fab1fdcc8d 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -89,35 +89,38 @@ class ArrayDataEndianSwapper { Status SwapChildren(std::vector> child_fields) { int i = 0; for (const auto& child_field : child_fields) { - ArrayDataEndianSwapper swapper_child_visitor(data_->child_data[i], - data_->child_data[i]->length); - RETURN_NOT_OK(VisitTypeInline(*child_field.get()->type(), &swapper_child_visitor)); - RETURN_NOT_OK( - swapper_child_visitor.SwapChildren((*child_field.get()->type()).fields())); + ARROW_ASSIGN_OR_RAISE( + data_->child_data[i], + SwapEndianArrayData(data_->child_data[i], child_field.get()->type())); i++; } return Status::OK(); } + template + Result> ByteSwapBuffer(std::shared_ptr& in_buffer, + int64_t length) { + auto in_data = reinterpret_cast(in_buffer->data()); + ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); + auto out_data = reinterpret_cast(out_buffer->mutable_data()); + for (int64_t i = 0; i < length; i++) { +#if ARROW_LITTLE_ENDIAN + out_data[i] = BitUtil::FromBigEndian(in_data[i]); +#else + out_data[i] = BitUtil::FromLittleEndian(in_data[i]); +#endif + } + return std::move(out_buffer); + } + template Status SwapOffset(int index) { if (data_->buffers[index] == nullptr) { return Status::OK(); } - auto data = reinterpret_cast(data_->buffers[index]->data()); - ARROW_ASSIGN_OR_RAISE(auto new_buffer, - AllocateBuffer(data_->buffers[index]->size() + 1)); - auto new_data = reinterpret_cast(new_buffer->mutable_data()); // offset has one more element rather than data->length - int64_t length = length_ + 1; - for (int64_t i = 0; i < length; i++) { -#if ARROW_LITTLE_ENDIAN - new_data[i] = BitUtil::FromBigEndian(data[i]); -#else - new_data[i] = BitUtil::FromLittleEndian(data[i]); -#endif - } - data_->buffers[index] = std::move(new_buffer); + ARROW_ASSIGN_OR_RAISE(data_->buffers[index], + ByteSwapBuffer(data_->buffers[index], length_ + 1)); return Status::OK(); } @@ -126,20 +129,14 @@ class ArrayDataEndianSwapper { Status SwapLargeOffset() { return SwapOffset(1); } template - Status Visit(const T&) { + enable_if_t::value && + !std::is_base_of::value && + !std::is_base_of::value, + Status> + Visit(const T& type) { using value_type = typename T::c_type; - auto data = reinterpret_cast(data_->buffers[1]->data()); - ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); - auto new_data = reinterpret_cast(new_buffer->mutable_data()); - int64_t length = length_; - for (int64_t i = 0; i < length; i++) { -#if ARROW_LITTLE_ENDIAN - new_data[i] = BitUtil::FromBigEndian(data[i]); -#else - new_data[i] = BitUtil::FromLittleEndian(data[i]); -#endif - } - data_->buffers[1] = std::move(new_buffer); + ARROW_ASSIGN_OR_RAISE(data_->buffers[1], + ByteSwapBuffer(data_->buffers[1], length_)); return Status::OK(); } @@ -196,21 +193,8 @@ class ArrayDataEndianSwapper { } Status Visit(const DayTimeIntervalType& type) { - auto data = reinterpret_cast(data_->buffers[1]->data()); - ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); - auto new_data = reinterpret_cast(new_buffer->mutable_data()); - int64_t length = length_; - for (int64_t i = 0; i < length; i++) { - auto idx = i * 2; -#if ARROW_LITTLE_ENDIAN - new_data[idx] = BitUtil::FromBigEndian(data[idx]); - new_data[idx + 1] = BitUtil::FromBigEndian(data[idx + 1]); -#else - new_data[idx] = BitUtil::FromLittleEndian(data[idx]); - new_data[idx + 1] = BitUtil::FromLittleEndian(data[idx + 1]); -#endif - } - data_->buffers[1] = std::move(new_buffer); + ARROW_ASSIGN_OR_RAISE(data_->buffers[1], + ByteSwapBuffer(data_->buffers[1], length_ * 2)); return Status::OK(); } @@ -223,31 +207,23 @@ class ArrayDataEndianSwapper { Status Visit(const StructType& type) { return Status::OK(); } Status Visit(const SparseUnionType& type) { return Status::OK(); } -#if 0 template - enable_if_binary_like Visit(const T& type) { - RETURN_NOT_OK(SwapSmallOffset()); - return Status::OK(); - } -#else - Status Visit(const StringType& type) { - RETURN_NOT_OK(SwapSmallOffset()); - return Status::OK(); - } - Status Visit(const LargeStringType& type) { - RETURN_NOT_OK(SwapLargeOffset()); - return Status::OK(); - } - Status Visit(const BinaryType& type) { + enable_if_t::value || std::is_same::value, + Status> + Visit(const T& type) { RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); } - Status Visit(const LargeBinaryType& type) { + + template + enable_if_t::value || + std::is_same::value, + Status> + Visit(const T& type) { RETURN_NOT_OK(SwapLargeOffset()); return Status::OK(); } -#endif - + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapSmallOffset()); return Status::OK(); @@ -281,6 +257,17 @@ class ArrayDataEndianSwapper { int64_t length_; }; +Result> SwapEndianArrayData( + std::shared_ptr& data, const std::shared_ptr& type) { + std::shared_ptr out; + internal::ArrayDataEndianSwapper swapper_visitor(data, data->length); + DCHECK_OK(VisitTypeInline(*type, &swapper_visitor)); + DCHECK_OK(swapper_visitor.SwapChildren((*type).fields())); + // TODO(kiszk): Change this soon since tentatively return swapped input ArrayData + DCHECK(data); + return data; +} + } // namespace internal std::shared_ptr MakeArray(const std::shared_ptr& data) { @@ -291,12 +278,6 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { return out; } -void SwapEndianArrayData(std::shared_ptr& data) { - internal::ArrayDataEndianSwapper swapper_visitor(data, data->length); - DCHECK_OK(VisitTypeInline(*data->type, &swapper_visitor)); - DCHECK_OK(swapper_visitor.SwapChildren((*data->type).fields())); -} - // ---------------------------------------------------------------------- // Misc APIs diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 77ef124e226..7a4e811ac2f 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -37,11 +37,6 @@ namespace arrow { ARROW_EXPORT std::shared_ptr MakeArray(const std::shared_ptr& data); -/// \brief Swap endian of each element in a generic ArrayData -/// \param[in] data the array contents to be swapped -ARROW_EXPORT -void SwapEndianArrayData(std::shared_ptr& data); - /// \brief Create a strongly-typed Array instance with all elements null /// \param[in] type the array type /// \param[in] length the array length @@ -61,6 +56,14 @@ Result> MakeArrayFromScalar( namespace internal { +/// \brief Swap endian of each element in a generic ArrayData +/// \param[in] data the array contents +/// \param[in] type the array type +/// \return the resulting Array instance whose elements were swapped +ARROW_EXPORT +Result> SwapEndianArrayData( + std::shared_ptr& data, const std::shared_ptr& type); + /// Given a number of ArrayVectors, treat each ArrayVector as the /// chunks of a chunked array. Then rechunk each ArrayVector such that /// all ArrayVectors are chunked identically. It is mandatory that diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 78a66650ae0..3207bd52d75 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -111,16 +111,13 @@ Status InvalidMessageType(MessageType expected, MessageType actual) { /// \brief Structure to keep common arguments to be passed struct IpcReadContext { - IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, DictionaryKind* kind, - bool swap) - : dictionary_memo(memo), options(option), kind(kind), swap_endian(swap) {} + IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap) + : dictionary_memo(memo), options(option), swap_endian(swap) {} DictionaryMemo* dictionary_memo; const IpcReadOptions& options; - DictionaryKind* kind; - /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements /// if this flag is true const bool swap_endian; @@ -507,7 +504,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - SwapEndianArrayData(filtered_columns[i]); + ARROW_ASSIGN_OR_RAISE(filtered_columns[i], + arrow::internal::SwapEndianArrayData( + filtered_columns[i], filtered_columns[i]->type)); } } return RecordBatch::Make(filtered_schema, metadata->length(), @@ -523,8 +522,8 @@ Result> LoadRecordBatch( return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, metadata_version, compression, file); } else { - return LoadRecordBatchSubset(metadata, schema, nullptr, context, metadata_version, - compression, file); + return LoadRecordBatchSubset(metadata, schema, /*param_name=*/nullptr, context, + metadata_version, compression, file); } } @@ -664,25 +663,32 @@ Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& opti DictionaryMemo* dictionary_memo, std::shared_ptr* schema, std::shared_ptr* out_schema, - std::vector* field_inclusion_mask) { + std::vector* field_inclusion_mask, bool* swap_endian) { RETURN_NOT_OK(internal::GetSchema(opaque_schema, dictionary_memo, schema)); // If we are selecting only certain fields, populate the inclusion mask now // for fast lookups - return GetInclusionMaskAndOutSchema(*schema, options.included_fields, - field_inclusion_mask, out_schema); + RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields, + field_inclusion_mask, out_schema)); + *swap_endian = options.ensure_native_endian && !out_schema->get()->IsNativeEndianness(); + if (*swap_endian) { + // create a new schema with native endianness before swapping endian in ArrayData + *schema = schema->get()->WithNativeEndianness(); + *out_schema = out_schema->get()->WithNativeEndianness(); + } + return Status::OK(); } Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options, DictionaryMemo* dictionary_memo, std::shared_ptr* schema, std::shared_ptr* out_schema, - std::vector* field_inclusion_mask) { + std::vector* field_inclusion_mask, bool* swap_endian) { CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type()); CHECK_HAS_NO_BODY(message); return UnpackSchemaMessage(message.header(), options, dictionary_memo, schema, - out_schema, field_inclusion_mask); + out_schema, field_inclusion_mask, swap_endian); } Result> ReadRecordBatch( @@ -692,16 +698,14 @@ Result> ReadRecordBatch( std::shared_ptr out_schema; // Empty means do not use std::vector inclusion_mask; - DictionaryKind kind; - IpcReadContext context(const_cast(dictionary_memo), options, &kind, - false); + IpcReadContext context(const_cast(dictionary_memo), options, false); RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields, &inclusion_mask, &out_schema)); return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file); } Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, - io::RandomAccessFile* file) { + DictionaryKind* kind, io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); const auto dictionary_batch = message->header_as_DictionaryBatch(); @@ -744,29 +748,31 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, // swap endian in dict_data if necessary (swap_endian == true) if (context.swap_endian) { - SwapEndianArrayData(dict_data); + ARROW_ASSIGN_OR_RAISE( + dict_data, ::arrow::internal::SwapEndianArrayData(dict_data, dict_data->type)); } if (dictionary_batch->isDelta()) { - if (context.kind != nullptr) { - *context.kind = DictionaryKind::Delta; + if (kind != nullptr) { + *kind = DictionaryKind::Delta; } return context.dictionary_memo->AddDictionaryDelta(id, dict_data); } ARROW_ASSIGN_OR_RAISE(bool inserted, context.dictionary_memo->AddOrReplaceDictionary(id, dict_data)); - if (context.kind != nullptr) { - *context.kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement; + if (kind != nullptr) { + *kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement; } return Status::OK(); } -Status ReadDictionary(const Message& message, const IpcReadContext& context) { +Status ReadDictionary(const Message& message, const IpcReadContext& context, + DictionaryKind* kind) { // Only invoke this method if we already know we have a dictionary message DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH); CHECK_HAS_BODY(message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadDictionary(*message.metadata(), context, reader.get()); + return ReadDictionary(*message.metadata(), context, kind, reader.get()); } // ---------------------------------------------------------------------- @@ -786,13 +792,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { } RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options.ensure_native_endian && !out_schema_->IsNativeEndianness(); - if (swap_endian_) { - // create a new schema with native endianness before swapping endian in ArrayData - schema_ = schema_->WithNativeEndianness(); - out_schema_ = out_schema_->WithNativeEndianness(); - } + &out_schema_, &field_inclusion_mask_, + &swap_endian_)); return Status::OK(); } @@ -825,8 +826,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, context, reader.get()) .Value(batch); @@ -858,8 +858,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); - RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context)); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind)); switch (kind) { case DictionaryKind::New: break; @@ -980,8 +980,7 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal( *message->metadata(), schema_, field_inclusion_mask_, context, reader.get())); @@ -1004,13 +1003,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { // Get the schema and record any observed dictionaries RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_, - &schema_, &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options.ensure_native_endian && !out_schema_->IsNativeEndianness(); - if (swap_endian_) { - // create a new schema with native endianness before swapping endian in ArrayData - schema_ = schema_->WithNativeEndianness(); - out_schema_ = out_schema_->WithNativeEndianness(); - } + &schema_, &out_schema_, &field_inclusion_mask_, + &swap_endian_)); ++stats_.num_messages; return Status::OK(); } @@ -1054,8 +1048,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); - RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, reader.get())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get())); ++stats_.num_dictionary_batches; if (kind != DictionaryKind::New) { return Status::Invalid( @@ -1238,13 +1232,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { private: Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_, - &out_schema_, &field_inclusion_mask_)); - swap_endian_ = options_.ensure_native_endian && !out_schema_->IsNativeEndianness(); - if (swap_endian_) { - // create a new schema with native endianness before swapping endian in ArrayData - schema_ = schema_->WithNativeEndianness(); - out_schema_ = out_schema_->WithNativeEndianness(); - } + &out_schema_, &field_inclusion_mask_, + &swap_endian_)); n_required_dictionaries_ = dictionary_memo_.fields().num_fields(); if (n_required_dictionaries_ == 0) { @@ -1273,7 +1262,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnRecordBatchMessageDecoded(std::unique_ptr message) { DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, &kind, swap_endian_); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); if (message->type() == MessageType::DICTIONARY_BATCH) { return ReadDictionary(*message); } else { @@ -1293,7 +1282,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status ReadDictionary(const Message& message) { DictionaryKind kind; RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, - &kind, swap_endian_)); + &kind)); ++stats_.num_dictionary_batches; switch (kind) { case DictionaryKind::New: From 2817d004783fd7c3712d844bde1cd9103f3fc060 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 8 Dec 2020 18:39:43 +0000 Subject: [PATCH 42/63] add a header file --- cpp/src/arrow/util/bitmap_reader.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h index e1412ac8d70..cf4f5e7db8b 100644 --- a/cpp/src/arrow/util/bitmap_reader.h +++ b/cpp/src/arrow/util/bitmap_reader.h @@ -22,6 +22,7 @@ #include "arrow/buffer.h" #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" namespace arrow { From b71868af7a49004a0f05554471206815cc2e6daf Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 8 Dec 2020 18:48:17 +0000 Subject: [PATCH 43/63] fix compilation error --- cpp/src/arrow/ipc/reader.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 3207bd52d75..cb465907b6c 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1261,7 +1261,6 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { } Status OnRecordBatchMessageDecoded(std::unique_ptr message) { - DictionaryKind kind; IpcReadContext context(&dictionary_memo_, options_, swap_endian_); if (message->type() == MessageType::DICTIONARY_BATCH) { return ReadDictionary(*message); From ad7dc09fde3aac272b0d0e102733bed5d6480925 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 9 Dec 2020 05:11:33 +0000 Subject: [PATCH 44/63] fix test failure in fuzzing --- cpp/src/arrow/array/util.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 5fab1fdcc8d..dd463e733f4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -99,11 +99,12 @@ class ArrayDataEndianSwapper { template Result> ByteSwapBuffer(std::shared_ptr& in_buffer, - int64_t length) { + int64_t buffer_size, + int64_t swap_length) { auto in_data = reinterpret_cast(in_buffer->data()); - ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); + ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(buffer_size)); auto out_data = reinterpret_cast(out_buffer->mutable_data()); - for (int64_t i = 0; i < length; i++) { + for (int64_t i = 0; i < swap_length; i++) { #if ARROW_LITTLE_ENDIAN out_data[i] = BitUtil::FromBigEndian(in_data[i]); #else @@ -119,8 +120,10 @@ class ArrayDataEndianSwapper { return Status::OK(); } // offset has one more element rather than data->length - ARROW_ASSIGN_OR_RAISE(data_->buffers[index], - ByteSwapBuffer(data_->buffers[index], length_ + 1)); + ARROW_ASSIGN_OR_RAISE( + data_->buffers[index], + ByteSwapBuffer(data_->buffers[index], + data_->buffers[index]->size() + 1, length_ + 1)); return Status::OK(); } @@ -136,7 +139,8 @@ class ArrayDataEndianSwapper { Visit(const T& type) { using value_type = typename T::c_type; ARROW_ASSIGN_OR_RAISE(data_->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_)); + ByteSwapBuffer(data_->buffers[1], + data_->buffers[1]->size(), length_)); return Status::OK(); } @@ -194,7 +198,8 @@ class ArrayDataEndianSwapper { Status Visit(const DayTimeIntervalType& type) { ARROW_ASSIGN_OR_RAISE(data_->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_ * 2)); + ByteSwapBuffer( + data_->buffers[1], data_->buffers[1]->size(), length_ * 2)); return Status::OK(); } From ce28dc2490a1cb01fe2f159ae3fe7c944527f3b3 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 19 Dec 2020 09:10:02 +0000 Subject: [PATCH 45/63] fix test failure in fuzzing --- cpp/src/arrow/array/util.cc | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index dd463e733f4..41a5c3a108e 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -99,12 +99,11 @@ class ArrayDataEndianSwapper { template Result> ByteSwapBuffer(std::shared_ptr& in_buffer, - int64_t buffer_size, - int64_t swap_length) { + int64_t length, int64_t extra_size) { auto in_data = reinterpret_cast(in_buffer->data()); - ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(buffer_size)); + ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); auto out_data = reinterpret_cast(out_buffer->mutable_data()); - for (int64_t i = 0; i < swap_length; i++) { + for (int64_t i = 0; i < length + extra_size; i++) { #if ARROW_LITTLE_ENDIAN out_data[i] = BitUtil::FromBigEndian(in_data[i]); #else @@ -116,14 +115,12 @@ class ArrayDataEndianSwapper { template Status SwapOffset(int index) { - if (data_->buffers[index] == nullptr) { + if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) { return Status::OK(); } // offset has one more element rather than data->length - ARROW_ASSIGN_OR_RAISE( - data_->buffers[index], - ByteSwapBuffer(data_->buffers[index], - data_->buffers[index]->size() + 1, length_ + 1)); + ARROW_ASSIGN_OR_RAISE(data_->buffers[index], + ByteSwapBuffer(data_->buffers[index], length_, 1)); return Status::OK(); } @@ -139,8 +136,7 @@ class ArrayDataEndianSwapper { Visit(const T& type) { using value_type = typename T::c_type; ARROW_ASSIGN_OR_RAISE(data_->buffers[1], - ByteSwapBuffer(data_->buffers[1], - data_->buffers[1]->size(), length_)); + ByteSwapBuffer(data_->buffers[1], length_, 0)); return Status::OK(); } @@ -198,8 +194,7 @@ class ArrayDataEndianSwapper { Status Visit(const DayTimeIntervalType& type) { ARROW_ASSIGN_OR_RAISE(data_->buffers[1], - ByteSwapBuffer( - data_->buffers[1], data_->buffers[1]->size(), length_ * 2)); + ByteSwapBuffer(data_->buffers[1], length_ * 2, 0)); return Status::OK(); } From 3c0828504b47b4131aef542e98a9fe511581fd88 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 19 Dec 2020 09:41:57 +0000 Subject: [PATCH 46/63] update arguments --- cpp/src/arrow/ipc/reader.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index cb465907b6c..c05d4db0411 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1267,11 +1267,11 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { } else { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); ARROW_ASSIGN_OR_RAISE( auto batch, - ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, - &dictionary_memo_, options_, reader.get(), - swap_endian_)); + ReadRecordBatchInternal(*message->metadata(), schema_, + field_inclusion_mask_, context, reader.get())); ++stats_.num_record_batches; return listener_->OnRecordBatchDecoded(std::move(batch)); } @@ -1280,8 +1280,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { // Read dictionary from dictionary batch Status ReadDictionary(const Message& message) { DictionaryKind kind; - RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, &dictionary_memo_, options_, - &kind)); + IpcReadContext context(&dictionary_memo_, options_, swap_endian_); + RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind)); ++stats_.num_dictionary_batches; switch (kind) { case DictionaryKind::New: From ce1929bcfce8d37a79a42c79906621628709d099 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sat, 19 Dec 2020 09:53:21 +0000 Subject: [PATCH 47/63] fix lint error --- cpp/src/arrow/ipc/reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index c05d4db0411..a1593d40031 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -1270,8 +1270,8 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { IpcReadContext context(&dictionary_memo_, options_, swap_endian_); ARROW_ASSIGN_OR_RAISE( auto batch, - ReadRecordBatchInternal(*message->metadata(), schema_, - field_inclusion_mask_, context, reader.get())); + ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, + context, reader.get())); ++stats_.num_record_batches; return listener_->OnRecordBatchDecoded(std::move(batch)); } From 1d95fd2d612c2988162897a3889571fc232f5b0b Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 20 Dec 2020 04:39:13 +0000 Subject: [PATCH 48/63] Allocate Buffer when data is swapped --- cpp/src/arrow/array/util.cc | 64 ++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 41a5c3a108e..e1ca6f4e2da 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -77,8 +77,9 @@ class ArrayDataWrapper { class ArrayDataEndianSwapper { public: - ArrayDataEndianSwapper(std::shared_ptr& data, int64_t length) - : data_(data), length_(length) {} + ArrayDataEndianSwapper(std::shared_ptr& data, int64_t length, + std::shared_ptr* out) + : data_(data), length_(length), out_(out) {} Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); @@ -90,7 +91,7 @@ class ArrayDataEndianSwapper { int i = 0; for (const auto& child_field : child_fields) { ARROW_ASSIGN_OR_RAISE( - data_->child_data[i], + (*out_)->child_data[i], SwapEndianArrayData(data_->child_data[i], child_field.get()->type())); i++; } @@ -116,10 +117,11 @@ class ArrayDataEndianSwapper { template Status SwapOffset(int index) { if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) { + (*out_)->buffers[index] = data_->buffers[index]; return Status::OK(); } // offset has one more element rather than data->length - ARROW_ASSIGN_OR_RAISE(data_->buffers[index], + ARROW_ASSIGN_OR_RAISE((*out_)->buffers[index], ByteSwapBuffer(data_->buffers[index], length_, 1)); return Status::OK(); } @@ -135,7 +137,7 @@ class ArrayDataEndianSwapper { Status> Visit(const T& type) { using value_type = typename T::c_type; - ARROW_ASSIGN_OR_RAISE(data_->buffers[1], + ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], ByteSwapBuffer(data_->buffers[1], length_, 0)); return Status::OK(); } @@ -158,7 +160,7 @@ class ArrayDataEndianSwapper { new_data[idx + 1] = tmp; #endif } - data_->buffers[1] = std::move(new_buffer); + (*out_)->buffers[1] = std::move(new_buffer); return Status::OK(); } @@ -188,30 +190,43 @@ class ArrayDataEndianSwapper { new_data[idx + 3] = tmp0; #endif } - data_->buffers[1] = std::move(new_buffer); + (*out_)->buffers[1] = std::move(new_buffer); return Status::OK(); } Status Visit(const DayTimeIntervalType& type) { - ARROW_ASSIGN_OR_RAISE(data_->buffers[1], + ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], ByteSwapBuffer(data_->buffers[1], length_ * 2, 0)); return Status::OK(); } + Status CopyDataBuffer() { + ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], + data_->buffers[1]->CopySlice(0, data_->buffers[1]->size())); + return Status::OK(); + } + Status Visit(const NullType& type) { return Status::OK(); } - Status Visit(const BooleanType& type) { return Status::OK(); } - Status Visit(const Int8Type& type) { return Status::OK(); } - Status Visit(const UInt8Type& type) { return Status::OK(); } - Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } + Status Visit(const BooleanType& type) { return CopyDataBuffer(); } + Status Visit(const Int8Type& type) { return CopyDataBuffer(); } + Status Visit(const UInt8Type& type) { return CopyDataBuffer(); } + Status Visit(const FixedSizeBinaryType& type) { return CopyDataBuffer(); } Status Visit(const FixedSizeListType& type) { return Status::OK(); } Status Visit(const StructType& type) { return Status::OK(); } - Status Visit(const SparseUnionType& type) { return Status::OK(); } + Status Visit(const UnionType& type) { + (*out_)->buffers[1] = data_->buffers[1]; + if (type.mode() == UnionMode::DENSE) { + RETURN_NOT_OK(SwapSmallOffset(2)); + } + return Status::OK(); + } template enable_if_t::value || std::is_same::value, Status> Visit(const T& type) { RETURN_NOT_OK(SwapSmallOffset()); + (*out_)->buffers[2] = data_->buffers[2]; return Status::OK(); } @@ -221,6 +236,7 @@ class ArrayDataEndianSwapper { Status> Visit(const T& type) { RETURN_NOT_OK(SwapLargeOffset()); + (*out_)->buffers[2] = data_->buffers[2]; return Status::OK(); } @@ -238,34 +254,36 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status Visit(const DenseUnionType& type) { - RETURN_NOT_OK(SwapSmallOffset(2)); - return Status::OK(); - } - Status Visit(const DictionaryType& type) { RETURN_NOT_OK(SwapType(*type.index_type())); + (*out_)->dictionary = data_->dictionary; return Status::OK(); } Status Visit(const ExtensionType& type) { RETURN_NOT_OK(SwapType(*type.storage_type())); + (*out_)->dictionary = data_->dictionary; return Status::OK(); } std::shared_ptr& data_; int64_t length_; + std::shared_ptr* out_; }; Result> SwapEndianArrayData( std::shared_ptr& data, const std::shared_ptr& type) { - std::shared_ptr out; - internal::ArrayDataEndianSwapper swapper_visitor(data, data->length); + std::vector> buffers(data->buffers.size(), nullptr); + std::vector> child_data(data->child_data.size(), nullptr); + std::shared_ptr out = + ArrayData::Make(type, data->length, buffers, child_data, data->null_count, 0); + internal::ArrayDataEndianSwapper swapper_visitor(data, data->length, &out); DCHECK_OK(VisitTypeInline(*type, &swapper_visitor)); DCHECK_OK(swapper_visitor.SwapChildren((*type).fields())); - // TODO(kiszk): Change this soon since tentatively return swapped input ArrayData - DCHECK(data); - return data; + // copy null_bitmap + out->buffers[0] = data->buffers[0]; + DCHECK(out); + return out; } } // namespace internal From b29815bc6db52b2c7aae3bda21b58fcbd4e1e412 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 20 Dec 2020 05:32:35 +0000 Subject: [PATCH 49/63] fix test failure in fuzzing --- cpp/src/arrow/array/util.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index e1ca6f4e2da..0794a45a1c5 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -201,6 +201,9 @@ class ArrayDataEndianSwapper { } Status CopyDataBuffer() { + if (data_->buffers[1]->data() == nullptr) { + return Status::OK(); + } ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], data_->buffers[1]->CopySlice(0, data_->buffers[1]->size())); return Status::OK(); From bb85b32a5ee45d9ee17443c70de1107be1f28ede Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 7 Feb 2021 08:07:28 +0000 Subject: [PATCH 50/63] address review comments --- cpp/src/arrow/array/util.cc | 136 ++++++++++++++++++------------------ cpp/src/arrow/array/util.h | 3 +- cpp/src/arrow/ipc/reader.cc | 16 +++-- cpp/src/arrow/ipc/reader.h | 6 -- cpp/src/arrow/type.cc | 11 ++- cpp/src/arrow/type.h | 14 ++-- cpp/src/arrow/type_test.cc | 10 +++ 7 files changed, 100 insertions(+), 96 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 0794a45a1c5..b2304daba92 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -52,7 +52,7 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // Loading from ArrayData -namespace internal { +namespace { class ArrayDataWrapper { public: @@ -77,9 +77,13 @@ class ArrayDataWrapper { class ArrayDataEndianSwapper { public: - ArrayDataEndianSwapper(std::shared_ptr& data, int64_t length, - std::shared_ptr* out) - : data_(data), length_(length), out_(out) {} + ArrayDataEndianSwapper(const std::shared_ptr& data, int64_t length) + : data_(data), length_(length) { + const std::shared_ptr& type = data->type; + std::vector> buffers(data->buffers.size(), nullptr); + std::vector> child_data(data->child_data.size(), nullptr); + out_ = ArrayData::Make(type, data->length, buffers, child_data, data->null_count, 0); + } Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); @@ -87,49 +91,45 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status SwapChildren(std::vector> child_fields) { - int i = 0; - for (const auto& child_field : child_fields) { - ARROW_ASSIGN_OR_RAISE( - (*out_)->child_data[i], - SwapEndianArrayData(data_->child_data[i], child_field.get()->type())); - i++; + Status SwapChildren(const std::vector>& child_fields) { + for (size_t i = 0; i < child_fields.size(); i++) { + ARROW_ASSIGN_OR_RAISE(out_->child_data[i], + internal::SwapEndianArrayData(data_->child_data[i])); } return Status::OK(); } template - Result> ByteSwapBuffer(std::shared_ptr& in_buffer, - int64_t length, int64_t extra_size) { + Result> ByteSwapBuffer(const std::shared_ptr& in_buffer, + int64_t length) { + if (sizeof(T) == 1) { + // if data size is 1, element is not swapped. We can use the original buffer + return in_buffer; + } auto in_data = reinterpret_cast(in_buffer->data()); ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); auto out_data = reinterpret_cast(out_buffer->mutable_data()); - for (int64_t i = 0; i < length + extra_size; i++) { -#if ARROW_LITTLE_ENDIAN - out_data[i] = BitUtil::FromBigEndian(in_data[i]); -#else - out_data[i] = BitUtil::FromLittleEndian(in_data[i]); -#endif + for (int64_t i = 0; i < length; i++) { + out_data[i] = BitUtil::ByteSwap(in_data[i]); } + assert(0 <= in_buffer->size() - length * sizeof(T)); + std::memset(out_data + length, 0, in_buffer->size() - length * sizeof(T)); return std::move(out_buffer); } template - Status SwapOffset(int index) { + Status SwapOffsets(int index, int offset_length) { if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) { - (*out_)->buffers[index] = data_->buffers[index]; + out_->buffers[index] = data_->buffers[index]; return Status::OK(); } - // offset has one more element rather than data->length - ARROW_ASSIGN_OR_RAISE((*out_)->buffers[index], - ByteSwapBuffer(data_->buffers[index], length_, 1)); + // Except union, offset has one more element rather than data->length + ARROW_ASSIGN_OR_RAISE( + out_->buffers[index], + ByteSwapBuffer(data_->buffers[index], length_ + offset_length)); return Status::OK(); } - Status SwapSmallOffset(int index = 1) { return SwapOffset(index); } - - Status SwapLargeOffset() { return SwapOffset(1); } - template enable_if_t::value && !std::is_base_of::value && @@ -137,8 +137,8 @@ class ArrayDataEndianSwapper { Status> Visit(const T& type) { using value_type = typename T::c_type; - ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_, 0)); + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], + ByteSwapBuffer(data_->buffers[1], length_)); return Status::OK(); } @@ -160,7 +160,9 @@ class ArrayDataEndianSwapper { new_data[idx + 1] = tmp; #endif } - (*out_)->buffers[1] = std::move(new_buffer); + assert(0 <= data_->buffers[1]->size() - length * 16); + std::memset(new_data + length * 2, 0, data_->buffers[1]->size() - length * 16); + out_->buffers[1] = std::move(new_buffer); return Status::OK(); } @@ -190,36 +192,34 @@ class ArrayDataEndianSwapper { new_data[idx + 3] = tmp0; #endif } - (*out_)->buffers[1] = std::move(new_buffer); + assert(0 <= data_->buffers[1]->size() - length * 32); + std::memset(new_data + length * 4, 0, data_->buffers[1]->size() - length * 32); + out_->buffers[1] = std::move(new_buffer); return Status::OK(); } Status Visit(const DayTimeIntervalType& type) { - ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_ * 2, 0)); + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], + ByteSwapBuffer(data_->buffers[1], length_ * 2)); return Status::OK(); } - Status CopyDataBuffer() { - if (data_->buffers[1]->data() == nullptr) { - return Status::OK(); - } - ARROW_ASSIGN_OR_RAISE((*out_)->buffers[1], - data_->buffers[1]->CopySlice(0, data_->buffers[1]->size())); + Status ReuseDataBuffer() { + out_->buffers[1] = data_->buffers[1]; return Status::OK(); } Status Visit(const NullType& type) { return Status::OK(); } - Status Visit(const BooleanType& type) { return CopyDataBuffer(); } - Status Visit(const Int8Type& type) { return CopyDataBuffer(); } - Status Visit(const UInt8Type& type) { return CopyDataBuffer(); } - Status Visit(const FixedSizeBinaryType& type) { return CopyDataBuffer(); } + Status Visit(const BooleanType& type) { return ReuseDataBuffer(); } + Status Visit(const Int8Type& type) { return ReuseDataBuffer(); } + Status Visit(const UInt8Type& type) { return ReuseDataBuffer(); } + Status Visit(const FixedSizeBinaryType& type) { return ReuseDataBuffer(); } Status Visit(const FixedSizeListType& type) { return Status::OK(); } Status Visit(const StructType& type) { return Status::OK(); } Status Visit(const UnionType& type) { - (*out_)->buffers[1] = data_->buffers[1]; + out_->buffers[1] = data_->buffers[1]; if (type.mode() == UnionMode::DENSE) { - RETURN_NOT_OK(SwapSmallOffset(2)); + RETURN_NOT_OK(SwapOffsets(2, 0)); } return Status::OK(); } @@ -228,8 +228,8 @@ class ArrayDataEndianSwapper { enable_if_t::value || std::is_same::value, Status> Visit(const T& type) { - RETURN_NOT_OK(SwapSmallOffset()); - (*out_)->buffers[2] = data_->buffers[2]; + RETURN_NOT_OK(SwapOffsets(1, 1)); + out_->buffers[2] = data_->buffers[2]; return Status::OK(); } @@ -238,51 +238,51 @@ class ArrayDataEndianSwapper { std::is_same::value, Status> Visit(const T& type) { - RETURN_NOT_OK(SwapLargeOffset()); - (*out_)->buffers[2] = data_->buffers[2]; + RETURN_NOT_OK(SwapOffsets(1, 1)); + out_->buffers[2] = data_->buffers[2]; return Status::OK(); } Status Visit(const ListType& type) { - RETURN_NOT_OK(SwapSmallOffset()); + RETURN_NOT_OK(SwapOffsets(1, 1)); return Status::OK(); } Status Visit(const LargeListType& type) { - RETURN_NOT_OK(SwapLargeOffset()); - return Status::OK(); - } - - Status Visit(const MapType& type) { - RETURN_NOT_OK(SwapSmallOffset()); + RETURN_NOT_OK(SwapOffsets(1, 1)); return Status::OK(); } Status Visit(const DictionaryType& type) { RETURN_NOT_OK(SwapType(*type.index_type())); - (*out_)->dictionary = data_->dictionary; + out_->dictionary = data_->dictionary; return Status::OK(); } Status Visit(const ExtensionType& type) { RETURN_NOT_OK(SwapType(*type.storage_type())); - (*out_)->dictionary = data_->dictionary; + out_->dictionary = data_->dictionary; return Status::OK(); } - std::shared_ptr& data_; + const std::shared_ptr& data_; int64_t length_; - std::shared_ptr* out_; + std::shared_ptr out_; }; +} // namespace + +namespace internal { + Result> SwapEndianArrayData( - std::shared_ptr& data, const std::shared_ptr& type) { - std::vector> buffers(data->buffers.size(), nullptr); - std::vector> child_data(data->child_data.size(), nullptr); - std::shared_ptr out = - ArrayData::Make(type, data->length, buffers, child_data, data->null_count, 0); - internal::ArrayDataEndianSwapper swapper_visitor(data, data->length, &out); + const std::shared_ptr& data) { + if (data->offset != 0) { + return Status::Invalid("Unsupported data format: data.offset != 0"); + } + const std::shared_ptr& type = data->type; + ArrayDataEndianSwapper swapper_visitor(data, data->length); DCHECK_OK(VisitTypeInline(*type, &swapper_visitor)); DCHECK_OK(swapper_visitor.SwapChildren((*type).fields())); + std::shared_ptr out = std::move(swapper_visitor.out_); // copy null_bitmap out->buffers[0] = data->buffers[0]; DCHECK(out); @@ -293,7 +293,7 @@ Result> SwapEndianArrayData( std::shared_ptr MakeArray(const std::shared_ptr& data) { std::shared_ptr out; - internal::ArrayDataWrapper wrapper_visitor(data, &out); + ArrayDataWrapper wrapper_visitor(data, &out); DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor)); DCHECK(out); return out; diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 7a4e811ac2f..14b18573481 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -58,11 +58,10 @@ namespace internal { /// \brief Swap endian of each element in a generic ArrayData /// \param[in] data the array contents -/// \param[in] type the array type /// \return the resulting Array instance whose elements were swapped ARROW_EXPORT Result> SwapEndianArrayData( - std::shared_ptr& data, const std::shared_ptr& type); + const std::shared_ptr& data); /// Given a number of ArrayVectors, treat each ArrayVector as the /// chunks of a chunked array. Then rechunk each ArrayVector such that diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index a1593d40031..9092152fa31 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -505,8 +505,7 @@ Result> LoadRecordBatchSubset( if (context.swap_endian) { for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData( - filtered_columns[i], filtered_columns[i]->type)); + arrow::internal::SwapEndianArrayData(filtered_columns[i])); } } return RecordBatch::Make(filtered_schema, metadata->length(), @@ -670,11 +669,11 @@ Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& opti // for fast lookups RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields, field_inclusion_mask, out_schema)); - *swap_endian = options.ensure_native_endian && !out_schema->get()->IsNativeEndianness(); + *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian(); if (*swap_endian) { // create a new schema with native endianness before swapping endian in ArrayData - *schema = schema->get()->WithNativeEndianness(); - *out_schema = out_schema->get()->WithNativeEndianness(); + *schema = schema->get()->WithEndianness(Endianness::Native); + *out_schema = out_schema->get()->WithEndianness(Endianness::Native); } return Status::OK(); } @@ -748,8 +747,7 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context, // swap endian in dict_data if necessary (swap_endian == true) if (context.swap_endian) { - ARROW_ASSIGN_OR_RAISE( - dict_data, ::arrow::internal::SwapEndianArrayData(dict_data, dict_data->type)); + ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data)); } if (dictionary_batch->isDelta()) { @@ -925,6 +923,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { DictionaryMemo dictionary_memo_; std::shared_ptr schema_, out_schema_; + + bool swap_endian_; }; // ---------------------------------------------------------------------- @@ -1137,6 +1137,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { std::shared_ptr out_schema_; ReadStats stats_; + + bool swap_endian_; }; Result> RecordBatchFileReader::Open( diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 3d4f0a31a47..fe9a3b72e16 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -96,9 +96,6 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader { /// \brief Return current read statistics virtual ReadStats stats() const = 0; - - protected: - bool swap_endian_; }; /// \brief Reads the record batch file format @@ -172,9 +169,6 @@ class ARROW_EXPORT RecordBatchFileReader { /// \brief Return current read statistics virtual ReadStats stats() const = 0; - - protected: - bool swap_endian_; }; /// \brief A general listener class to receive events. diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d5e5bd7a65d..a5f1cfae402 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1321,22 +1321,20 @@ Schema::Schema(std::vector> fields, Endianness endianness Schema::Schema(std::vector> fields, std::shared_ptr metadata) : detail::Fingerprintable(), - impl_(new Impl(std::move(fields), Endianness::NATIVE, std::move(metadata))) {} + impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {} Schema::Schema(const Schema& schema) : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} Schema::~Schema() = default; -std::shared_ptr Schema::WithNativeEndianness() const { - return std::make_shared(impl_->fields_, impl_->metadata_); +std::shared_ptr Schema::WithEndianness(Endianness endianness) const { + return std::make_shared(impl_->fields_, endianness, impl_->metadata_); } Endianness Schema::endianness() const { return impl_->endianness_; } -bool Schema::IsNativeEndianness() const { - return impl_->endianness_ == Endianness::NATIVE; -} +bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; } int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } @@ -1847,6 +1845,7 @@ std::string Schema::ComputeFingerprint() const { } ss << field_fingerprint << ";"; } + ss << (endianness() == Endianness::Little ? "L" : "B"); ss << "}"; return ss.str(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2bc37b4b116..e5c3ba51694 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1606,12 +1606,12 @@ class ARROW_EXPORT FieldRef { // Schema enum class Endianness { - LITTLE = 0, - BIG = 1, + Little = 0, + Big = 1, #if ARROW_LITTLE_ENDIAN - NATIVE = LITTLE + Native = Little #else - NATIVE = BIG + Native = Big #endif }; @@ -1636,16 +1636,16 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, bool Equals(const Schema& other, bool check_metadata = false) const; bool Equals(const std::shared_ptr& other, bool check_metadata = false) const; - /// \brief Replace endianness with platform-native endianness in the schema + /// \brief Set endianness in the schema /// /// \return new Schema - std::shared_ptr WithNativeEndianness() const; + std::shared_ptr WithEndianness(Endianness endianness) const; /// \brief Return endianness in the schema Endianness endianness() const; /// \brief Indicate if endianness is equal to platform-native endianness - bool IsNativeEndianness() const; + bool is_native_endian() const; /// \brief Return the number of fields (columns) in the schema int num_fields() const; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 81a0315d6d1..06b092921c5 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -475,6 +475,16 @@ TEST_F(TestSchema, Basics) { ASSERT_EQ(schema->fingerprint(), schema2->fingerprint()); ASSERT_NE(schema->fingerprint(), schema3->fingerprint()); + + auto schema4 = ::arrow::schema({f0}, Endianness::Little); + auto schema5 = ::arrow::schema({f0}, Endianness::Little); + auto schema6 = ::arrow::schema({f0}, Endianness::Big); + + AssertSchemaEqual(schema4, schema5); + AssertSchemaNotEqual(schema4, schema6); + + ASSERT_EQ(schema4->fingerprint(), schema5->fingerprint()); + ASSERT_NE(schema4->fingerprint(), schema6->fingerprint()); } TEST_F(TestSchema, ToString) { From 87ab14a9e3a95a2038b3b5cbf324944fd1c0be00 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 7 Feb 2021 08:15:50 +0000 Subject: [PATCH 51/63] add missing file --- cpp/src/arrow/ipc/metadata_internal.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index b4520121cd8..6a1f5ffe0eb 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -1337,8 +1337,8 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata)); // set endianess using the value in flatbuf schema auto endianness = schema->endianness() == flatbuf::Endianness::Little - ? Endianness::LITTLE - : Endianness::BIG; + ? Endianness::Little + : Endianness::Big; *out = ::arrow::schema(std::move(fields), endianness, metadata); return Status::OK(); } From fb2e041ea80e3081b8675da53bae2b7bd7c3c945 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 14 Feb 2021 20:02:57 +0000 Subject: [PATCH 52/63] address review comment --- cpp/src/arrow/array/util.cc | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b2304daba92..6f83da35fb9 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -100,8 +100,8 @@ class ArrayDataEndianSwapper { } template - Result> ByteSwapBuffer(const std::shared_ptr& in_buffer, - int64_t length) { + Result> ByteSwapBuffer( + const std::shared_ptr& in_buffer) { if (sizeof(T) == 1) { // if data size is 1, element is not swapped. We can use the original buffer return in_buffer; @@ -109,24 +109,22 @@ class ArrayDataEndianSwapper { auto in_data = reinterpret_cast(in_buffer->data()); ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size())); auto out_data = reinterpret_cast(out_buffer->mutable_data()); + int64_t length = in_buffer->size() / sizeof(T); for (int64_t i = 0; i < length; i++) { out_data[i] = BitUtil::ByteSwap(in_data[i]); } - assert(0 <= in_buffer->size() - length * sizeof(T)); - std::memset(out_data + length, 0, in_buffer->size() - length * sizeof(T)); return std::move(out_buffer); } template - Status SwapOffsets(int index, int offset_length) { + Status SwapOffsets(int index) { if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) { out_->buffers[index] = data_->buffers[index]; return Status::OK(); } // Except union, offset has one more element rather than data->length - ARROW_ASSIGN_OR_RAISE( - out_->buffers[index], - ByteSwapBuffer(data_->buffers[index], length_ + offset_length)); + ARROW_ASSIGN_OR_RAISE(out_->buffers[index], + ByteSwapBuffer(data_->buffers[index])); return Status::OK(); } @@ -138,7 +136,7 @@ class ArrayDataEndianSwapper { Visit(const T& type) { using value_type = typename T::c_type; ARROW_ASSIGN_OR_RAISE(out_->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_)); + ByteSwapBuffer(data_->buffers[1])); return Status::OK(); } @@ -147,6 +145,7 @@ class ArrayDataEndianSwapper { ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; + length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2); for (int64_t i = 0; i < length; i++) { uint64_t tmp; auto idx = i * 2; @@ -160,8 +159,6 @@ class ArrayDataEndianSwapper { new_data[idx + 1] = tmp; #endif } - assert(0 <= data_->buffers[1]->size() - length * 16); - std::memset(new_data + length * 2, 0, data_->buffers[1]->size() - length * 16); out_->buffers[1] = std::move(new_buffer); return Status::OK(); } @@ -171,6 +168,7 @@ class ArrayDataEndianSwapper { ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size())); auto new_data = reinterpret_cast(new_buffer->mutable_data()); int64_t length = length_; + length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4); for (int64_t i = 0; i < length; i++) { uint64_t tmp0, tmp1, tmp2; auto idx = i * 4; @@ -192,15 +190,12 @@ class ArrayDataEndianSwapper { new_data[idx + 3] = tmp0; #endif } - assert(0 <= data_->buffers[1]->size() - length * 32); - std::memset(new_data + length * 4, 0, data_->buffers[1]->size() - length * 32); out_->buffers[1] = std::move(new_buffer); return Status::OK(); } Status Visit(const DayTimeIntervalType& type) { - ARROW_ASSIGN_OR_RAISE(out_->buffers[1], - ByteSwapBuffer(data_->buffers[1], length_ * 2)); + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer(data_->buffers[1])); return Status::OK(); } @@ -219,7 +214,7 @@ class ArrayDataEndianSwapper { Status Visit(const UnionType& type) { out_->buffers[1] = data_->buffers[1]; if (type.mode() == UnionMode::DENSE) { - RETURN_NOT_OK(SwapOffsets(2, 0)); + RETURN_NOT_OK(SwapOffsets(2)); } return Status::OK(); } @@ -228,7 +223,7 @@ class ArrayDataEndianSwapper { enable_if_t::value || std::is_same::value, Status> Visit(const T& type) { - RETURN_NOT_OK(SwapOffsets(1, 1)); + RETURN_NOT_OK(SwapOffsets(1)); out_->buffers[2] = data_->buffers[2]; return Status::OK(); } @@ -238,28 +233,30 @@ class ArrayDataEndianSwapper { std::is_same::value, Status> Visit(const T& type) { - RETURN_NOT_OK(SwapOffsets(1, 1)); + RETURN_NOT_OK(SwapOffsets(1)); out_->buffers[2] = data_->buffers[2]; return Status::OK(); } Status Visit(const ListType& type) { - RETURN_NOT_OK(SwapOffsets(1, 1)); + RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); } Status Visit(const LargeListType& type) { - RETURN_NOT_OK(SwapOffsets(1, 1)); + RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); } Status Visit(const DictionaryType& type) { RETURN_NOT_OK(SwapType(*type.index_type())); + // dictionary was already swapped in ReadDictionary() in ipc/reader.cc out_->dictionary = data_->dictionary; return Status::OK(); } Status Visit(const ExtensionType& type) { RETURN_NOT_OK(SwapType(*type.storage_type())); + // dictionary was already swapped in ReadDictionary() in ipc/reader.cc out_->dictionary = data_->dictionary; return Status::OK(); } From 3c430476393d947aaeb99215a63095c28724a2e7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 14 Feb 2021 20:03:13 +0000 Subject: [PATCH 53/63] add test cases for SwapEndianArrayData --- cpp/src/arrow/array/array_test.cc | 324 ++++++++++++++++++++++++++++++ cpp/src/arrow/array/data.cc | 27 +++ cpp/src/arrow/array/data.h | 3 + 3 files changed, 354 insertions(+) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 89087ee318c..8e9309583a2 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -37,6 +37,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -2598,4 +2599,327 @@ TEST(TestRechunkArraysConsistently, Plain) { } } +// ---------------------------------------------------------------------- +// Test SwapEndianArrayData + + +/// \brief Indicate if fields are equals. +/// +/// \param[in] target ArrayData to be converted and tested +/// \param[in] expected result ArrayData +void AssertArrayDataEqualsWithSwapEndian(const std::shared_ptr& target, + const std::shared_ptr& expected) { + auto swap_array = MakeArray(*::arrow::internal::SwapEndianArrayData(target)); + auto expected_array = MakeArray(expected); + ASSERT_ARRAYS_EQUAL(*swap_array, *expected_array); + ASSERT_OK(swap_array->ValidateFull()); +} + +TEST(TestSwapEndianArrayData, PrimitiveType) { + auto null_buffer = Buffer::FromString("\xff"); + auto data_int_buffer = Buffer::FromString("01234567"); + + auto data = ArrayData::Make(null(), 0, {nullptr}, 0); + auto expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(boolean(), 8, {null_buffer, data_int_buffer}, 0); + expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(int8(), 8, {null_buffer, data_int_buffer}, 0); + expected_data = data; + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(uint16(), 4, {null_buffer, data_int_buffer}, 0); + auto data_int16_buffer = Buffer::FromString("10325476"); + expected_data = ArrayData::Make(uint16(), 4, {null_buffer, data_int16_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(int32(), 2, {null_buffer, data_int_buffer}, 0); + auto data_int32_buffer = Buffer::FromString("32107654"); + expected_data = ArrayData::Make(int32(), 2, {null_buffer, data_int32_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(uint64(), 1, {null_buffer, data_int_buffer}, 0); + auto data_int64_buffer = Buffer::FromString("76543210"); + expected_data = ArrayData::Make(uint64(), 1, {null_buffer, data_int64_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_16byte_buffer = Buffer::FromString("0123456789abcdef"); + data = ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_16byte_buffer}); + auto data_decimal128_buffer = Buffer::FromString("fedcba9876543210"); + expected_data = + ArrayData::Make(decimal128(38, 10), 1, {null_buffer, data_decimal128_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_32byte_buffer = Buffer::FromString("0123456789abcdef123456789ABCDEF0"); + data = ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_32byte_buffer}); + auto data_decimal256_buffer = Buffer::FromString("0FEDCBA987654321fedcba9876543210"); + expected_data = + ArrayData::Make(decimal256(76, 20), 1, {null_buffer, data_decimal256_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + auto data_float_buffer = Buffer::FromString("01200560"); + data = ArrayData::Make(float32(), 2, {null_buffer, data_float_buffer}, 0); + auto data_float32_buffer = Buffer::FromString("02100650"); + expected_data = ArrayData::Make(float32(), 2, {null_buffer, data_float32_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + data = ArrayData::Make(float64(), 1, {null_buffer, data_float_buffer}); + auto data_float64_buffer = Buffer::FromString("06500210"); + expected_data = ArrayData::Make(float64(), 1, {null_buffer, data_float64_buffer}, 0); + AssertArrayDataEqualsWithSwapEndian(data, expected_data); + + // With offset > 0 + data = + ArrayData::Make(int64(), 1, {null_buffer, data_int_buffer}, kUnknownNullCount, 1); + ASSERT_RAISES(Invalid, ::arrow::internal::SwapEndianArrayData(data)); +} + +std::shared_ptr ReplaceBuffers(const std::shared_ptr& data, + const int32_t buffer_index, + const std::vector& buffer_data) { + const auto test_data = std::make_shared(*data); + test_data->buffers[buffer_index] = + std::make_shared(buffer_data.data(), buffer_data.size()); + return test_data; +} + +std::shared_ptr ReplaceBuffersInChild(const std::shared_ptr& data, + const int32_t child_index, + const std::vector& child_data) { + const auto test_data = std::make_shared(*data); + // assume updating only buffer[1] in child_data + auto child_array_data = + std::make_shared(*test_data->child_data[child_index]); + child_array_data->buffers[1] = + std::make_shared(child_data.data(), child_data.size()); + test_data->child_data[child_index] = child_array_data; + return test_data; +} + +std::shared_ptr ReplaceBuffersInDictionary( + const std::shared_ptr& data, const int32_t buffer_index, + const std::vector& buffer_data) { + const auto test_data = std::make_shared(*data); + auto dict_array_data = std::make_shared(*test_data->dictionary); + dict_array_data->buffers[buffer_index] = + std::make_shared(buffer_data.data(), buffer_data.size()); + test_data->dictionary = dict_array_data; + return test_data; +} + + +TEST(TestSwapEndianArrayData, NonPrimitiveType) { + auto array = ArrayFromJSON(binary(), R"(["0123", null, "45"])"); + const std::vector offset1 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(large_binary(), R"(["01234", null, "567"])"); + const std::vector offset2 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 8}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset2); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(fixed_size_binary(3), R"(["012", null, "345"])"); + expected_data = array->data(); + AssertArrayDataEqualsWithSwapEndian(expected_data, expected_data); + + array = ArrayFromJSON(utf8(), R"(["ABCD", null, "EF"])"); + const std::vector offset4 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset4); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(large_utf8(), R"(["ABCDE", null, "FGH"])"); + const std::vector offset5 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, + 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 8}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, + 5, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset5); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type = std::make_shared(int32()); + array = ArrayFromJSON(type, "[[0, 1, 2, 3], null, [4, 5]]"); + const std::vector offset6 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; +#endif + const std::vector data6 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5}; +#else + {0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset6); + test_data = ReplaceBuffersInChild(test_data, 0, data6); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type7 = std::make_shared(int64()); + array = ArrayFromJSON(type7, "[[0, 1, 2], null, [3]]"); + const std::vector offset7 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0}; +#endif + const std::vector data7 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0}; +#endif + expected_data = array->data(); + test_data = ReplaceBuffers(expected_data, 1, offset7); + test_data = ReplaceBuffersInChild(test_data, 0, data7); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type8 = std::make_shared(int32(), 2); + array = ArrayFromJSON(type8, "[[0, 1], null, [2, 3]]"); + expected_data = array->data(); + const std::vector data8 = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; +#endif + test_data = ReplaceBuffersInChild(expected_data, 0, data8); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto type9 = dictionary(int32(), int16()); + auto dict = ArrayFromJSON(int16(), "[4, 5, 6, 7]"); + DictionaryArray array9(type9, ArrayFromJSON(int32(), "[0, 2, 3]"), dict); + expected_data = array9.data(); + const std::vector data9a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; +#endif + const std::vector data9b = +#if ARROW_LITTLE_ENDIAN + {0, 4, 0, 5, 0, 6, 0, 7}; +#else + {4, 0, 5, 0, 6, 0, 7, 0}; +#endif + test_data = ReplaceBuffers(expected_data, 1, data9a); + test_data = ReplaceBuffersInDictionary(test_data, 1, data9b); + // dictionary must be explicitly swapped + test_data->dictionary = *::arrow::internal::SwapEndianArrayData(test_data->dictionary); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + array = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([{"a": 4, "b": null}, {"a": null, "b": "foo"}])"); + expected_data = array->data(); + const std::vector data10a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 4, 0, 0, 0, 0}; +#else + {4, 0, 0, 0, 0, 0, 0, 0}; +#endif + const std::vector data10b = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0}; +#endif + test_data = ReplaceBuffersInChild(expected_data, 0, data10a); + test_data = ReplaceBuffersInChild(test_data, 1, data10b); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + auto expected_i8 = ArrayFromJSON(int8(), "[127, null, null, null, null]"); + auto expected_str = ArrayFromJSON(utf8(), R"([null, "abcd", null, null, ""])"); + auto expected_i32 = ArrayFromJSON(int32(), "[null, null, 1, 2, null]"); + std::vector expected_types_vector; + expected_types_vector.push_back(Type::INT8); + expected_types_vector.insert(expected_types_vector.end(), 2, Type::STRING); + expected_types_vector.insert(expected_types_vector.end(), 2, Type::INT32); + std::shared_ptr expected_types; + ArrayFromVector(expected_types_vector, &expected_types); + auto arr11 = SparseUnionArray::Make( + *expected_types, {expected_i8, expected_str, expected_i32}, {"i8", "str", "i32"}, + {Type::INT8, Type::STRING, Type::INT32}); + expected_data = (*arr11)->data(); + const std::vector data11a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0}; +#endif + const std::vector data11b = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0}; +#endif + test_data = ReplaceBuffersInChild(expected_data, 1, data11a); + test_data = ReplaceBuffersInChild(test_data, 2, data11b); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); + + expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); + expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); + expected_i32 = ArrayFromJSON(int32(), "[1, -259, 2]"); + auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); + auto arr12 = DenseUnionArray::Make( + *expected_types, *expected_offsets, {expected_i8, expected_str, expected_i32}, + {"i8", "str", "i32"}, {Type::INT8, Type::STRING, Type::INT32}); + expected_data = (*arr12)->data(); + const std::vector data12a = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, + 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2}; +#else + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0}; +#endif + const std::vector data12b = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6}; +#else + {0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0}; +#endif + const std::vector data12c = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 1, 255, 255, 254, 253, 0, 0, 0, 2}; +#else + {1, 0, 0, 0, 253, 254, 255, 255, 2, 0, 0, 0}; +#endif + test_data = ReplaceBuffers(expected_data, 2, data12a); + test_data = ReplaceBuffersInChild(test_data, 1, data12b); + test_data = ReplaceBuffersInChild(test_data, 2, data12c); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index e397a752cd8..dd76296c2b9 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -130,6 +130,33 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } +bool ArrayData::Equals(const ArrayData& other) const { + if (this == &other) { + return true; + } + + if (offset != other.offset || !type->Equals(other.type) || + buffers.size() != other.buffers.size() || + child_data.size() != other.child_data.size()) { + return false; + } + + for (size_t i = 0; i < buffers.size(); i++) { + if (buffers[i] != other.buffers[i] && !buffers[i]->Equals(*other.buffers[i].get())) { + return false; + } + } + + for (size_t i = 0; i < child_data.size(); i++) { + if (child_data[i] != other.child_data[i] && + !child_data[i]->Equals(*other.child_data[i].get())) { + return false; + } + } + + return (dictionary == other.dictionary || dictionary->Equals(*other.dictionary.get())); +} + // ---------------------------------------------------------------------- // Implement ArrayData::View diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 02a49949e1f..08e98a9052f 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -231,6 +231,9 @@ struct ARROW_EXPORT ArrayData { return null_count.load() != 0 && buffers[0] != NULLPTR; } + /// Return true if both ArrayDatas are the same size and contain the same bytes + bool Equals(const ArrayData& other) const; + std::shared_ptr type; int64_t length; mutable std::atomic null_count; From a9be9334f704f0ebe2863cb10110fa7e37c9c4f6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 14 Feb 2021 20:03:24 +0000 Subject: [PATCH 54/63] add usage --- dev/archery/tests/generate_files_for_endian_test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dev/archery/tests/generate_files_for_endian_test.sh b/dev/archery/tests/generate_files_for_endian_test.sh index 78ca8c659af..787379d614a 100755 --- a/dev/archery/tests/generate_files_for_endian_test.sh +++ b/dev/archery/tests/generate_files_for_endian_test.sh @@ -16,6 +16,11 @@ # specific language governing permissions and limitations # under the License. +# This script generates json and arrow files of each type (e.g. primitive) for integration endian test +# Usage: generate_files_for_endian_test.sh +# ARROW_DIR : where Arrow repository exists +# TMP_DIR : where files will be generated + : ${ARROW_DIR:=/arrow} : ${TMP_DIR:=/tmp/arrow} From 8b7593f6c2fb0e0a64cdd944b5abe4c6fe03b01f Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Sun, 14 Feb 2021 20:08:01 +0000 Subject: [PATCH 55/63] fix compilation error --- cpp/src/arrow/util/bit_block_counter.cc | 3 --- cpp/src/arrow/util/bit_block_counter.h | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bit_block_counter.cc b/cpp/src/arrow/util/bit_block_counter.cc index 57b2a408db4..c67cedc4a06 100644 --- a/cpp/src/arrow/util/bit_block_counter.cc +++ b/cpp/src/arrow/util/bit_block_counter.cc @@ -22,10 +22,7 @@ #include #include "arrow/buffer.h" -#include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/endian.h" -#include "arrow/util/ubsan.h" namespace arrow { namespace internal { diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h index 0b6199cf15e..803b825e1b2 100644 --- a/cpp/src/arrow/util/bit_block_counter.h +++ b/cpp/src/arrow/util/bit_block_counter.h @@ -25,6 +25,7 @@ #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" From 5dd00d35ecaa785d45af015b684c2a09ec49f4e7 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 15 Feb 2021 02:04:11 +0000 Subject: [PATCH 56/63] fix lint error --- cpp/src/arrow/array/array_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 8e9309583a2..619c1e63f63 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -2602,7 +2602,6 @@ TEST(TestRechunkArraysConsistently, Plain) { // ---------------------------------------------------------------------- // Test SwapEndianArrayData - /// \brief Indicate if fields are equals. /// /// \param[in] target ArrayData to be converted and tested @@ -2710,7 +2709,6 @@ std::shared_ptr ReplaceBuffersInDictionary( return test_data; } - TEST(TestSwapEndianArrayData, NonPrimitiveType) { auto array = ArrayFromJSON(binary(), R"(["0123", null, "45"])"); const std::vector offset1 = From 39c55d96b0ade651cb55a8ce4d51327d1572a2ea Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 15 Feb 2021 02:27:11 +0000 Subject: [PATCH 57/63] fix compilation error --- cpp/src/parquet/level_comparison_inc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/level_comparison_inc.h b/cpp/src/parquet/level_comparison_inc.h index f4cf7ab48e7..e21c3e5824d 100644 --- a/cpp/src/parquet/level_comparison_inc.h +++ b/cpp/src/parquet/level_comparison_inc.h @@ -17,6 +17,7 @@ #pragma once #include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "parquet/level_comparison.h" // Used to make sure ODR rule isn't violated. From a42b4dcf0c4857ca64cef563914e9d3b29e42167 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 15 Feb 2021 04:13:12 +0000 Subject: [PATCH 58/63] fix directory path --- ci/scripts/integration_arrow.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 54e170deba4..f8f4192cc4a 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -34,5 +34,5 @@ archery integration --with-all --run-flight \ # TODO: support other languages archery integration --with-cpp=1 --run-flight \ - --gold-dirs=$gold_dir/1.0.0_bigendian \ - --gold-dirs=$gold_dir/1.0.0_littleendian \ + --gold-dirs=$gold_dir/1.0.0-bigendian \ + --gold-dirs=$gold_dir/1.0.0-littleendian \ From ccb395eca137ad853dff3f77e197b26e3d187649 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 17 Feb 2021 03:34:30 +0000 Subject: [PATCH 59/63] address review comments --- cpp/src/arrow/array/array_test.cc | 143 ++++++++++++++++++------------ cpp/src/arrow/array/data.cc | 27 ------ cpp/src/arrow/array/data.h | 3 - cpp/src/arrow/array/util.cc | 24 ++--- 4 files changed, 91 insertions(+), 106 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 619c1e63f63..a97bf134604 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -37,7 +37,6 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" -#include "arrow/array/builder_nested.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -46,6 +45,7 @@ #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_compat.h" #include "arrow/testing/gtest_util.h" @@ -2679,7 +2679,7 @@ TEST(TestSwapEndianArrayData, PrimitiveType) { std::shared_ptr ReplaceBuffers(const std::shared_ptr& data, const int32_t buffer_index, const std::vector& buffer_data) { - const auto test_data = std::make_shared(*data); + const auto test_data = data->Copy(); test_data->buffers[buffer_index] = std::make_shared(buffer_data.data(), buffer_data.size()); return test_data; @@ -2688,10 +2688,9 @@ std::shared_ptr ReplaceBuffers(const std::shared_ptr& data std::shared_ptr ReplaceBuffersInChild(const std::shared_ptr& data, const int32_t child_index, const std::vector& child_data) { - const auto test_data = std::make_shared(*data); + const auto test_data = data->Copy(); // assume updating only buffer[1] in child_data - auto child_array_data = - std::make_shared(*test_data->child_data[child_index]); + auto child_array_data = test_data->child_data[child_index]->Copy(); child_array_data->buffers[1] = std::make_shared(child_data.data(), child_data.size()); test_data->child_data[child_index] = child_array_data; @@ -2701,15 +2700,15 @@ std::shared_ptr ReplaceBuffersInChild(const std::shared_ptr ReplaceBuffersInDictionary( const std::shared_ptr& data, const int32_t buffer_index, const std::vector& buffer_data) { - const auto test_data = std::make_shared(*data); - auto dict_array_data = std::make_shared(*test_data->dictionary); + const auto test_data = data->Copy(); + auto dict_array_data = test_data->dictionary->Copy(); dict_array_data->buffers[buffer_index] = std::make_shared(buffer_data.data(), buffer_data.size()); test_data->dictionary = dict_array_data; return test_data; } -TEST(TestSwapEndianArrayData, NonPrimitiveType) { +TEST(TestSwapEndianArrayData, BinaryType) { auto array = ArrayFromJSON(binary(), R"(["0123", null, "45"])"); const std::vector offset1 = #if ARROW_LITTLE_ENDIAN @@ -2737,20 +2736,22 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { array = ArrayFromJSON(fixed_size_binary(3), R"(["012", null, "345"])"); expected_data = array->data(); AssertArrayDataEqualsWithSwapEndian(expected_data, expected_data); +} - array = ArrayFromJSON(utf8(), R"(["ABCD", null, "EF"])"); - const std::vector offset4 = +TEST(TestSwapEndianArrayData, StringType) { + auto array = ArrayFromJSON(utf8(), R"(["ABCD", null, "EF"])"); + const std::vector offset1 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; #else {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; #endif - expected_data = array->data(); - test_data = ReplaceBuffers(expected_data, 1, offset4); + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); array = ArrayFromJSON(large_utf8(), R"(["ABCDE", null, "FGH"])"); - const std::vector offset5 = + const std::vector offset2 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 8}; @@ -2759,31 +2760,33 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { 5, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0}; #endif expected_data = array->data(); - test_data = ReplaceBuffers(expected_data, 1, offset5); + test_data = ReplaceBuffers(expected_data, 1, offset2); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} - auto type = std::make_shared(int32()); - array = ArrayFromJSON(type, "[[0, 1, 2, 3], null, [4, 5]]"); - const std::vector offset6 = +TEST(TestSwapEndianArrayData, ListType) { + auto type1 = std::make_shared(int32()); + auto array = ArrayFromJSON(type1, "[[0, 1, 2, 3], null, [4, 5]]"); + const std::vector offset1 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6}; #else {0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0}; #endif - const std::vector data6 = + const std::vector data1 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5}; #else {0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0}; #endif - expected_data = array->data(); - test_data = ReplaceBuffers(expected_data, 1, offset6); - test_data = ReplaceBuffersInChild(test_data, 0, data6); + auto expected_data = array->data(); + auto test_data = ReplaceBuffers(expected_data, 1, offset1); + test_data = ReplaceBuffersInChild(test_data, 0, data1); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); - auto type7 = std::make_shared(int64()); - array = ArrayFromJSON(type7, "[[0, 1, 2], null, [3]]"); - const std::vector offset7 = + auto type2 = std::make_shared(int64()); + array = ArrayFromJSON(type2, "[[0, 1, 2], null, [3]]"); + const std::vector offset2 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4}; @@ -2791,7 +2794,7 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0}; #endif - const std::vector data7 = + const std::vector data2 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3}; @@ -2800,63 +2803,69 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0}; #endif expected_data = array->data(); - test_data = ReplaceBuffers(expected_data, 1, offset7); - test_data = ReplaceBuffersInChild(test_data, 0, data7); + test_data = ReplaceBuffers(expected_data, 1, offset2); + test_data = ReplaceBuffersInChild(test_data, 0, data2); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); - auto type8 = std::make_shared(int32(), 2); - array = ArrayFromJSON(type8, "[[0, 1], null, [2, 3]]"); + auto type3 = std::make_shared(int32(), 2); + array = ArrayFromJSON(type3, "[[0, 1], null, [2, 3]]"); expected_data = array->data(); - const std::vector data8 = + const std::vector data3 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; #else {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; #endif - test_data = ReplaceBuffersInChild(expected_data, 0, data8); + test_data = ReplaceBuffersInChild(expected_data, 0, data3); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} - auto type9 = dictionary(int32(), int16()); +TEST(TestSwapEndianArrayData, DictionaryType) { + auto type = dictionary(int32(), int16()); auto dict = ArrayFromJSON(int16(), "[4, 5, 6, 7]"); - DictionaryArray array9(type9, ArrayFromJSON(int32(), "[0, 2, 3]"), dict); - expected_data = array9.data(); - const std::vector data9a = + DictionaryArray array(type, ArrayFromJSON(int32(), "[0, 2, 3]"), dict); + auto expected_data = array.data(); + const std::vector data1 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3}; #else {0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; #endif - const std::vector data9b = + const std::vector data2 = #if ARROW_LITTLE_ENDIAN {0, 4, 0, 5, 0, 6, 0, 7}; #else {4, 0, 5, 0, 6, 0, 7, 0}; #endif - test_data = ReplaceBuffers(expected_data, 1, data9a); - test_data = ReplaceBuffersInDictionary(test_data, 1, data9b); + auto test_data = ReplaceBuffers(expected_data, 1, data1); + test_data = ReplaceBuffersInDictionary(test_data, 1, data2); // dictionary must be explicitly swapped test_data->dictionary = *::arrow::internal::SwapEndianArrayData(test_data->dictionary); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} - array = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), - R"([{"a": 4, "b": null}, {"a": null, "b": "foo"}])"); - expected_data = array->data(); - const std::vector data10a = +TEST(TestSwapEndianArrayData, StructType) { + auto array = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), + R"([{"a": 4, "b": null}, {"a": null, "b": "foo"}])"); + auto expected_data = array->data(); + const std::vector data1 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 4, 0, 0, 0, 0}; #else {4, 0, 0, 0, 0, 0, 0, 0}; #endif - const std::vector data10b = + const std::vector data2 = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3}; #else {0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0}; #endif - test_data = ReplaceBuffersInChild(expected_data, 0, data10a); - test_data = ReplaceBuffersInChild(test_data, 1, data10b); + auto test_data = ReplaceBuffersInChild(expected_data, 0, data1); + test_data = ReplaceBuffersInChild(test_data, 1, data2); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} +TEST(TestSwapEndianArrayData, UnionType) { auto expected_i8 = ArrayFromJSON(int8(), "[127, null, null, null, null]"); auto expected_str = ArrayFromJSON(utf8(), R"([null, "abcd", null, null, ""])"); auto expected_i32 = ArrayFromJSON(int32(), "[null, null, 1, 2, null]"); @@ -2866,35 +2875,35 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { expected_types_vector.insert(expected_types_vector.end(), 2, Type::INT32); std::shared_ptr expected_types; ArrayFromVector(expected_types_vector, &expected_types); - auto arr11 = SparseUnionArray::Make( + auto arr1 = SparseUnionArray::Make( *expected_types, {expected_i8, expected_str, expected_i32}, {"i8", "str", "i32"}, {Type::INT8, Type::STRING, Type::INT32}); - expected_data = (*arr11)->data(); - const std::vector data11a = + auto expected_data = (*arr1)->data(); + const std::vector data1a = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4}; #else {0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0}; #endif - const std::vector data11b = + const std::vector data1b = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0}; #else {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0}; #endif - test_data = ReplaceBuffersInChild(expected_data, 1, data11a); - test_data = ReplaceBuffersInChild(test_data, 2, data11b); + auto test_data = ReplaceBuffersInChild(expected_data, 1, data1a); + test_data = ReplaceBuffersInChild(test_data, 2, data1b); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]"); expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])"); expected_i32 = ArrayFromJSON(int32(), "[1, -259, 2]"); auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]"); - auto arr12 = DenseUnionArray::Make( + auto arr2 = DenseUnionArray::Make( *expected_types, *expected_offsets, {expected_i8, expected_str, expected_i32}, {"i8", "str", "i32"}, {Type::INT8, Type::STRING, Type::INT32}); - expected_data = (*arr12)->data(); - const std::vector data12a = + expected_data = (*arr2)->data(); + const std::vector data2a = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2}; @@ -2902,21 +2911,37 @@ TEST(TestSwapEndianArrayData, NonPrimitiveType) { {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0}; #endif - const std::vector data12b = + const std::vector data2b = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6}; #else {0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0}; #endif - const std::vector data12c = + const std::vector data2c = #if ARROW_LITTLE_ENDIAN {0, 0, 0, 1, 255, 255, 254, 253, 0, 0, 0, 2}; #else {1, 0, 0, 0, 253, 254, 255, 255, 2, 0, 0, 0}; #endif - test_data = ReplaceBuffers(expected_data, 2, data12a); - test_data = ReplaceBuffersInChild(test_data, 1, data12b); - test_data = ReplaceBuffersInChild(test_data, 2, data12c); + test_data = ReplaceBuffers(expected_data, 2, data2a); + test_data = ReplaceBuffersInChild(test_data, 1, data2b); + test_data = ReplaceBuffersInChild(test_data, 2, data2c); + AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); +} + +TEST(TestSwapEndianArrayData, ExtensionType) { + auto array_int16 = ArrayFromJSON(int16(), "[0, 1, 2, 3]"); + auto ext_data = array_int16->data()->Copy(); + ext_data->type = std::make_shared(); + auto array = MakeArray(ext_data); + auto expected_data = array->data(); + const std::vector data = +#if ARROW_LITTLE_ENDIAN + {0, 0, 0, 1, 0, 2, 0, 3}; +#else + {0, 0, 1, 0, 2, 0, 3, 0}; +#endif + auto test_data = ReplaceBuffers(expected_data, 1, data); AssertArrayDataEqualsWithSwapEndian(test_data, expected_data); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index dd76296c2b9..e397a752cd8 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -130,33 +130,6 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } -bool ArrayData::Equals(const ArrayData& other) const { - if (this == &other) { - return true; - } - - if (offset != other.offset || !type->Equals(other.type) || - buffers.size() != other.buffers.size() || - child_data.size() != other.child_data.size()) { - return false; - } - - for (size_t i = 0; i < buffers.size(); i++) { - if (buffers[i] != other.buffers[i] && !buffers[i]->Equals(*other.buffers[i].get())) { - return false; - } - } - - for (size_t i = 0; i < child_data.size(); i++) { - if (child_data[i] != other.child_data[i] && - !child_data[i]->Equals(*other.child_data[i].get())) { - return false; - } - } - - return (dictionary == other.dictionary || dictionary->Equals(*other.dictionary.get())); -} - // ---------------------------------------------------------------------- // Implement ArrayData::View diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 08e98a9052f..02a49949e1f 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -231,9 +231,6 @@ struct ARROW_EXPORT ArrayData { return null_count.load() != 0 && buffers[0] != NULLPTR; } - /// Return true if both ArrayDatas are the same size and contain the same bytes - bool Equals(const ArrayData& other) const; - std::shared_ptr type; int64_t length; mutable std::atomic null_count; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 6f83da35fb9..d3049bc682c 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -79,10 +79,7 @@ class ArrayDataEndianSwapper { public: ArrayDataEndianSwapper(const std::shared_ptr& data, int64_t length) : data_(data), length_(length) { - const std::shared_ptr& type = data->type; - std::vector> buffers(data->buffers.size(), nullptr); - std::vector> child_data(data->child_data.size(), nullptr); - out_ = ArrayData::Make(type, data->length, buffers, child_data, data->null_count, 0); + out_ = data->Copy(); } Status SwapType(const DataType& type) { @@ -91,7 +88,7 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status SwapChildren(const std::vector>& child_fields) { + Status SwapChildren(const FieldVector& child_fields) { for (size_t i = 0; i < child_fields.size(); i++) { ARROW_ASSIGN_OR_RAISE(out_->child_data[i], internal::SwapEndianArrayData(data_->child_data[i])); @@ -199,16 +196,11 @@ class ArrayDataEndianSwapper { return Status::OK(); } - Status ReuseDataBuffer() { - out_->buffers[1] = data_->buffers[1]; - return Status::OK(); - } - Status Visit(const NullType& type) { return Status::OK(); } - Status Visit(const BooleanType& type) { return ReuseDataBuffer(); } - Status Visit(const Int8Type& type) { return ReuseDataBuffer(); } - Status Visit(const UInt8Type& type) { return ReuseDataBuffer(); } - Status Visit(const FixedSizeBinaryType& type) { return ReuseDataBuffer(); } + Status Visit(const BooleanType& type) { return Status::OK(); } + Status Visit(const Int8Type& type) { return Status::OK(); } + Status Visit(const UInt8Type& type) { return Status::OK(); } + Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); } Status Visit(const FixedSizeListType& type) { return Status::OK(); } Status Visit(const StructType& type) { return Status::OK(); } Status Visit(const UnionType& type) { @@ -250,14 +242,12 @@ class ArrayDataEndianSwapper { Status Visit(const DictionaryType& type) { RETURN_NOT_OK(SwapType(*type.index_type())); // dictionary was already swapped in ReadDictionary() in ipc/reader.cc - out_->dictionary = data_->dictionary; return Status::OK(); } Status Visit(const ExtensionType& type) { RETURN_NOT_OK(SwapType(*type.storage_type())); // dictionary was already swapped in ReadDictionary() in ipc/reader.cc - out_->dictionary = data_->dictionary; return Status::OK(); } @@ -278,7 +268,7 @@ Result> SwapEndianArrayData( const std::shared_ptr& type = data->type; ArrayDataEndianSwapper swapper_visitor(data, data->length); DCHECK_OK(VisitTypeInline(*type, &swapper_visitor)); - DCHECK_OK(swapper_visitor.SwapChildren((*type).fields())); + RETURN_NOT_OK(swapper_visitor.SwapChildren((*type).fields())); std::shared_ptr out = std::move(swapper_visitor.out_); // copy null_bitmap out->buffers[0] = data->buffers[0]; From 0cd74686adad9786d27a4235de2e4a0b6979b815 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 17 Feb 2021 03:35:22 +0000 Subject: [PATCH 60/63] avoid to run archery integration twice --- ci/scripts/integration_arrow.sh | 5 +---- dev/archery/archery/integration/runner.py | 5 +++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index f8f4192cc4a..aa23e5b7c18 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -30,9 +30,6 @@ pip install -e $arrow_dir/dev/archery archery integration --with-all --run-flight \ --gold-dirs=$gold_dir/0.14.1 \ --gold-dirs=$gold_dir/0.17.1 \ - --gold-dirs=$gold_dir/2.0.0-compression \ - -# TODO: support other languages -archery integration --with-cpp=1 --run-flight \ --gold-dirs=$gold_dir/1.0.0-bigendian \ --gold-dirs=$gold_dir/1.0.0-littleendian \ + --gold-dirs=$gold_dir/2.0.0-compression \ diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 2fe23a3b200..95394cdd37d 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -128,6 +128,11 @@ def _gold_tests(self, gold_dir): skip = set() if name == 'union' and prefix == '0.17.1': skip.add("Java") + if prefix == '1.0.0-bigendian' or prefix == '1.0.0-littleendian': + skip.add("Go") + skip.add("Java") + skip.add("JS") + skip.add("Rust") if prefix == '2.0.0-compression': skip.add("Go") skip.add("Java") From 5d7a4b1c34c3e836b6b72725e2b079225480ba13 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 17 Feb 2021 08:06:11 +0000 Subject: [PATCH 61/63] update IpcReadContext --- cpp/src/arrow/ipc/reader.cc | 39 ++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 9092152fa31..7e39ee1c484 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -111,13 +111,23 @@ Status InvalidMessageType(MessageType expected, MessageType actual) { /// \brief Structure to keep common arguments to be passed struct IpcReadContext { - IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap) - : dictionary_memo(memo), options(option), swap_endian(swap) {} + IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap, + MetadataVersion version = MetadataVersion::V5, + Compression::type kind = Compression::UNCOMPRESSED) + : dictionary_memo(memo), + options(option), + metadata_version(version), + compression(kind), + swap_endian(swap) {} DictionaryMemo* dictionary_memo; const IpcReadOptions& options; + MetadataVersion metadata_version; + + Compression::type compression; + /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements /// if this flag is true const bool swap_endian; @@ -455,9 +465,8 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op Result> LoadRecordBatchSubset( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector* inclusion_mask, const IpcReadContext& context, - MetadataVersion metadata_version, Compression::type compression, io::RandomAccessFile* file) { - ArrayLoader loader(metadata, metadata_version, context.options, file); + ArrayLoader loader(metadata, context.metadata_version, context.options, file); ArrayDataVector columns(schema->num_fields()); ArrayDataVector filtered_columns; @@ -497,8 +506,9 @@ Result> LoadRecordBatchSubset( filtered_schema = schema; filtered_columns = std::move(columns); } - if (compression != Compression::UNCOMPRESSED) { - RETURN_NOT_OK(DecompressBuffers(compression, context.options, &filtered_columns)); + if (context.compression != Compression::UNCOMPRESSED) { + RETURN_NOT_OK( + DecompressBuffers(context.compression, context.options, &filtered_columns)); } // swap endian in a set of ArrayData if necessary (swap_endian == true) @@ -515,14 +525,11 @@ Result> LoadRecordBatchSubset( Result> LoadRecordBatch( const flatbuf::RecordBatch* metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, const IpcReadContext& context, - MetadataVersion metadata_version, Compression::type compression, io::RandomAccessFile* file) { if (inclusion_mask.size() > 0) { - return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, - metadata_version, compression, file); + return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file); } else { - return LoadRecordBatchSubset(metadata, schema, /*param_name=*/nullptr, context, - metadata_version, compression, file); + return LoadRecordBatchSubset(metadata, schema, /*param_name=*/nullptr, context, file); } } @@ -600,7 +607,7 @@ Result> ReadRecordBatch( Result> ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, - const std::vector& inclusion_mask, const IpcReadContext& context, + const std::vector& inclusion_mask, IpcReadContext& context, io::RandomAccessFile* file) { const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); @@ -612,15 +619,15 @@ Result> ReadRecordBatchInternal( Compression::type compression; RETURN_NOT_OK(GetCompression(batch, &compression)); - if (compression == Compression::UNCOMPRESSED && + if (context.compression == Compression::UNCOMPRESSED && message->version() == flatbuf::MetadataVersion::V4) { // Possibly obtain codec information from experimental serialization format // in 0.17.x RETURN_NOT_OK(GetCompressionExperimental(message, &compression)); } - return LoadRecordBatch(batch, schema, inclusion_mask, context, - internal::GetMetadataVersion(message->version()), compression, - file); + context.compression = compression; + context.metadata_version = internal::GetMetadataVersion(message->version()); + return LoadRecordBatch(batch, schema, inclusion_mask, context, file); } // If we are selecting only certain fields, populate an inclusion mask for fast lookups. From bcf5bdb273b4dd66e7f40e4429107bb8685a820a Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 17 Feb 2021 14:43:33 +0100 Subject: [PATCH 62/63] Minor fixes and improvements --- cpp/src/arrow/array/util.cc | 19 ++++----- cpp/src/arrow/array/util.h | 6 ++- cpp/src/arrow/ipc/options.h | 14 +++++-- cpp/src/arrow/type.cc | 16 ++++++++ cpp/src/arrow/type.h | 3 ++ cpp/src/arrow/type_test.cc | 41 ++++++++++++++++++- .../generate_files_for_endian_test.sh | 24 +++++++---- 7 files changed, 99 insertions(+), 24 deletions(-) rename dev/archery/{tests => }/generate_files_for_endian_test.sh (64%) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index d3049bc682c..297745a2b17 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -85,6 +85,10 @@ class ArrayDataEndianSwapper { Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); RETURN_NOT_OK(SwapChildren(type.fields())); + if (internal::HasValidityBitmap(type.id())) { + // Copy null bitmap + out_->buffers[0] = data_->buffers[0]; + } return Status::OK(); } @@ -240,14 +244,13 @@ class ArrayDataEndianSwapper { } Status Visit(const DictionaryType& type) { - RETURN_NOT_OK(SwapType(*type.index_type())); // dictionary was already swapped in ReadDictionary() in ipc/reader.cc + RETURN_NOT_OK(SwapType(*type.index_type())); return Status::OK(); } Status Visit(const ExtensionType& type) { RETURN_NOT_OK(SwapType(*type.storage_type())); - // dictionary was already swapped in ReadDictionary() in ipc/reader.cc return Status::OK(); } @@ -265,15 +268,9 @@ Result> SwapEndianArrayData( if (data->offset != 0) { return Status::Invalid("Unsupported data format: data.offset != 0"); } - const std::shared_ptr& type = data->type; - ArrayDataEndianSwapper swapper_visitor(data, data->length); - DCHECK_OK(VisitTypeInline(*type, &swapper_visitor)); - RETURN_NOT_OK(swapper_visitor.SwapChildren((*type).fields())); - std::shared_ptr out = std::move(swapper_visitor.out_); - // copy null_bitmap - out->buffers[0] = data->buffers[0]; - DCHECK(out); - return out; + ArrayDataEndianSwapper swapper(data, data->length); + RETURN_NOT_OK(swapper.SwapType(*data->type)); + return std::move(swapper.out_); } } // namespace internal diff --git a/cpp/src/arrow/array/util.h b/cpp/src/arrow/array/util.h index 14b18573481..3ef4e08828f 100644 --- a/cpp/src/arrow/array/util.h +++ b/cpp/src/arrow/array/util.h @@ -57,8 +57,12 @@ Result> MakeArrayFromScalar( namespace internal { /// \brief Swap endian of each element in a generic ArrayData +/// +/// As dictionaries are often shared between different arrays, dictionaries +/// are not swapped by this function and should be handled separately. +/// /// \param[in] data the array contents -/// \return the resulting Array instance whose elements were swapped +/// \return the resulting ArrayData whose elements were swapped ARROW_EXPORT Result> SwapEndianArrayData( const std::shared_ptr& data); diff --git a/cpp/src/arrow/ipc/options.h b/cpp/src/arrow/ipc/options.h index 057d17780a3..2e0f800b5ad 100644 --- a/cpp/src/arrow/ipc/options.h +++ b/cpp/src/arrow/ipc/options.h @@ -137,10 +137,16 @@ struct ARROW_EXPORT IpcReadOptions { /// like decompression bool use_threads = true; - /// \brief Convert endian of data element to platform-native endianness - /// if the endianness of the received schema is not equal to - /// platform-native endianness. This is effective at RecordBatchFileReader.Open(), - /// RecordBatchStreamReader(), or StreamDecoder(). + /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness + /// + /// If the endianness of the received schema is not equal to platform-native + /// endianness, then all buffers with endian-sensitive data will be byte-swapped. + /// This includes the value buffers of numeric types, temporal types, decimal + /// types, as well as the offset buffers of variable-sized binary and list-like + /// types. + /// + /// Endianness conversion is achieved by the RecordBatchFileReader, + /// RecordBatchStreamReader and StreamDecoder classes. bool ensure_native_endian = true; static IpcReadOptions Defaults(); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a5f1cfae402..9192c325bbf 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1298,6 +1298,18 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } // ---------------------------------------------------------------------- // Schema implementation +std::string EndiannessToString(Endianness endianness) { + switch (endianness) { + case Endianness::Little: + return "little"; + case Endianness::Big: + return "big"; + default: + DCHECK(false) << "invalid endianness"; + return "???"; + } +} + class Schema::Impl { public: Impl(std::vector> fields, Endianness endianness, @@ -1502,6 +1514,10 @@ std::string Schema::ToString(bool show_metadata) const { ++i; } + if (impl_->endianness_ != Endianness::Native) { + buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --"; + } + if (show_metadata && HasMetadata()) { buffer << impl_->metadata_->ToString(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e5c3ba51694..0672354ab6c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1715,6 +1715,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, std::unique_ptr impl_; }; +ARROW_EXPORT +std::string EndiannessToString(Endianness endianness); + // ---------------------------------------------------------------------- /// \brief Convenience class to incrementally construct/merge schemas. diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 06b092921c5..da93e32936c 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -479,12 +479,27 @@ TEST_F(TestSchema, Basics) { auto schema4 = ::arrow::schema({f0}, Endianness::Little); auto schema5 = ::arrow::schema({f0}, Endianness::Little); auto schema6 = ::arrow::schema({f0}, Endianness::Big); + auto schema7 = ::arrow::schema({f0}); AssertSchemaEqual(schema4, schema5); AssertSchemaNotEqual(schema4, schema6); +#if ARROW_LITTLE_ENDIAN + AssertSchemaEqual(schema4, schema7); + AssertSchemaNotEqual(schema6, schema7); +#else + AssertSchemaNotEqual(schema4, schema6); + AssertSchemaEqual(schema6, schema7); +#endif ASSERT_EQ(schema4->fingerprint(), schema5->fingerprint()); ASSERT_NE(schema4->fingerprint(), schema6->fingerprint()); +#if ARROW_LITTLE_ENDIAN + ASSERT_EQ(schema4->fingerprint(), schema7->fingerprint()); + ASSERT_NE(schema6->fingerprint(), schema7->fingerprint()); +#else + ASSERT_NE(schema4->fingerprint(), schema7->fingerprint()); + ASSERT_EQ(schema6->fingerprint(), schema7->fingerprint()); +#endif } TEST_F(TestSchema, ToString) { @@ -505,14 +520,38 @@ f3: list)"; ASSERT_EQ(expected, result); result = schema->ToString(/*print_metadata=*/true); + std::string expected_with_metadata = expected + R"( +-- metadata -- +foo: bar)"; + + ASSERT_EQ(expected_with_metadata, result); + + // With swapped endianness +#if ARROW_LITTLE_ENDIAN + schema = schema->WithEndianness(Endianness::Big); + expected = R"(f0: int32 +f1: uint8 not null +f2: string +f3: list +-- endianness: big --)"; +#else + schema = schema->WithEndianness(Endianness::Little); expected = R"(f0: int32 f1: uint8 not null f2: string f3: list +-- endianness: little --)"; +#endif + + result = schema->ToString(); + ASSERT_EQ(expected, result); + + result = schema->ToString(/*print_metadata=*/true); + expected_with_metadata = expected + R"( -- metadata -- foo: bar)"; - ASSERT_EQ(expected, result); + ASSERT_EQ(expected_with_metadata, result); } TEST_F(TestSchema, GetFieldByName) { diff --git a/dev/archery/tests/generate_files_for_endian_test.sh b/dev/archery/generate_files_for_endian_test.sh similarity index 64% rename from dev/archery/tests/generate_files_for_endian_test.sh rename to dev/archery/generate_files_for_endian_test.sh index 787379d614a..54019ea570e 100755 --- a/dev/archery/tests/generate_files_for_endian_test.sh +++ b/dev/archery/generate_files_for_endian_test.sh @@ -18,16 +18,26 @@ # This script generates json and arrow files of each type (e.g. primitive) for integration endian test # Usage: generate_files_for_endian_test.sh -# ARROW_DIR : where Arrow repository exists -# TMP_DIR : where files will be generated +# ARROW_CPP_EXE_PATH : where Arrow C++ binaries can be found +# TMP_DIR : where files will be generated -: ${ARROW_DIR:=/arrow} +set -e + +: ${ARROW_CPP_EXE_PATH:=/arrow/cpp/build/debug/} : ${TMP_DIR:=/tmp/arrow} json_dir=$TMP_DIR/arrow.$$ mkdir -p $json_dir -archery integration --with-cpp=1 --tempdir=$json_dir -for f in $json_dir/*.json ; do $ARROW_DIR/cpp/build/debug/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; done -for f in $json_dir/*.arrow_file ; do $ARROW_DIR/cpp/build/debug/arrow-file-to-stream $f > ${f%.*}.stream; done -for f in $json_dir/*.json ; do gzip $f ; done + +archery integration --stop-on-error --with-cpp=1 --tempdir=$json_dir + +for f in $json_dir/*.json ; do + $ARROW_CPP_EXE_PATH/arrow-json-integration-test -mode JSON_TO_ARROW -json $f -arrow ${f%.*}.arrow_file -integration true ; +done +for f in $json_dir/*.arrow_file ; do + $ARROW_CPP_EXE_PATH/arrow-file-to-stream $f > ${f%.*}.stream; +done +for f in $json_dir/*.json ; do + gzip $f ; +done echo "The files are under $json_dir" From 252a08086733db4cad546b4ea0bbeb56962d5a5e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 17 Feb 2021 14:52:00 +0100 Subject: [PATCH 63/63] Add endianness conversion to feature matrix --- docs/source/cpp/compute.rst | 10 +++++----- docs/source/status.rst | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 4101c36ef8f..bb96dce7993 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -120,7 +120,7 @@ to numeric type which can accommodate any value from either input. .. _common-numeric-type: Common numeric type -~~~~~~~~~~~~~~~~~~~ +------------------- The common numeric type of a set of input numeric types is the smallest numeric type which can accommodate any value of any input. If any input is a floating @@ -482,14 +482,14 @@ These functions trim off characters on both sides (trim), or the left (ltrim) or +--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ * \(1) Only characters specified in :member:`TrimOptions::characters` will be -trimmed off. Both the input string as the `characters` argument are interepreted -as ASCII characters. + trimmed off. Both the input string and the `characters` argument are + interpreted as ASCII characters. * \(2) Only trim off ASCII whitespace characters (``'\t'``, ``'\n'``, ``'\v'``, -``'\f'``, ``'\r'`` and ``' '``). + ``'\f'``, ``'\r'`` and ``' '``). * \(3) Only characters specified in :member:`TrimOptions::characters` will be -trimmed off. + trimmed off. * \(4) Only trim off Unicode whitespace characters. diff --git a/docs/source/status.rst b/docs/source/status.rst index d3bb8216f5d..92c813a8541 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -128,6 +128,8 @@ IPC Format +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Buffer compression | ✓ | | | | | | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ +| Endianness conversion | ✓ (2) | | | | | | | ++-----------------------------+-------+-------+-------+------------+-------+-------+-------+ | Custom schema metadata | ✓ | ✓ | | | | ✓ | ✓ | +-----------------------------+-------+-------+-------+------------+-------+-------+-------+ @@ -135,6 +137,8 @@ Notes: * \(1) Delta dictionaries not supported on nested dictionaries +* \(2) Data with non-native endianness can be byte-swapped automatically when reading. + .. seealso:: The :ref:`format-ipc` specification.