From e6d3d88523dde8093fef413d72e78d5409417760 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 27 Jan 2020 17:22:38 +0100 Subject: [PATCH 1/2] ARROW-7691: [C++] Check non-scalar Flatbuffers fields are not null We're discussing whether to make those fields required in the schema definitions (which would make validation automatic by the flatbuffers generated verifier), but in the meantime we can check those fields manually. This should fix a bunch of issues detected by OSS-Fuzz. --- cpp/src/arrow/ipc/metadata_internal.cc | 77 ++++++-------------------- cpp/src/arrow/ipc/metadata_internal.h | 23 ++++++-- cpp/src/arrow/ipc/reader.cc | 21 +++---- 3 files changed, 44 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 05f508df31c..94bb6cbd839 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -286,11 +286,7 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::Timestamp: { auto ts_type = static_cast(type_data); TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit()); - if (ts_type->timezone() != 0 && ts_type->timezone()->Length() > 0) { - *out = timestamp(unit, ts_type->timezone()->str()); - } else { - *out = timestamp(unit); - } + *out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone())); return Status::OK(); } case flatbuf::Type::Duration: { @@ -369,10 +365,7 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field, const KeyValueMetadata* field_metadata, std::shared_ptr* out) { auto type_data = field->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type"); RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out)); // Look for extension metadata in custom_metadata field @@ -474,14 +467,8 @@ Status KeyValueMetadataFromFlatbuffer(const KVVector* fb_metadata, metadata->reserve(fb_metadata->size()); for (const auto& pair : *fb_metadata) { - if (pair->key() == nullptr) { - return Status::IOError( - "Key-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } - if (pair->value() == nullptr) { - return Status::IOError( - "Value-pointer in custom metadata of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key"); + CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value"); metadata->Append(pair->key()->str(), pair->value()->str()); } @@ -776,9 +763,7 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // Reconstruct the data type auto children = field->children(); - if (children == nullptr) { - return Status::IOError("Children-pointer of flatbuffer-encoded Field is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(children, "Field.children"); std::vector> child_fields(children->size()); for (int i = 0; i < static_cast(children->size()); ++i) { RETURN_NOT_OK( @@ -786,6 +771,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona } RETURN_NOT_OK(TypeFromFlatbuffer(field, child_fields, metadata.get(), &type)); + auto field_name = StringFromFlatbuffers(field->name()); + const flatbuf::DictionaryEncoding* encoding = field->dictionary(); if (encoding != nullptr) { @@ -794,22 +781,14 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona // dictionary_memo std::shared_ptr index_type; auto int_data = encoding->indexType(); - if (int_data == nullptr) { - return Status::IOError( - "indexType-pointer in custom metadata of flatbuffer-encoded DictionaryEncoding " - "is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType"); RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type)); ARROW_ASSIGN_OR_RAISE(type, DictionaryType::Make(index_type, type, encoding->isOrdered())); - *out = ::arrow::field(field->name()->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); RETURN_NOT_OK(dictionary_memo->AddField(encoding->id(), *out)); } else { - auto name = field->name(); - if (name == nullptr) { - return Status::IOError("Name-pointer of flatbuffer-encoded Field is null."); - } - *out = ::arrow::field(name->str(), type, field->nullable(), metadata); + *out = ::arrow::field(field_name, type, field->nullable(), metadata); } return Status::OK(); } @@ -1183,17 +1162,15 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo, std::shared_ptr* out) { auto schema = static_cast(opaque_schema); - if (schema->fields() == nullptr) { - return Status::IOError("Fields-pointer of flatbuffer-encoded Schema is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(schema, "schema"); + CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields"); int num_fields = static_cast(schema->fields()->size()); std::vector> fields(num_fields); for (int i = 0; i < num_fields; ++i) { const flatbuf::Field* field = schema->fields()->Get(i); - if (field == nullptr) { - return Status::IOError("Field-pointer of flatbuffer-encoded Schema is null."); - } + // XXX I don't think this check is necessary (AP) + CHECK_FLATBUFFERS_NOT_NULL(field, "DictionaryEncoding.indexType"); RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i])); } @@ -1225,12 +1202,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type auto dim = tensor->shape()->Get(i); shape->push_back(dim->size()); - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } if (tensor->strides() && tensor->strides()->size() > 0) { @@ -1239,11 +1211,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr* type } } - auto type_data = tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded Tensor is null."); - } + auto type_data = tensor->type(); // Required return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type); } @@ -1283,12 +1251,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } if (dim_names) { - auto fb_name = dim->name(); - if (fb_name == 0) { - dim_names->push_back(""); - } else { - dim_names->push_back(fb_name->str()); - } + dim_names->push_back(StringFromFlatbuffers(dim->name())); } } } @@ -1324,11 +1287,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr } } - auto type_data = sparse_tensor->type(); - if (type_data == nullptr) { - return Status::IOError( - "Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null."); - } + auto type_data = sparse_tensor->type(); // Required if (type) { return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type); } else { diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index b2ac81f86e7..666c2789e26 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -31,20 +31,16 @@ #include "arrow/buffer.h" #include "arrow/ipc/dictionary.h" // IYWU pragma: keep #include "arrow/ipc/message.h" -#include "arrow/memory_pool.h" #include "arrow/sparse_tensor.h" #include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" #include "generated/Message_generated.h" #include "generated/Schema_generated.h" namespace arrow { -class DataType; -class Schema; -class Tensor; -class SparseTensor; - namespace flatbuf = org::apache::arrow::flatbuf; namespace io { @@ -92,6 +88,21 @@ struct FileBlock { int64_t body_length; }; +// Low-level utilities to help with reading Flatbuffers data. + +#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \ + if ((fb_value) == NULLPTR) { \ + return Status::IOError("Unexpected null field ", name, \ + " in flatbuffer-encoded metadata"); \ + } + +#define FLATBUFFERS_VECTOR_SIZE(fb_vector) \ + ((fb_vector == NULLPTR) ? 0 : fb_vector->size()) + +inline std::string StringFromFlatbuffers(const flatbuffers::String* s) { + return (s == NULLPTR) ? "" : s->str(); +} + // Read interface classes. We do not fully deserialize the flatbuffers so that // individual fields metadata can be retrieved from very large schema without // diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 8712d46d1c9..ff7fee59509 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -102,10 +102,7 @@ class IpcComponentSource { Status GetBuffer(int buffer_index, std::shared_ptr* out) { auto buffers = metadata_->buffers(); - if (buffers == nullptr) { - return Status::IOError( - "Buffers-pointer of flatbuffer-encoded RecordBatch is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); if (buffer_index >= static_cast(buffers->size())) { return Status::IOError("buffer_index out of range."); } @@ -127,9 +124,7 @@ class IpcComponentSource { Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); - if (nodes == nullptr) { - return Status::IOError("Nodes-pointer of flatbuffer-encoded Table is null."); - } + CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); // pop off a field if (field_index >= static_cast(nodes->size())) { return Status::Invalid("Ran out of field metadata, likely malformed"); @@ -441,6 +436,7 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, // The dictionary is embedded in a record batch with a single column std::shared_ptr batch; auto batch_meta = dictionary_batch->data(); + CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data"); RETURN_NOT_OK(ReadRecordBatch(batch_meta, ::arrow::schema({value_field}), dictionary_memo, options, file, &batch)); if (batch->num_columns() != 1) { @@ -475,9 +471,6 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl { } CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type()); CHECK_HAS_NO_BODY(*message); - if (message->header() == nullptr) { - return Status::IOError("Header-pointer of flatbuffer-encoded Message is null."); - } return internal::GetSchema(message->header(), &dictionary_memo_, &schema_); } @@ -663,9 +656,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { return Status::OK(); } - int num_dictionaries() const { return footer_->dictionaries()->size(); } + int num_dictionaries() const { + return FLATBUFFERS_VECTOR_SIZE(footer_->dictionaries()); + } - int num_record_batches() const { return footer_->recordBatches()->size(); } + int num_record_batches() const { + return FLATBUFFERS_VECTOR_SIZE(footer_->recordBatches()); + } MetadataVersion version() const { return internal::GetMetadataVersion(footer_->version()); From 02478a6202d02091c5fd4658bdc8693ba838f569 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 4 Feb 2020 13:50:21 +0100 Subject: [PATCH 2/2] Use a function rather than a macro --- cpp/src/arrow/ipc/metadata_internal.h | 6 ++++-- cpp/src/arrow/ipc/reader.cc | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index 666c2789e26..f5ffee3fa75 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -96,8 +96,10 @@ struct FileBlock { " in flatbuffer-encoded metadata"); \ } -#define FLATBUFFERS_VECTOR_SIZE(fb_vector) \ - ((fb_vector == NULLPTR) ? 0 : fb_vector->size()) +template +inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector* vec) { + return (vec == NULLPTR) ? 0 : vec->size(); +} inline std::string StringFromFlatbuffers(const flatbuffers::String* s) { return (s == NULLPTR) ? "" : s->str(); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index ff7fee59509..f99037a0d34 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -657,11 +657,11 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl { } int num_dictionaries() const { - return FLATBUFFERS_VECTOR_SIZE(footer_->dictionaries()); + return static_cast(internal::FlatBuffersVectorSize(footer_->dictionaries())); } int num_record_batches() const { - return FLATBUFFERS_VECTOR_SIZE(footer_->recordBatches()); + return static_cast(internal::FlatBuffersVectorSize(footer_->recordBatches())); } MetadataVersion version() const {