From 05467be2f79c03d56c6962646309d323c753488e Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 31 May 2019 20:20:42 -0700 Subject: [PATCH 1/4] ubsan stuff --- .travis.yml | 2 + cpp/cmake_modules/DefineOptions.cmake | 2 + cpp/cmake_modules/san-config.cmake | 3 +- cpp/src/arrow/array/builder_binary.h | 9 ++++- cpp/src/arrow/buffer-builder.h | 8 +++- cpp/src/arrow/ipc/json-test.cc | 8 +++- cpp/src/arrow/ipc/metadata-internal.cc | 22 +++++++---- cpp/src/arrow/json/test-common.h | 4 +- cpp/src/arrow/util/rle-encoding.h | 2 +- cpp/src/arrow/util/ubsan.cc | 28 ++++++++++++++ cpp/src/arrow/util/ubsan.h | 53 ++++++++++++++++++++++++++ cpp/src/parquet/metadata.cc | 2 + cpp/src/parquet/schema.cc | 46 +++++++++++++++++----- cpp/src/parquet/types.cc | 8 ++++ cpp/src/parquet/types.h | 10 +++-- cpp/src/plasma/protocol.cc | 18 ++++++--- 16 files changed, 192 insertions(+), 33 deletions(-) create mode 100644 cpp/src/arrow/util/ubsan.cc create mode 100644 cpp/src/arrow/util/ubsan.h diff --git a/.travis.yml b/.travis.yml index 79ce8cd29ca..9920b1b21c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -73,6 +73,8 @@ matrix: - ARROW_TRAVIS_VERBOSE=1 - ARROW_TRAVIS_USE_SYSTEM_JAVA=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN + - ARROW_USE_ASAN=1 + - ARROW_USE_UBSAN=1 - CC="clang-7" - CXX="clang++-7" before_script: diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index c10b6a6c72b..b00af80e898 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -129,6 +129,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_USE_TSAN "Enable Thread Sanitizer checks" OFF) + define_option(ARROW_USE_UBSAN "Enable Undefined Behavior sanitizer checks" OFF) + #---------------------------------------------------------------------- set_option_category("Project component") diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index b2d3dac6d0b..95ef55389a8 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -34,6 +34,7 @@ endif() # - disable 'vptr' because it currently crashes somewhere in boost::intrusive::list code # - disable 'alignment' because unaligned access is really OK on Nehalem and we do it # all over the place. +# - disable 'function' because it appears to give a false positive https://github.com/google/sanitizers/issues/911 if(${ARROW_USE_UBSAN}) if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") @@ -45,7 +46,7 @@ if(${ARROW_USE_UBSAN}) endif() set( CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover=all" + "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function -fno-sanitize-recover=all" ) endif() diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index a04e308f83d..23a96450366 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -48,7 +48,10 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Append(const uint8_t* value, int32_t length) { ARROW_RETURN_NOT_OK(Reserve(1)); ARROW_RETURN_NOT_OK(AppendNextOffset()); - ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } UnsafeAppendToBitmap(true); return Status::OK(); @@ -246,7 +249,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); - byte_builder_.UnsafeAppend(value, byte_width_); + if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { + byte_builder_.UnsafeAppend(value, byte_width_); + } } void UnsafeAppend(util::string_view value) { diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h index dac2f398d73..f069ea4d7dd 100644 --- a/cpp/src/arrow/buffer-builder.h +++ b/cpp/src/arrow/buffer-builder.h @@ -29,6 +29,7 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" +#include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" namespace arrow { @@ -42,7 +43,12 @@ namespace arrow { class ARROW_EXPORT BufferBuilder { public: explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : pool_(pool), data_(NULLPTR), capacity_(0), size_(0) {} + : pool_(pool), + data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ + &util::internal::non_null_filler), + + capacity_(0), + size_(0) {} /// \brief Resize the buffer to the nearest multiple of 64 bytes /// diff --git a/cpp/src/arrow/ipc/json-test.cc b/cpp/src/arrow/ipc/json-test.cc index 2a98862081a..1b484842ed8 100644 --- a/cpp/src/arrow/ipc/json-test.cc +++ b/cpp/src/arrow/ipc/json-test.cc @@ -59,7 +59,9 @@ void TestSchemaRoundTrip(const Schema& schema) { std::string json_schema = sb.GetString(); rj::Document d; - d.Parse(json_schema); + // Pass explicit size to avoid ASAN issues with + // SIMD loads in RapidJson. + d.Parse(json_schema.data(), json_schema.size()); DictionaryMemo in_memo; std::shared_ptr out; @@ -83,7 +85,9 @@ void TestArrayRoundTrip(const Array& array) { std::string array_as_json = sb.GetString(); rj::Document d; - d.Parse(array_as_json); + // Pass explicit size to avoid ASAN issues with + // SIMD loads in RapidJson. + d.Parse(array_as_json.data(), array_as_json.size()); if (d.HasParseError()) { FAIL() << "JSON parsing failed"; diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 7a1e3b6c854..676a4774d4c 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -41,6 +41,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/ubsan.h" #include "arrow/visitor_inline.h" namespace arrow { @@ -630,7 +631,8 @@ class FieldToFlatbufferVisitor { type_ids.push_back(code); } - auto fb_type_ids = fbb_.CreateVector(type_ids); + auto fb_type_ids = + fbb_.CreateVector(util::MakeNonNull(type_ids.data()), type_ids.size()); type_offset_ = flatbuf::CreateUnion(fbb_, mode, fb_type_ids).Union(); return Status::OK(); @@ -653,7 +655,8 @@ class FieldToFlatbufferVisitor { Status GetResult(const std::shared_ptr& field, FieldOffset* offset) { auto fb_name = fbb_.CreateString(field->name()); RETURN_NOT_OK(VisitType(*field->type())); - auto fb_children = fbb_.CreateVector(children_); + auto fb_children = + fbb_.CreateVector(util::MakeNonNull(children_.data()), children_.size()); DictionaryOffset dictionary = 0; if (field->type()->id() == Type::DICTIONARY) { @@ -799,7 +802,7 @@ static Status WriteFieldNodes(FBB& fbb, const std::vector& nodes, } fb_nodes.emplace_back(node.length, node.null_count); } - *out = fbb.CreateVectorOfStructs(fb_nodes); + *out = fbb.CreateVectorOfStructs(util::MakeNonNull(fb_nodes.data()), fb_nodes.size()); return Status::OK(); } @@ -812,7 +815,9 @@ static Status WriteBuffers(FBB& fbb, const std::vector& buffers, const BufferMetadata& buffer = buffers[i]; fb_buffers.emplace_back(buffer.offset, buffer.length); } - *out = fbb.CreateVectorOfStructs(fb_buffers); + *out = + fbb.CreateVectorOfStructs(util::MakeNonNull(fb_buffers.data()), fb_buffers.size()); + return Status::OK(); } @@ -871,9 +876,11 @@ Status WriteTensorMessage(const Tensor& tensor, int64_t buffer_start_offset, dims.push_back(flatbuf::CreateTensorDim(fbb, tensor.shape()[i], name)); } - auto fb_shape = fbb.CreateVector(dims); - auto fb_strides = fbb.CreateVector(tensor.strides()); + auto fb_shape = fbb.CreateVector(util::MakeNonNull(dims.data()), dims.size()); + flatbuffers::Offset> fb_strides; + fb_strides = fbb.CreateVector(util::MakeNonNull(tensor.strides().data()), + tensor.strides().size()); int64_t body_length = tensor.size() * elem_size; flatbuf::Buffer buffer(buffer_start_offset, body_length); @@ -1004,7 +1011,7 @@ FileBlocksToFlatbuffer(FBB& fbb, const std::vector& blocks) { fb_blocks.emplace_back(block.offset, block.metadata_length, block.body_length); } - return fbb.CreateVectorOfStructs(fb_blocks); + return fbb.CreateVectorOfStructs(util::MakeNonNull(fb_blocks.data()), fb_blocks.size()); } Status WriteFileFooter(const Schema& schema, const std::vector& dictionaries, @@ -1035,7 +1042,6 @@ Status WriteFileFooter(const Schema& schema, const std::vector& dicti auto footer = flatbuf::CreateFooter(fbb, kCurrentMetadataVersion, fb_schema, fb_dictionaries, fb_record_batches); - fbb.Finish(footer); int32_t size = fbb.GetSize(); diff --git a/cpp/src/arrow/json/test-common.h b/cpp/src/arrow/json/test-common.h index b4d20694ce5..633cae2f6d6 100644 --- a/cpp/src/arrow/json/test-common.h +++ b/cpp/src/arrow/json/test-common.h @@ -170,7 +170,9 @@ inline static Status ParseFromString(ParseOptions options, string_view src_str, std::string PrettyPrint(string_view one_line) { rj::Document document; - document.Parse(one_line.data()); + + // Must pass size to avoid ASAN issues. + document.Parse(one_line.data(), one_line.size()); rj::StringBuffer sb; rj::PrettyWriter writer(sb); document.Accept(writer); diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index acefc8e3f75..739158a59a1 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -354,7 +354,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, int values_read = 0; int remaining_nulls = null_count; - internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); while (values_read < batch_size) { bool is_valid = bit_reader.IsSet(); diff --git a/cpp/src/arrow/util/ubsan.cc b/cpp/src/arrow/util/ubsan.cc new file mode 100644 index 00000000000..f3952f80e51 --- /dev/null +++ b/cpp/src/arrow/util/ubsan.cc @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { + +namespace internal { + +uint8_t non_null_filler = 0xFF; + +} // namespace internal +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/ubsan.h b/cpp/src/arrow/util/ubsan.h new file mode 100644 index 00000000000..f9fcfb54022 --- /dev/null +++ b/cpp/src/arrow/util/ubsan.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Contains utilities for making UBSan happy. + +#pragma once + +#include + +#include "arrow/util/macros.h" + +namespace arrow { +namespace util { + +namespace internal { + +static uint8_t non_null_filler; + +} // namespace internal + +/// \brief Returns maybe_null if not null or a non-null pointer to an arbitrary memory +/// that shouldn't be dereferenced. +/// +/// Memset/Memcpy are undefinfed when a nullptr is passed as an argument use this utility +/// method to wrap locations where this could happen. +/// +/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed. +/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve them. +template +inline T* MakeNonNull(T* maybe_null) { + if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) { + return maybe_null; + } + + return reinterpret_cast(&internal::non_null_filler); +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 38063495d45..b5266e200f7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -104,6 +104,8 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d return MakeTypedColumnStats(meta_data, descr); case Type::FIXED_LEN_BYTE_ARRAY: return MakeTypedColumnStats(meta_data, descr); + case Type::UNDEFINED: + break; } throw ParquetException("Can't decode page statistics for selected column type"); } diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 862dfc9720a..7bd7d02bfa8 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -409,6 +409,32 @@ std::unique_ptr GroupNode::FromParquet(const void* opaque_element, int nod return std::unique_ptr(group_node.release()); } +namespace { + +// If the parquet file is corrupted it is possible the type value decoded +// will not be in the range of format::Type::type, which is undefined behavior. +// This method prevents this by loading the value as the underlying type and checking +// to make sure it is in range. +template +inline typename ApiType::type SafeLoad(ThriftType* in) { + using ApiTypeEnum = typename ApiType::type; + using ApiTypeRawEnum = typename std::underlying_type::type; + static_assert(std::is_unsigned::value, + "Safe load doesn't test wrap around of signed values."); + static_assert(sizeof(ApiTypeEnum) >= sizeof(ThriftType), + "parquet type should always be the same size of larger then thrift type"); + typename std::underlying_type::type raw_value; + memcpy(&raw_value, in, sizeof(ThriftType)); + auto return_raw_value = static_cast(raw_value); + if (ARROW_PREDICT_FALSE(return_raw_value >= + static_cast(ApiType::UNDEFINED))) { + return ApiType::UNDEFINED; + } + return FromThrift(static_cast(raw_value)); +} + +} // namespace + std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, int node_id) { const format::SchemaElement* element = @@ -417,21 +443,23 @@ std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, std::unique_ptr primitive_node; if (element->__isset.logicalType) { // updated writer with logical type present - primitive_node = std::unique_ptr( - new PrimitiveNode(element->name, FromThrift(element->repetition_type), - LogicalAnnotation::FromThrift(element->logicalType), - FromThrift(element->type), element->type_length, node_id)); + primitive_node = std::unique_ptr(new PrimitiveNode( + element->name, SafeLoad(&(element->repetition_type)), + LogicalAnnotation::FromThrift(element->logicalType), + SafeLoad(&(element->type)), element->type_length, node_id)); } else if (element->__isset.converted_type) { // legacy writer with logical type present primitive_node = std::unique_ptr(new PrimitiveNode( - element->name, FromThrift(element->repetition_type), FromThrift(element->type), - FromThrift(element->converted_type), element->type_length, element->precision, - element->scale, node_id)); + element->name, SafeLoad(&(element->repetition_type)), + SafeLoad(&(element->type)), + SafeLoad(&(element->converted_type)), element->type_length, + element->precision, element->scale, node_id)); } else { // logical type not present primitive_node = std::unique_ptr(new PrimitiveNode( - element->name, FromThrift(element->repetition_type), NoAnnotation::Make(), - FromThrift(element->type), element->type_length, node_id)); + element->name, SafeLoad(&(element->repetition_type)), + NoAnnotation::Make(), SafeLoad(&(element->type)), element->type_length, + node_id)); } // Return as unique_ptr to the base type diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index 62083c33432..db48b246880 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -95,6 +95,7 @@ std::string FormatStatValue(Type::type parquet_type, const std::string& val) { case Type::FIXED_LEN_BYTE_ARRAY: { return val; } + case Type::UNDEFINED: default: break; } @@ -132,6 +133,7 @@ std::string FormatStatValue(Type::type parquet_type, const char* val) { result << val; break; } + case Type::UNDEFINED: default: break; } @@ -200,6 +202,7 @@ std::string TypeToString(Type::type t) { return "BYTE_ARRAY"; case Type::FIXED_LEN_BYTE_ARRAY: return "FIXED_LEN_BYTE_ARRAY"; + case Type::UNDEFINED: default: return "UNKNOWN"; } @@ -253,6 +256,7 @@ std::string LogicalTypeToString(LogicalType::type t) { return "BSON"; case LogicalType::INTERVAL: return "INTERVAL"; + case LogicalType::UNDEFINED: default: return "UNKNOWN"; } @@ -276,6 +280,7 @@ int GetTypeByteSize(Type::type parquet_type) { return type_traits::value_byte_size; case Type::FIXED_LEN_BYTE_ARRAY: return type_traits::value_byte_size; + case Type::UNDEFINED: default: return 0; } @@ -295,6 +300,7 @@ SortOrder::type DefaultSortOrder(Type::type primitive) { case Type::FIXED_LEN_BYTE_ARRAY: return SortOrder::UNSIGNED; case Type::INT96: + case Type::UNDEFINED: return SortOrder::UNKNOWN; } return SortOrder::UNKNOWN; @@ -330,6 +336,7 @@ SortOrder::type GetSortOrder(LogicalType::type converted, Type::type primitive) case LogicalType::INTERVAL: case LogicalType::NONE: // required instead of default case LogicalType::NA: // required instead of default + case LogicalType::UNDEFINED: return SortOrder::UNKNOWN; } return SortOrder::UNKNOWN; @@ -400,6 +407,7 @@ std::shared_ptr LogicalAnnotation::FromConvertedType( case LogicalType::NONE: return NoAnnotation::Make(); case LogicalType::NA: + case LogicalType::UNDEFINED: return UnknownAnnotation::Make(); } return UnknownAnnotation::Make(); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index a109ae1eb84..779ea6b9b5b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -61,7 +61,9 @@ struct Type { FLOAT = 4, DOUBLE = 5, BYTE_ARRAY = 6, - FIXED_LEN_BYTE_ARRAY = 7 + FIXED_LEN_BYTE_ARRAY = 7, + // Should always be last element. + UNDEFINED = 8 }; }; @@ -91,7 +93,9 @@ struct LogicalType { JSON, BSON, INTERVAL, - NA = 25 + NA = 25, + // Should always be last element. + UNDEFINED = 26 }; }; @@ -103,7 +107,7 @@ class LogicalType; // Mirrors parquet::FieldRepetitionType struct Repetition { - enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2 }; + enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 }; }; // Reference: diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index a8786477182..cf8a7f2bbfa 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -28,6 +28,7 @@ #ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif +#include "arrow/util/ubsan.h" namespace fb = plasma::flatbuf; @@ -49,7 +50,7 @@ ToFlatbuffer(flatbuffers::FlatBufferBuilder* fbb, const ObjectID* object_ids, for (int64_t i = 0; i < num_objects; i++) { results.push_back(fbb->CreateString(object_ids[i].binary())); } - return fbb->CreateVector(results); + return fbb->CreateVector(arrow::util::MakeNonNull(results.data()), results.size()); } Status PlasmaReceive(int sock, MessageType message_type, std::vector* buffer) { @@ -341,7 +342,9 @@ Status SendDeleteReply(int sock, const std::vector& object_ids, auto message = fb::CreatePlasmaDeleteReply( fbb, static_cast(object_ids.size()), ToFlatbuffer(&fbb, &object_ids[0], object_ids.size()), - fbb.CreateVector(reinterpret_cast(&errors[0]), object_ids.size())); + fbb.CreateVector( + arrow::util::MakeNonNull(reinterpret_cast(errors.data())), + object_ids.size())); return PlasmaSend(sock, MessageType::PlasmaDeleteReply, &fbb, message); } @@ -421,7 +424,9 @@ Status SendListReply(int sock, const ObjectTable& objects) { entry.second->construct_duration, digest); object_infos.push_back(info); } - auto message = fb::CreatePlasmaListReply(fbb, fbb.CreateVector(object_infos)); + auto message = fb::CreatePlasmaListReply( + fbb, fbb.CreateVector(arrow::util::MakeNonNull(object_infos.data()), + object_infos.size())); return PlasmaSend(sock, MessageType::PlasmaListReply, &fbb, message); } @@ -538,6 +543,7 @@ Status SendGetReply(int sock, ObjectID object_ids[], if (object.device_num != 0) { std::shared_ptr handle; RETURN_NOT_OK(object.ipc_handle->Serialize(arrow::default_memory_pool(), &handle)); + handles.push_back(fb::CreateCudaHandle(fbb, fbb.CreateVector(handle))); handles.push_back( fb::CreateCudaHandle(fbb, fbb.CreateVector(handle->data(), handle->size()))); } @@ -545,8 +551,10 @@ Status SendGetReply(int sock, ObjectID object_ids[], } auto message = fb::CreatePlasmaGetReply( fbb, ToFlatbuffer(&fbb, object_ids, num_objects), - fbb.CreateVectorOfStructs(objects.data(), num_objects), fbb.CreateVector(store_fds), - fbb.CreateVector(mmap_sizes), fbb.CreateVector(handles)); + fbb.CreateVectorOfStructs(arrow::util::MakeNonNull(objects.data()), num_objects), + fbb.CreateVector(arrow::util::MakeNonNull(store_fds.data()), store_fds.size()), + fbb.CreateVector(arrow::util::MakeNonNull(mmap_sizes.data()), mmap_sizes.size()), + fbb.CreateVector(arrow::util::MakeNonNull(handles.data()), handles.size())); return PlasmaSend(sock, MessageType::PlasmaGetReply, &fbb, message); } From dc300d4cb10739aae3995063563dce12166e0432 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 31 May 2019 20:26:56 -0700 Subject: [PATCH 2/4] add back asan --- ci/travis_before_script_cpp.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 21ed9dd85b1..ca01b55912f 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -121,6 +121,15 @@ if [ "$ARROW_TRAVIS_VALGRIND" == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_TEST_MEMCHECK=ON" fi +if [ "$ARROW_USE_ASAN" == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_USE_ASAN=ON" +fi + +if [ "$ARROW_USE_UBSAN" == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_USE_UBSAN=ON" +fi + + if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON" fi From 19ee908d72681c331220181b02386f2fd2eaa24f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 31 May 2019 23:56:24 -0700 Subject: [PATCH 3/4] handle signed and unsigned enum loading in parquet to support windows --- cpp/src/parquet/schema.cc | 70 +++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 7bd7d02bfa8..8fbab85f86b 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -415,23 +415,49 @@ namespace { // will not be in the range of format::Type::type, which is undefined behavior. // This method prevents this by loading the value as the underlying type and checking // to make sure it is in range. -template -inline typename ApiType::type SafeLoad(ThriftType* in) { +template +struct SafeLoader { using ApiTypeEnum = typename ApiType::type; using ApiTypeRawEnum = typename std::underlying_type::type; - static_assert(std::is_unsigned::value, - "Safe load doesn't test wrap around of signed values."); - static_assert(sizeof(ApiTypeEnum) >= sizeof(ThriftType), - "parquet type should always be the same size of larger then thrift type"); - typename std::underlying_type::type raw_value; - memcpy(&raw_value, in, sizeof(ThriftType)); - auto return_raw_value = static_cast(raw_value); - if (ARROW_PREDICT_FALSE(return_raw_value >= - static_cast(ApiType::UNDEFINED))) { - return ApiType::UNDEFINED; + + template + inline static ApiTypeRawEnum LoadRaw(ThriftType* in) { + static_assert( + sizeof(ApiTypeEnum) >= sizeof(ThriftType), + "parquet type should always be the same size of larger then thrift type"); + typename std::underlying_type::type raw_value; + memcpy(&raw_value, in, sizeof(ThriftType)); + return static_cast(raw_value); + } + + template + inline static ApiTypeEnum LoadChecked( + typename std::enable_if::type* in) { + auto raw_value = LoadRaw(in); + if (ARROW_PREDICT_FALSE(raw_value >= + static_cast(ApiType::UNDEFINED))) { + return ApiType::UNDEFINED; + } + return FromThrift(static_cast(raw_value)); + } + + template + inline static ApiTypeEnum LoadChecked( + typename std::enable_if::type* in) { + auto raw_value = LoadRaw(in); + if (ARROW_PREDICT_FALSE(raw_value >= + static_cast(ApiType::UNDEFINED) || + raw_value < 0)) { + return ApiType::UNDEFINED; + } + return FromThrift(static_cast(raw_value)); } - return FromThrift(static_cast(raw_value)); -} + + template + inline static ApiTypeEnum Load(ThriftType* in) { + return LoadChecked::value>(in); + } +}; } // namespace @@ -444,22 +470,22 @@ std::unique_ptr PrimitiveNode::FromParquet(const void* opaque_element, if (element->__isset.logicalType) { // updated writer with logical type present primitive_node = std::unique_ptr(new PrimitiveNode( - element->name, SafeLoad(&(element->repetition_type)), + element->name, SafeLoader::Load(&(element->repetition_type)), LogicalAnnotation::FromThrift(element->logicalType), - SafeLoad(&(element->type)), element->type_length, node_id)); + SafeLoader::Load(&(element->type)), element->type_length, node_id)); } else if (element->__isset.converted_type) { // legacy writer with logical type present primitive_node = std::unique_ptr(new PrimitiveNode( - element->name, SafeLoad(&(element->repetition_type)), - SafeLoad(&(element->type)), - SafeLoad(&(element->converted_type)), element->type_length, + element->name, SafeLoader::Load(&(element->repetition_type)), + SafeLoader::Load(&(element->type)), + SafeLoader::Load(&(element->converted_type)), element->type_length, element->precision, element->scale, node_id)); } else { // logical type not present primitive_node = std::unique_ptr(new PrimitiveNode( - element->name, SafeLoad(&(element->repetition_type)), - NoAnnotation::Make(), SafeLoad(&(element->type)), element->type_length, - node_id)); + element->name, SafeLoader::Load(&(element->repetition_type)), + NoAnnotation::Make(), SafeLoader::Load(&(element->type)), + element->type_length, node_id)); } // Return as unique_ptr to the base type From bba5824c4afabeadb959e856a4de1698a0d85308 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 1 Jun 2019 00:30:36 -0700 Subject: [PATCH 4/4] fix io/memory.cc asan --- cpp/src/arrow/io/memory.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 4e9c2ea79e4..22a9f9bb5fd 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -100,9 +100,11 @@ Status BufferOutputStream::Write(const void* data, int64_t nbytes) { return Status::IOError("OutputStream is closed"); } DCHECK(buffer_); - RETURN_NOT_OK(Reserve(nbytes)); - memcpy(mutable_data_ + position_, data, nbytes); - position_ += nbytes; + if (ARROW_PREDICT_TRUE(nbytes > 0)) { + RETURN_NOT_OK(Reserve(nbytes)); + memcpy(mutable_data_ + position_, data, nbytes); + position_ += nbytes; + } return Status::OK(); }