72 changes: 34 additions & 38 deletions cpp/src/arrow/array/array_test.cc
@@ -59,6 +59,7 @@
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/macros.h"
#include "arrow/util/range.h"
#include "arrow/visit_data_inline.h"
@@ -366,13 +367,12 @@ TEST_F(TestArray, BuildLargeInMemoryArray) {
ASSERT_EQ(length, result->length());
}

TEST_F(TestArray, TestMakeArrayOfNull) {
static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseTypes() {
FieldVector union_fields1({field("a", utf8()), field("b", int32())});
FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
std::vector<int8_t> union_type_codes{7, 42};

std::shared_ptr<DataType> types[] = {
// clang-format off
return {
null(),
boolean(),
int8(),
@@ -387,7 +387,7 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
utf8(),
large_utf8(),
list(utf8()),
list(int64()), // ARROW-9071
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
large_list(large_utf8()),
fixed_size_list(utf8(), 3),
fixed_size_list(int64(), 4),
@@ -397,13 +397,15 @@
sparse_union(union_fields2, union_type_codes),
dense_union(union_fields1, union_type_codes),
dense_union(union_fields2, union_type_codes),
smallint(), // extension type
list_extension_type(), // nested extension type
// clang-format on
smallint(), // extension type
list_extension_type(), // nested extension type
run_end_encoded(int16(), utf8()),
};
}

TEST_F(TestArray, TestMakeArrayOfNull) {
for (int64_t length : {0, 1, 16, 133}) {
for (auto type : types) {
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
ARROW_SCOPED_TRACE("type = ", type->ToString());
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(type, length));
ASSERT_EQ(array->type(), type);
@@ -716,36 +718,7 @@ void CheckSpanRoundTrip(const Array& array) {
}

TEST_F(TestArray, TestMakeEmptyArray) {
FieldVector union_fields1({field("a", utf8()), field("b", int32())});
FieldVector union_fields2({field("a", null()), field("b", list(large_utf8()))});
std::vector<int8_t> union_type_codes{7, 42};

std::shared_ptr<DataType> types[] = {null(),
boolean(),
int8(),
uint16(),
int32(),
uint64(),
float64(),
binary(),
large_binary(),
fixed_size_binary(3),
decimal(16, 4),
utf8(),
large_utf8(),
list(utf8()),
list(int64()),
large_list(large_utf8()),
fixed_size_list(utf8(), 3),
fixed_size_list(int64(), 4),
dictionary(int32(), utf8()),
struct_({field("a", utf8()), field("b", int32())}),
sparse_union(union_fields1, union_type_codes),
sparse_union(union_fields2, union_type_codes),
dense_union(union_fields1, union_type_codes),
dense_union(union_fields2, union_type_codes)};

for (auto type : types) {
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
ARROW_SCOPED_TRACE("type = ", type->ToString());
ASSERT_OK_AND_ASSIGN(auto array, MakeEmptyArray(type));
ASSERT_OK(array->ValidateFull());
@@ -754,6 +727,29 @@
}
}

TEST_F(TestArray, TestFillFromScalar) {
for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
ARROW_SCOPED_TRACE("type = ", type->ToString());
for (auto seed : {0u, 0xdeadbeef, 42u}) {
ARROW_SCOPED_TRACE("seed = ", seed);

Field field("", type, /*nullable=*/true,
key_value_metadata({{"extension_allow_random_storage", "true"}}));
auto array = random::GenerateArray(field, 1, seed);

ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0));

ArraySpan span(*scalar);
auto roundtripped_array = span.ToArray();
AssertArraysEqual(*array, *roundtripped_array);

ASSERT_OK(roundtripped_array->ValidateFull());
ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0));
AssertScalarsEqual(*scalar, *roundtripped_scalar);
}
}
}

TEST_F(TestArray, ExtensionSpanRoundTrip) {
// Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't
// work for extension types so we check that here
101 changes: 68 additions & 33 deletions cpp/src/arrow/array/data.cc
@@ -130,7 +130,8 @@ std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64
}

std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
ARROW_CHECK_LE(off, length) << "Slice offset greater than array length";
ARROW_CHECK_LE(off, length) << "Slice offset (" << off
<< ") greater than array length (" << length << ")";
Comment on lines +133 to +134

Contributor:
I wonder if this has to always perform int-to-string conversion, even when the check passes.

Member:
It shouldn't. The operator<< is only invoked when the assertion fails. Though operator precedence makes me slightly uneasy:

#define ARROW_CHECK_OR_LOG(condition, level) \
  ARROW_PREDICT_TRUE(condition)              \
  ? ARROW_IGNORE_EXPR(0)                     \
  : ::arrow::util::Voidify() & ARROW_LOG(level) << " Check failed: " #condition " "

Member:
We could actually add a test for that, IMHO.
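A minimal sketch of the kind of test suggested above (not part of this PR), assuming GoogleTest and the existing ARROW_CHECK_LE macro; the CountingStreamer helper is hypothetical and exists only to observe whether the streamed operands are evaluated when the check passes:

#include <ostream>

#include <gtest/gtest.h>

#include "arrow/util/logging.h"

namespace {
// Hypothetical helper: counts how many times it is streamed into an ostream.
struct CountingStreamer {
  mutable int calls = 0;
};
std::ostream& operator<<(std::ostream& os, const CountingStreamer& c) {
  ++c.calls;
  return os << "streamed";
}
}  // namespace

TEST(CheckMacros, StreamedOperandsNotEvaluatedWhenCheckPasses) {
  CountingStreamer counter;
  // The condition holds, so the << chain appended to the macro must not run.
  ARROW_CHECK_LE(1, 2) << "unexpected: " << counter;
  ASSERT_EQ(counter.calls, 0);
}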

len = std::min(length - off, len);
off += offset;

Expand Down Expand Up @@ -228,22 +229,20 @@ void ArraySpan::SetMembers(const ArrayData& data) {
namespace {

template <typename offset_type>
void SetOffsetsForScalar(ArraySpan* span, offset_type* buffer, int64_t value_size,
int buffer_index = 1) {
buffer[0] = 0;
buffer[1] = static_cast<offset_type>(value_size);
span->buffers[buffer_index].data = reinterpret_cast<uint8_t*>(buffer);
span->buffers[buffer_index].size = 2 * sizeof(offset_type);
BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) {
auto* offsets = reinterpret_cast<offset_type*>(scratch_space);
offsets[0] = 0;
offsets[1] = static_cast<offset_type>(value_size);
return {scratch_space, sizeof(offset_type) * 2};
}

int GetNumBuffers(const DataType& type) {
switch (type.id()) {
case Type::NA:
case Type::STRUCT:
case Type::FIXED_SIZE_LIST:
return 1;
case Type::RUN_END_ENCODED:
return 0;
return 1;
Member:
I merged a bit quickly, why does RUN_END_ENCODED need one buffer here?

Member Author:
Many places in the codebase assume that buffers.size() >= 1, even if buffers[0] == nullptr. When I added test cases which exercised REE scalars, those places segfaulted. I thought that requiring buffers.size() >= 1 for REE (as we do for union) was the most expeditious fix.

Member:
@felipecrv What do you think here? Should we require a one-element buffers vector for REE?

Contributor:
RUN_END_ENCODED doesn't need any buffer, but NA also does not and we return 1 here. 🤔

It's a "once you start lying you can't stop lying" kind of problem for GetNumBuffers.

Member Author:
FWIW, this was the only place in the codebase which didn't already give REE at least one buffer; the constructors, the builder, ... already did so.
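A hedged illustration of the access pattern described above (not code from this PR): call sites index buffers[0] unconditionally to decide whether a validity bitmap is present, so an ArrayData whose buffers vector is empty would index out of bounds even though a null buffers[0] is perfectly legal. The helper name is hypothetical:

#include "arrow/array/data.h"

// Assumes data.buffers.size() >= 1; the pointer stored there may still be null,
// which simply means the array has no validity bitmap.
bool HasAllocatedValidityBitmap(const arrow::ArrayData& data) {
  return data.buffers[0] != nullptr;
}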

case Type::BINARY:
case Type::LARGE_BINARY:
case Type::STRING:
@@ -265,16 +264,19 @@ int GetNumBuffers(const DataType& type) {
namespace internal {

void FillZeroLengthArray(const DataType* type, ArraySpan* span) {
memset(span->scratch_space, 0x00, sizeof(span->scratch_space));

span->type = type;
span->length = 0;
int num_buffers = GetNumBuffers(*type);
for (int i = 0; i < num_buffers; ++i) {
span->buffers[i].data = reinterpret_cast<uint8_t*>(span->scratch_space);
alignas(int64_t) static std::array<uint8_t, sizeof(int64_t) * 2> kZeros{0};
span->buffers[i].data = kZeros.data();
span->buffers[i].size = 0;
}

if (!HasValidityBitmap(type->id())) {
span->buffers[0] = {};
}

for (int i = num_buffers; i < 3; ++i) {
span->buffers[i] = {};
}
@@ -304,9 +306,13 @@ void ArraySpan::FillFromScalar(const Scalar& value) {

Type::type type_id = value.type->id();

// Populate null count and validity bitmap (only for non-union/null types)
this->null_count = value.is_valid ? 0 : 1;
if (!is_union(type_id) && type_id != Type::NA) {
if (type_id == Type::NA) {
this->null_count = 1;
} else if (!internal::HasValidityBitmap(type_id)) {
this->null_count = 0;
} else {
// Populate null count and validity bitmap
this->null_count = value.is_valid ? 0 : 1;
this->buffers[0].data = value.is_valid ? &kTrueBit : &kFalseBit;
this->buffers[0].size = 1;
}
@@ -329,20 +335,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
}
} else if (is_base_binary_like(type_id)) {
const auto& scalar = checked_cast<const BaseBinaryScalar&>(value);
this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);

const uint8_t* data_buffer = nullptr;
int64_t data_size = 0;
if (scalar.is_valid) {
data_buffer = scalar.value->data();
data_size = scalar.value->size();
}
if (is_binary_like(type_id)) {
SetOffsetsForScalar<int32_t>(this, reinterpret_cast<int32_t*>(this->scratch_space),
data_size);
this->buffers[1] =
OffsetsForScalar(scalar.scratch_space_, static_cast<int32_t>(data_size));
} else {
// is_large_binary_like
SetOffsetsForScalar<int64_t>(this, reinterpret_cast<int64_t*>(this->scratch_space),
data_size);
this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, data_size);
}
this->buffers[2].data = const_cast<uint8_t*>(data_buffer);
this->buffers[2].size = data_size;
@@ -367,11 +372,10 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
}

if (type_id == Type::LIST || type_id == Type::MAP) {
SetOffsetsForScalar<int32_t>(this, reinterpret_cast<int32_t*>(this->scratch_space),
value_length);
this->buffers[1] =
OffsetsForScalar(scalar.scratch_space_, static_cast<int32_t>(value_length));
} else if (type_id == Type::LARGE_LIST) {
SetOffsetsForScalar<int64_t>(this, reinterpret_cast<int64_t*>(this->scratch_space),
value_length);
this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length);
} else {
// FIXED_SIZE_LIST: does not have a second buffer
this->buffers[1] = {};
@@ -384,26 +388,31 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
this->child_data[i].FillFromScalar(*scalar.value[i]);
}
} else if (is_union(type_id)) {
// Dense union needs scratch space to store both offsets and a type code
struct UnionScratchSpace {
alignas(int64_t) int8_t type_code;
alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2];
};
static_assert(sizeof(UnionScratchSpace) <= sizeof(UnionScalar::scratch_space_));
auto* union_scratch_space = reinterpret_cast<UnionScratchSpace*>(
&checked_cast<const UnionScalar&>(value).scratch_space_);

// First buffer is kept null since unions have no validity vector
this->buffers[0] = {};

this->buffers[1].data = reinterpret_cast<uint8_t*>(this->scratch_space);
union_scratch_space->type_code = checked_cast<const UnionScalar&>(value).type_code;
this->buffers[1].data = reinterpret_cast<uint8_t*>(&union_scratch_space->type_code);
this->buffers[1].size = 1;
int8_t* type_codes = reinterpret_cast<int8_t*>(this->scratch_space);
type_codes[0] = checked_cast<const UnionScalar&>(value).type_code;

this->child_data.resize(this->type->num_fields());
if (type_id == Type::DENSE_UNION) {
const auto& scalar = checked_cast<const DenseUnionScalar&>(value);
// Has offset; start 4 bytes in so it's aligned to a 32-bit boundaries
SetOffsetsForScalar<int32_t>(this,
reinterpret_cast<int32_t*>(this->scratch_space) + 1, 1,
/*buffer_index=*/2);
this->buffers[2] =
OffsetsForScalar(union_scratch_space->offsets, static_cast<int32_t>(1));
// We can't "see" the other arrays in the union, but we put the "active"
// union array in the right place and fill zero-length arrays for the
// others
const std::vector<int>& child_ids =
checked_cast<const UnionType*>(this->type)->child_ids();
const auto& child_ids = checked_cast<const UnionType*>(this->type)->child_ids();
DCHECK_GE(scalar.type_code, 0);
DCHECK_LT(scalar.type_code, static_cast<int>(child_ids.size()));
for (int i = 0; i < static_cast<int>(this->child_data.size()); ++i) {
@@ -429,6 +438,32 @@ void ArraySpan::FillFromScalar(const Scalar& value) {

// Restore the extension type
this->type = value.type.get();
} else if (type_id == Type::RUN_END_ENCODED) {
const auto& scalar = checked_cast<const RunEndEncodedScalar&>(value);
this->child_data.resize(2);

auto set_run_end = [&](auto run_end) {
auto& e = this->child_data[0];
e.type = scalar.run_end_type().get();
e.length = 1;
e.null_count = 0;
e.buffers[1].data = scalar.scratch_space_;
e.buffers[1].size = sizeof(run_end);
reinterpret_cast<decltype(run_end)*>(scalar.scratch_space_)[0] = run_end;
};

switch (scalar.run_end_type()->id()) {
case Type::INT16:
set_run_end(static_cast<int16_t>(1));
break;
case Type::INT32:
set_run_end(static_cast<int32_t>(1));
break;
default:
DCHECK_EQ(scalar.run_end_type()->id(), Type::INT64);
set_run_end(static_cast<int64_t>(1));
}
this->child_data[1].FillFromScalar(*scalar.value);
} else {
DCHECK_EQ(Type::NA, type_id) << "should be unreachable: " << *value.type;
}
5 changes: 0 additions & 5 deletions cpp/src/arrow/array/data.h
@@ -372,11 +372,6 @@ struct ARROW_EXPORT ArraySpan {
int64_t offset = 0;
BufferSpan buffers[3];

// 16 bytes of scratch space to enable this ArraySpan to be a view onto
// scalar values including binary scalars (where we need to create a buffer
// that looks like two 32-bit or 64-bit offsets)
uint64_t scratch_space[2];

ArraySpan() = default;

explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}
28 changes: 20 additions & 8 deletions cpp/src/arrow/array/util.cc
@@ -554,13 +554,18 @@ class NullArrayFactory {
}

Status Visit(const RunEndEncodedType& type) {
ARROW_ASSIGN_OR_RAISE(auto values, MakeArrayOfNull(type.value_type(), 1, pool_));
ARROW_ASSIGN_OR_RAISE(auto run_end_scalar,
MakeScalarForRunEndValue(*type.run_end_type(), length_));
ARROW_ASSIGN_OR_RAISE(auto run_ends, MakeArrayFromScalar(*run_end_scalar, 1, pool_));
ARROW_ASSIGN_OR_RAISE(auto ree_array,
RunEndEncodedArray::Make(length_, run_ends, values));
out_ = ree_array->data();
std::shared_ptr<Array> run_ends, values;
if (length_ == 0) {
ARROW_ASSIGN_OR_RAISE(run_ends, MakeEmptyArray(type.run_end_type(), pool_));
ARROW_ASSIGN_OR_RAISE(values, MakeEmptyArray(type.value_type(), pool_));
} else {
ARROW_ASSIGN_OR_RAISE(auto run_end_scalar,
MakeScalarForRunEndValue(*type.run_end_type(), length_));
ARROW_ASSIGN_OR_RAISE(run_ends, MakeArrayFromScalar(*run_end_scalar, 1, pool_));
ARROW_ASSIGN_OR_RAISE(values, MakeArrayOfNull(type.value_type(), 1, pool_));
}
out_->child_data[0] = run_ends->data();
out_->child_data[1] = values->data();
return Status::OK();
}

@@ -582,7 +587,7 @@
}

MemoryPool* pool_;
std::shared_ptr<DataType> type_;
const std::shared_ptr<DataType>& type_;
int64_t length_;
std::shared_ptr<ArrayData> out_;
std::shared_ptr<Buffer> buffer_;
@@ -859,6 +864,13 @@ Result<std::shared_ptr<Array>> MakeArrayFromScalar(const Scalar& scalar, int64_t

Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
MemoryPool* memory_pool) {
if (type->id() == Type::EXTENSION) {
const auto& ext_type = checked_cast<const ExtensionType&>(*type);
ARROW_ASSIGN_OR_RAISE(auto storage,
MakeEmptyArray(ext_type.storage_type(), memory_pool));
storage->data()->type = std::move(type);
return ext_type.MakeArray(storage->data());
}
std::unique_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
RETURN_NOT_OK(builder->Resize(0));