Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
459 changes: 248 additions & 211 deletions cpp/src/arrow/array-list-test.cc

Large diffs are not rendered by default.

144 changes: 102 additions & 42 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cstdint>
#include <limits>
#include <sstream>
#include <type_traits>
#include <utility>

#include "arrow/buffer.h"
Expand Down Expand Up @@ -199,34 +200,29 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
: PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {}

// ----------------------------------------------------------------------
// ListArray
// ListArray / LargeListArray

ListArray::ListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
namespace {

ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
auto internal_data =
ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
internal_data->child_data.emplace_back(values->data());
SetData(internal_data);
}
template <typename TYPE>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ListArrayType?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That implies it's an Array type. I can call it TypeClass if you want.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is OK for now.

Status ListArrayFromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out) {
using offset_type = typename TYPE::offset_type;
using ArrayType = typename TypeTraits<TYPE>::ArrayType;
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;

Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out) {
if (offsets.length() == 0) {
return Status::Invalid("List offsets must have non-zero length");
}

if (offsets.type_id() != Type::INT32) {
return Status::TypeError("List offsets must be signed int32");
if (offsets.type_id() != OffsetArrowType::type_id) {
return Status::TypeError("List offsets must be ", OffsetArrowType::type_name());
}

BufferVector buffers = {};

const auto& typed_offsets = checked_cast<const Int32Array&>(offsets);
const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);

const int64_t num_offsets = offsets.length();

Expand All @@ -236,7 +232,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
}

std::shared_ptr<Buffer> clean_offsets, clean_valid_bits;
RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets));
RETURN_NOT_OK(
AllocateBuffer(pool, num_offsets * sizeof(offset_type), &clean_offsets));

// Copy valid bits, zero out the bit for the final offset
// XXX why?
Expand All @@ -245,11 +242,12 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets);
buffers.emplace_back(std::move(clean_valid_bits));

const int32_t* raw_offsets = typed_offsets.raw_values();
auto clean_raw_offsets = reinterpret_cast<int32_t*>(clean_offsets->mutable_data());
const offset_type* raw_offsets = typed_offsets.raw_values();
auto clean_raw_offsets =
reinterpret_cast<offset_type*>(clean_offsets->mutable_data());

// Must work backwards so we can tell how many values were in the last non-null value
int32_t current_offset = raw_offsets[num_offsets - 1];
offset_type current_offset = raw_offsets[num_offsets - 1];
for (int64_t i = num_offsets - 1; i >= 0; --i) {
if (offsets.IsValid(i)) {
current_offset = raw_offsets[i];
Expand All @@ -263,37 +261,88 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
buffers.emplace_back(typed_offsets.values());
}

auto list_type = list(values.type());
auto list_type = std::make_shared<TYPE>(values.type());
auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers),
offsets.null_count(), offsets.offset());
internal_data->child_data.push_back(values.data());

*out = std::make_shared<ListArray>(internal_data);
*out = std::make_shared<ArrayType>(internal_data);
return Status::OK();
}

} // namespace

// Build a ListArray view over already-assembled ArrayData; all checking and
// member initialisation is delegated to SetData().
ListArray::ListArray(const std::shared_ptr<ArrayData>& data) {
  SetData(data);
}

// Build a LargeListArray view over already-assembled ArrayData; all checking
// and member initialisation is delegated to SetData().
LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) {
  SetData(data);
}

// Assemble a ListArray from its individual pieces.
//
// The null bitmap and the offsets buffer become buffers 0 and 1 of the new
// ArrayData, and the data of `values` becomes its only child. `type` must have
// the LIST type id; the check aborts otherwise.
ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Buffer>& value_offsets,
                     const std::shared_ptr<Array>& values,
                     const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
                     int64_t offset) {
  ARROW_CHECK_EQ(type->id(), Type::LIST);

  // Buffer order matters: validity bitmap first, offsets second.
  auto list_data =
      ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
  list_data->child_data.push_back(values->data());

  SetData(list_data);
}

// Assemble a LargeListArray from its individual pieces.
//
// The null bitmap and the offsets buffer become buffers 0 and 1 of the new
// ArrayData, and the data of `values` becomes its only child. `type` must have
// the LARGE_LIST type id; the check aborts otherwise.
LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                               const std::shared_ptr<Buffer>& value_offsets,
                               const std::shared_ptr<Array>& values,
                               const std::shared_ptr<Buffer>& null_bitmap,
                               int64_t null_count, int64_t offset) {
  ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST);

  // Buffer order matters: validity bitmap first, offsets second.
  auto list_data =
      ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
  list_data->child_data.push_back(values->data());

  SetData(list_data);
}

void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
ARROW_CHECK_EQ(data->buffers.size(), 2);
ARROW_CHECK(data->type->id() == Type::LIST);
ARROW_CHECK_EQ(data->type->id(), Type::LIST);
list_type_ = checked_cast<const ListType*>(data->type.get());

auto value_offsets = data->buffers[1];
raw_value_offsets_ = value_offsets == nullptr
? nullptr
: reinterpret_cast<const int32_t*>(value_offsets->data());
: reinterpret_cast<const offset_type*>(value_offsets->data());

ARROW_CHECK_EQ(data_->child_data.size(), 1);
ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id());
DCHECK(list_type_->value_type()->Equals(data->child_data[0]->type));
values_ = MakeArray(data_->child_data[0]);
}

std::shared_ptr<DataType> ListArray::value_type() const {
return list_type()->value_type();
void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
ARROW_CHECK_EQ(data->buffers.size(), 2);
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_LIST);
list_type_ = checked_cast<const LargeListType*>(data->type.get());

auto value_offsets = data->buffers[1];
raw_value_offsets_ = value_offsets == nullptr
? nullptr
: reinterpret_cast<const offset_type*>(value_offsets->data());

ARROW_CHECK_EQ(data_->child_data.size(), 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to check someplace that the offset buffer has the expected size?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's done in ValidateArray (well, it checks it has at least the required size)

ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id());
DCHECK(list_type_->value_type()->Equals(data->child_data[0]->type));
values_ = MakeArray(data_->child_data[0]);
}

std::shared_ptr<Array> ListArray::values() const { return values_; }
// Public entry point for building a ListArray from an offsets array and a
// child values array. The work is done by the shared ListArrayFromArrays
// template, instantiated here for ListType.
Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
                             std::shared_ptr<Array>* out) {
  Status status = ListArrayFromArrays<ListType>(offsets, values, pool, out);
  return status;
}

// Public entry point for building a LargeListArray from an offsets array and a
// child values array. The work is done by the shared ListArrayFromArrays
// template, instantiated here for LargeListType.
Status LargeListArray::FromArrays(const Array& offsets, const Array& values,
                                  MemoryPool* pool, std::shared_ptr<Array>* out) {
  Status status = ListArrayFromArrays<LargeListType>(offsets, values, pool, out);
  return status;
}

// ----------------------------------------------------------------------
// MapArray
Expand Down Expand Up @@ -1167,21 +1216,12 @@ struct ValidateVisitor {
}

Status Visit(const ListArray& array) {
if (!array.values()) {
return Status::Invalid("values was null");
}

const int32_t last_offset = array.value_offset(array.length());
if (array.values()->length() != last_offset) {
return Status::Invalid("Final offset invariant not equal to values length: ",
last_offset, "!=", array.values()->length());
}

const Status child_valid = ValidateArray(*array.values());
if (!child_valid.ok()) {
return Status::Invalid("Child array invalid: ", child_valid.ToString());
}
RETURN_NOT_OK(ValidateListArray(array));
return ValidateOffsets(array);
}

// Validate a LargeListArray: child array presence, final-offset invariant,
// child validity and the offsets themselves.
//
// Returns the first failing Status, or the result of the offsets check.
Status Visit(const LargeListArray& array) {
  // ValidateListArray() already finishes by running ValidateOffsets(array),
  // so calling ValidateOffsets() again here would only repeat the same check.
  return ValidateListArray(array);
}

Expand Down Expand Up @@ -1280,6 +1320,26 @@ struct ValidateVisitor {
}

protected:
// Shared validation for list-like arrays.
//
// Checks, in order: the child values array exists; the offset stored at index
// `length()` equals the child's length; the child array itself validates; and
// finally the offsets pass ValidateOffsets(). The first failure is returned.
template <typename ListArrayType>
Status ValidateListArray(const ListArrayType& array) {
  const auto child = array.values();
  if (!child) {
    return Status::Invalid("values was null");
  }

  // The offset one past the last list must land exactly at the end of the child.
  const auto final_offset = array.value_offset(array.length());
  const auto child_length = child->length();
  if (child_length != final_offset) {
    return Status::Invalid("Final offset invariant not equal to values length: ",
                           final_offset, "!=", child_length);
  }

  // Recurse into the child and wrap any failure with context.
  const Status child_status = ValidateArray(*child);
  if (!child_status.ok()) {
    return Status::Invalid("Child array invalid: ", child_status.ToString());
  }

  return ValidateOffsets(array);
}

template <typename ArrayType>
Status ValidateOffsets(ArrayType& array) {
using offset_type = typename ArrayType::offset_type;
Expand Down
103 changes: 71 additions & 32 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -488,12 +488,49 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
// ----------------------------------------------------------------------
// ListArray

/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public Array {
/// Base class for variable-sized list arrays, regardless of offset size.
template <typename TYPE>
class BaseListArray : public Array {
public:
using TypeClass = ListType;
using offset_type = ListType::offset_type;
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;

const TypeClass* list_type() const { return list_type_; }

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const { return values_; }

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

/// Return pointer to raw value offsets accounting for any slice offset
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}

// The following functions do not perform bounds checking
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
std::shared_ptr<Array> value_slice(int64_t i) const {
return values_->Slice(value_offset(i), value_length(i));
}

protected:
const TypeClass* list_type_ = NULLPTR;
std::shared_ptr<Array> values_;
const offset_type* raw_value_offsets_ = NULLPTR;
};

/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
public:
explicit ListArray(const std::shared_ptr<ArrayData>& data);

ListArray(const std::shared_ptr<DataType>& type, int64_t length,
Expand All @@ -511,46 +548,48 @@ class ARROW_EXPORT ListArray : public Array {
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
/// \param[in] values Array containing
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[out] out Will have length equal to offsets.length() - 1
static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out);

const ListType* list_type() const { return list_type_; }

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

std::shared_ptr<DataType> value_type() const;

/// Return pointer to raw value offsets accounting for any slice offset
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }

// The following functions do not perform bounds checking
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
int32_t value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
std::shared_ptr<Array> value_slice(int64_t i) const {
return values_->Slice(value_offset(i), value_length(i));
}

protected:
// This constructor defers SetData to a derived array class
ListArray() = default;
void SetData(const std::shared_ptr<ArrayData>& data);
};

const int32_t* raw_value_offsets_ = NULLPTR;
/// Concrete Array class for large list data (with 64-bit offsets)
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
public:
explicit LargeListArray(const std::shared_ptr<ArrayData>& data);

private:
const ListType* list_type_ = NULLPTR;
std::shared_ptr<Array> values_;
LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Array>& values,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

/// \brief Construct LargeListArray from array of offsets and child value array
///
/// This function does the bare minimum of validation of the offsets and
/// input types, and will allocate a new offsets array if necessary (i.e. if
/// the offsets contain any nulls). If the offsets do not have nulls, they
/// are assumed to be well-formed
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int64 type
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[out] out Will have length equal to offsets.length() - 1
static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out);

protected:
void SetData(const std::shared_ptr<ArrayData>& data);
};

// ----------------------------------------------------------------------
Expand Down
Loading