Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
711 changes: 262 additions & 449 deletions cpp/src/arrow/array-binary-test.cc

Large diffs are not rendered by default.

133 changes: 62 additions & 71 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -386,31 +386,26 @@ BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}

void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->buffers.size(), 3);
auto value_offsets = data->buffers[1];
auto value_data = data->buffers[2];
this->Array::SetData(data);
raw_data_ = value_data == nullptr ? nullptr : value_data->data();
raw_value_offsets_ = value_offsets == nullptr
? nullptr
: reinterpret_cast<const int32_t*>(value_offsets->data());
}

BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset)
: BinaryArray(binary(), length, value_offsets, data, null_bitmap, null_count,
offset) {}

BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
SetData(ArrayData::Make(type, length, {null_bitmap, value_offsets, data}, null_count,
offset));
SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data},
null_count, offset));
}

// Construct a LargeBinaryArray over already-assembled ArrayData.
// The ARROW_CHECK_EQ verifies that the data's type id is LARGE_BINARY before
// the buffers are adopted through SetData().
LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_BINARY);
SetData(data);
}

// Construct a LargeBinaryArray from its individual buffers.
// The buffers are packed in the order {null_bitmap, value_offsets, data} and
// tagged with the large_binary() type before being handed to SetData().
LargeBinaryArray::LargeBinaryArray(int64_t length,
                                   const std::shared_ptr<Buffer>& value_offsets,
                                   const std::shared_ptr<Buffer>& data,
                                   const std::shared_ptr<Buffer>& null_bitmap,
                                   int64_t null_count, int64_t offset) {
  auto array_data = ArrayData::Make(large_binary(), length,
                                    {null_bitmap, value_offsets, data}, null_count,
                                    offset);
  SetData(array_data);
}

StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
Expand All @@ -421,8 +416,24 @@ StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset)
: BinaryArray(utf8(), length, value_offsets, data, null_bitmap, null_count, offset) {}
int64_t offset) {
SetData(ArrayData::Make(utf8(), length, {null_bitmap, value_offsets, data}, null_count,
offset));
}

// Construct a LargeStringArray over already-assembled ArrayData.
// The ARROW_CHECK_EQ verifies that the data's type id is LARGE_STRING before
// the buffers are adopted through SetData().
LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
SetData(data);
}

// Construct a LargeStringArray from its individual buffers.
// The buffers are packed in the order {null_bitmap, value_offsets, data} and
// tagged with the large_utf8() type before being handed to SetData().
LargeStringArray::LargeStringArray(int64_t length,
                                   const std::shared_ptr<Buffer>& value_offsets,
                                   const std::shared_ptr<Buffer>& data,
                                   const std::shared_ptr<Buffer>& null_bitmap,
                                   int64_t null_count, int64_t offset) {
  auto array_data = ArrayData::Make(large_utf8(), length,
                                    {null_bitmap, value_offsets, data}, null_count,
                                    offset);
  SetData(array_data);
}

// ----------------------------------------------------------------------
// Fixed width binary
Expand Down Expand Up @@ -1148,20 +1159,14 @@ struct ValidateVisitor {
return ValidateOffsets(array);
}

Status Visit(const ListArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}

auto value_offsets = array.value_offsets();
if (array.length() && !value_offsets) {
return Status::Invalid("value_offsets_ was null");
}
if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
" isn't large enough for length: ", array.length());
// Validate a large binary array: it must carry exactly three buffers, after
// which its offsets are checked by ValidateOffsets().
Status Visit(const LargeBinaryArray& array) {
  const bool has_three_buffers = array.data()->buffers.size() == 3;
  if (!has_three_buffers) {
    return Status::Invalid("number of buffers was != 3");
  }
  return ValidateOffsets(array);
}

Status Visit(const ListArray& array) {
if (!array.values()) {
return Status::Invalid("values was null");
}
Expand All @@ -1181,19 +1186,6 @@ struct ValidateVisitor {
}

Status Visit(const MapArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}

auto value_offsets = array.value_offsets();
if (array.length() && !value_offsets) {
return Status::Invalid("value_offsets_ was null");
}
if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
" isn't large enough for length: ", array.length());
}

if (!array.keys()) {
return Status::Invalid("keys was null");
}
Expand Down Expand Up @@ -1224,9 +1216,6 @@ struct ValidateVisitor {
}

Status Visit(const FixedSizeListArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}
if (!array.values()) {
return Status::Invalid("values was null");
}
Expand All @@ -1240,14 +1229,6 @@ struct ValidateVisitor {
}

Status Visit(const StructArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}

if (array.null_count() > array.length()) {
return Status::Invalid("Null count exceeds the length of this struct");
}

if (array.num_fields() > 0) {
// Validate fields
int64_t array_length = array.field(0)->length();
Expand All @@ -1274,16 +1255,7 @@ struct ValidateVisitor {
return Status::OK();
}

Status Visit(const UnionArray& array) {
if (array.length() < 0) {
return Status::Invalid("Length was negative");
}

if (array.null_count() > array.length()) {
return Status::Invalid("Null count exceeds the length of this struct");
}
return Status::OK();
}
// Union arrays get no type-specific validation in this visitor; the overload
// unconditionally reports success.
Status Visit(const UnionArray& array) {
  return Status::OK();
}

Status Visit(const DictionaryArray& array) {
Type::type index_type_id = array.indices()->type()->id();
Expand All @@ -1310,12 +1282,23 @@ struct ValidateVisitor {
protected:
template <typename ArrayType>
Status ValidateOffsets(ArrayType& array) {
int32_t prev_offset = array.value_offset(0);
using offset_type = typename ArrayType::offset_type;

auto value_offsets = array.value_offsets();
if (array.length() && !value_offsets) {
return Status::Invalid("value_offsets_ was null");
}
if (value_offsets->size() / static_cast<int>(sizeof(offset_type)) < array.length()) {
return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(),
" isn't large enough for length: ", array.length());
}

auto prev_offset = array.value_offset(0);
if (array.offset() == 0 && prev_offset != 0) {
return Status::Invalid("The first offset wasn't zero");
}
for (int64_t i = 1; i <= array.length(); ++i) {
int32_t current_offset = array.value_offset(i);
auto current_offset = array.value_offset(i);
if (array.IsNull(i - 1) && current_offset != prev_offset) {
return Status::Invalid("Offset invariant failure at: ", i,
" inconsistent value_offsets for null slot",
Expand All @@ -1340,6 +1323,14 @@ Status ValidateArray(const Array& array) {
const auto layout = type.layout();
const ArrayData& data = *array.data();

if (array.length() < 0) {
return Status::Invalid("Array length is negative");
}

if (array.null_count() > array.length()) {
return Status::Invalid("Null count exceeds array length");
}

if (data.buffers.size() != layout.bit_widths.size()) {
return Status::Invalid("Expected ", layout.bit_widths.size(),
" buffers in array "
Expand Down
94 changes: 70 additions & 24 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
class ARROW_EXPORT ListArray : public Array {
public:
using TypeClass = ListType;
using offset_type = ListType::offset_type;

explicit ListArray(const std::shared_ptr<ArrayData>& data);

Expand Down Expand Up @@ -635,24 +636,20 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
// ----------------------------------------------------------------------
// Binary and String

/// Concrete Array class for variable-size binary data
class ARROW_EXPORT BinaryArray : public FlatArray {
/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
public:
using TypeClass = BinaryType;

explicit BinaryArray(const std::shared_ptr<ArrayData>& data);

BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;

/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
const uint8_t* GetValue(int64_t i, int32_t* out_length) const {
const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
// Account for base offset
i += data_->offset;
const int32_t pos = raw_value_offsets_[i];
const offset_type pos = raw_value_offsets_[i];
*out_length = raw_value_offsets_[i + 1] - pos;
return raw_data_ + pos;
}
Expand All @@ -664,7 +661,7 @@ class ARROW_EXPORT BinaryArray : public FlatArray {
util::string_view GetView(int64_t i) const {
// Account for base offset
i += data_->offset;
const int32_t pos = raw_value_offsets_[i];
const offset_type pos = raw_value_offsets_[i];
return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
raw_value_offsets_[i + 1] - pos);
}
Expand All @@ -681,31 +678,52 @@ class ARROW_EXPORT BinaryArray : public FlatArray {
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }

const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
const offset_type* raw_value_offsets() const {
return raw_value_offsets_ + data_->offset;
}

// Neither of these functions will perform boundschecking
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
int32_t value_length(int64_t i) const {
offset_type value_offset(int64_t i) const {
return raw_value_offsets_[i + data_->offset];
}
offset_type value_length(int64_t i) const {
i += data_->offset;
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}

protected:
// For subclasses
BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {}
BaseBinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {}

/// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data);
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
auto value_offsets = data->buffers[1];
auto value_data = data->buffers[2];
this->Array::SetData(data);
raw_data_ = value_data == NULLPTR ? NULLPTR : value_data->data();
raw_value_offsets_ =
value_offsets == NULLPTR
? NULLPTR
: reinterpret_cast<const offset_type*>(value_offsets->data());
}

// Constructor to allow sub-classes/builders to substitute their own logical type
BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const offset_type* raw_value_offsets_;
const uint8_t* raw_data_;
};

/// Concrete Array class for variable-size binary data
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
public:
explicit BinaryArray(const std::shared_ptr<ArrayData>& data);

BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

const int32_t* raw_value_offsets_;
const uint8_t* raw_data_;
protected:
// For subclasses such as StringArray
BinaryArray() : BaseBinaryArray() {}
};

/// Concrete Array class for variable-size string (utf-8) data
Expand All @@ -721,6 +739,34 @@ class ARROW_EXPORT StringArray : public BinaryArray {
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
};

/// Concrete Array class for large variable-size binary data
///
/// Derives from BaseBinaryArray<LargeBinaryType>, so offsets use
/// LargeBinaryType::offset_type (presumably a 64-bit integer -- confirm
/// against the LargeBinaryType definition).
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
public:
/// Construct from existing ArrayData
explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);

/// Construct from individual buffers
///
/// \param[in] length the array length passed on to the underlying ArrayData
/// \param[in] value_offsets buffer holding the value offsets
/// \param[in] data buffer holding the concatenated value bytes
/// \param[in] null_bitmap optional null bitmap buffer (defaults to NULLPTR)
/// \param[in] null_count number of nulls (defaults to kUnknownNullCount)
/// \param[in] offset slice offset into the buffers (defaults to 0)
LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

protected:
// For subclasses such as LargeStringArray, which call SetData() themselves
// from their own constructors
LargeBinaryArray() : BaseBinaryArray() {}
};

/// Concrete Array class for large variable-size string (utf-8) data
///
/// Shares the LargeBinaryArray implementation and only overrides TypeClass
/// so that the array is associated with LargeStringType.
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
public:
using TypeClass = LargeStringType;

/// Construct from existing ArrayData
explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);

/// Construct from individual buffers
///
/// \param[in] length the array length passed on to the underlying ArrayData
/// \param[in] value_offsets buffer holding the value offsets
/// \param[in] data buffer holding the concatenated string bytes
/// \param[in] null_bitmap optional null bitmap buffer (defaults to NULLPTR)
/// \param[in] null_count number of nulls (defaults to kUnknownNullCount)
/// \param[in] offset slice offset into the buffers (defaults to 0)
LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
};

// ----------------------------------------------------------------------
// Fixed width binary

Expand Down
Loading