Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,62 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
SetData(internal_data);
}

// Builds a dense UnionArray from an int8 type_ids array, an int32
// value_offsets array and the child arrays, storing the result in *out.
//
// Returns Status::Invalid when value_offsets is empty, is not INT32 or
// contains nulls, or when type_ids is not INT8; returns Status::OK otherwise.
// The offset values themselves are not inspected here.
//
// NOTE(review): type_ids.length() is never compared with
// value_offsets.length(); the resulting array takes its length, null count and
// offset from type_ids alone -- confirm callers guarantee matching lengths.
Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Array>* out) {
if (value_offsets.length() == 0) {
return Status::Invalid("UnionArray offsets must have non-zero length");
}

if (value_offsets.type_id() != Type::INT32) {
return Status::Invalid("UnionArray offsets must be signed int32");
}

if (type_ids.type_id() != Type::INT8) {
return Status::Invalid("UnionArray type_ids must be signed int8");
}

if (value_offsets.null_count() != 0) {
return Status::Invalid("MakeDense does not allow NAs in value_offsets");
}

// Buffer order: validity bitmap (shared with type_ids), type ids, value offsets.
// NOTE(review): type_ids was checked to be INT8 above but is cast to
// UInt8Array here; only values() is read through the cast reference.
BufferVector buffers = {type_ids.null_bitmap(),
static_cast<const UInt8Array&>(type_ids).values(),
static_cast<const Int32Array&>(value_offsets).values()};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May also want to assert that value_offsets has 0 null count

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

auto union_type = union_(children, UnionMode::DENSE);
auto internal_data =
std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
type_ids.null_count(), type_ids.offset());
// Attach each child's ArrayData in the same order as `children`.
for (const auto& child : children) {
internal_data->child_data.push_back(child->data());
}
*out = std::make_shared<UnionArray>(internal_data);
return Status::OK();
}

// Builds a sparse UnionArray from an int8 type_ids array and the child arrays,
// storing the result in *out.
//
// Returns Status::Invalid when type_ids is not INT8 or when any child's length
// differs from type_ids.length(); returns Status::OK otherwise. The resulting
// array takes its length, null bitmap, null count and offset from type_ids.
Status UnionArray::MakeSparse(const Array& type_ids,
                              const std::vector<std::shared_ptr<Array>>& children,
                              std::shared_ptr<Array>* out) {
  if (type_ids.type_id() != Type::INT8) {
    return Status::Invalid("UnionArray type_ids must be signed int8");
  }

  // Validate every child up front so nothing is built for invalid input.
  for (const auto& child : children) {
    if (child->length() != type_ids.length()) {
      return Status::Invalid(
          "Sparse UnionArray must have len(child) == len(type_ids) for all children");
    }
  }

  // Buffer order: validity bitmap, type ids, (absent) value offsets.
  // The type-id buffer is read directly from the array data: type_ids is an
  // INT8 array, so down-casting it to UInt8Array is not a valid cast.
  BufferVector buffers = {type_ids.null_bitmap(), type_ids.data()->buffers[1], nullptr};

  auto union_type = union_(children, UnionMode::SPARSE);
  auto internal_data =
      std::make_shared<ArrayData>(union_type, type_ids.length(), std::move(buffers),
                                  type_ids.null_count(), type_ids.offset());

  // Attach each child's ArrayData in the same order as `children`.
  internal_data->child_data.reserve(children.size());
  for (const auto& child : children) {
    internal_data->child_data.push_back(child->data());
  }

  *out = std::make_shared<UnionArray>(internal_data);
  return Status::OK();
}

std::shared_ptr<Array> UnionArray::child(int i) const {
if (!boxed_fields_[i]) {
boxed_fields_[i] = MakeArray(data_->child_data[i]);
Expand Down
33 changes: 32 additions & 1 deletion cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,16 +612,47 @@ class ARROW_EXPORT UnionArray : public Array {
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR, int64_t null_count = 0,
int64_t offset = 0);

/// \brief Construct Dense UnionArray from type_ids, value_offsets and children
///
/// This function does the bare minimum of validation of the offsets and
/// input types. The value_offsets are assumed to be well-formed.
///
/// \param[in] type_ids An array of 8-bit signed integers, enumerated from
/// 0 corresponding to each type.
/// \param[in] value_offsets An array of signed int32 values indicating the
/// relative offset into the respective child array for the type in a given slot.
/// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[out] out Will have length equal to value_offsets.length()
static Status MakeDense(const Array& type_ids, const Array& value_offsets,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Array>* out);

/// \brief Construct Sparse UnionArray from type_ids and children
///
/// This function does the bare minimum of validation of the input types
/// and child lengths.
///
/// \param[in] type_ids An array of 8-bit signed integers, enumerated from
/// 0 corresponding to each type.
/// \param[in] children Vector of children Arrays containing the data for each type.
/// \param[out] out Will have length equal to type_ids.length()
static Status MakeSparse(const Array& type_ids,
const std::vector<std::shared_ptr<Array>>& children,
std::shared_ptr<Array>* out);

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; }

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }

int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }

const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }

UnionMode mode() const { return static_cast<const UnionType&>(*type()).mode(); }
UnionMode::type mode() const { return static_cast<const UnionType&>(*type()).mode(); }

std::shared_ptr<Array> child(int pos) const;

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class RangeEqualsVisitor {
bool CompareUnions(const UnionArray& left) const {
const auto& right = static_cast<const UnionArray&>(right_);

const UnionMode union_mode = left.mode();
const UnionMode::type union_mode = left.mode();
if (union_mode != right.mode()) {
return false;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/ipc/json-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,7 @@ static Status GetUnion(const RjObject& json_type,
RETURN_NOT_STRING("mode", it_mode, json_type);

std::string mode_str = it_mode->value.GetString();
UnionMode mode;
UnionMode::type mode;

if (mode_str == "SPARSE") {
mode = UnionMode::SPARSE;
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/ipc/metadata-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ static Status StructToFlatbuffer(FBB& fbb, const DataType& type,
static Status UnionFromFlatbuffer(const flatbuf::Union* union_data,
const std::vector<std::shared_ptr<Field>>& children,
std::shared_ptr<DataType>* out) {
UnionMode mode = union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
: UnionMode::DENSE;
UnionMode::type mode =
(union_data->mode() == flatbuf::UnionMode_Sparse ? UnionMode::SPARSE
: UnionMode::DENSE);

std::vector<uint8_t> type_codes;

Expand Down
18 changes: 16 additions & 2 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ std::string TimestampType::ToString() const {
// Union type

UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<uint8_t>& type_codes, UnionMode mode)
const std::vector<uint8_t>& type_codes, UnionMode::type mode)
: NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
children_ = fields;
}
Expand Down Expand Up @@ -440,10 +440,24 @@ std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fie
}

std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
const std::vector<uint8_t>& type_codes, UnionMode mode) {
const std::vector<uint8_t>& type_codes,
UnionMode::type mode) {
return std::make_shared<UnionType>(child_fields, type_codes, mode);
}

std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
UnionMode::type mode) {
std::vector<std::shared_ptr<Field>> types;
std::vector<uint8_t> type_codes;
uint8_t counter = 0;
for (const auto& child : children) {
types.push_back(field(std::to_string(counter), child->type()));
type_codes.push_back(counter);
counter++;
}
return union_(types, type_codes, mode);
}

std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
const std::shared_ptr<Array>& dict_values,
bool ordered) {
Expand Down
18 changes: 13 additions & 5 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -517,14 +517,17 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
int32_t scale_;
};

enum class UnionMode : char { SPARSE, DENSE };
struct UnionMode {
enum type { SPARSE, DENSE };
};

class ARROW_EXPORT UnionType : public NestedType {
public:
static constexpr Type::type type_id = Type::UNION;

UnionType(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
const std::vector<uint8_t>& type_codes,
UnionMode::type mode = UnionMode::SPARSE);

std::string ToString() const override;
std::string name() const override { return "union"; }
Expand All @@ -534,10 +537,10 @@ class ARROW_EXPORT UnionType : public NestedType {

const std::vector<uint8_t>& type_codes() const { return type_codes_; }

UnionMode mode() const { return mode_; }
UnionMode::type mode() const { return mode_; }

private:
UnionMode mode_;
UnionMode::type mode_;

// The type id used in the data to indicate each data type in the union. For
// example, the first type in the union might be denoted by the id 5 (instead
Expand Down Expand Up @@ -842,7 +845,12 @@ struct_(const std::vector<std::shared_ptr<Field>>& fields);
/// \brief Create an instance of Union type
std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Field>>& child_fields,
const std::vector<uint8_t>& type_codes, UnionMode mode = UnionMode::SPARSE);
const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);

/// \brief Create an instance of Union type
std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Array>>& children,
UnionMode::type mode = UnionMode::SPARSE);

/// \brief Create an instance of Dictionary type
std::shared_ptr<DataType> ARROW_EXPORT
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
time32, time64, timestamp, date32, date64,
float16, float32, float64,
binary, string, decimal,
list_, struct, dictionary, field,
list_, struct, union, dictionary, field,
type_for_alias,
DataType, NAType,
Field,
Expand All @@ -52,7 +52,7 @@
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
ListArray,
ListArray, UnionArray,
BinaryArray, StringArray,
FixedSizeBinaryArray,
DictionaryArray,
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
int64_t num_values() const
shared_ptr[ColumnPath] path_in_schema() const
bint is_stats_set() const
shared_ptr[CRowGroupStatistics] statistics() const;
shared_ptr[CRowGroupStatistics] statistics() const
ParquetCompression compression() const
const vector[ParquetEncoding]& encodings() const

Expand Down
53 changes: 53 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,58 @@ cdef class ListArray(Array):
return pyarrow_wrap_array(out)


cdef class UnionArray(Array):
    """
    Array wrapper for Arrow union arrays.

    Instances are built with the ``from_dense`` and ``from_sparse`` static
    constructors.
    """

    @staticmethod
    def from_dense(Array types not None, Array value_offsets not None,
                   list children):
        """
        Construct dense UnionArray from arrays of int8 types, int32 offsets and
        children arrays

        Parameters
        ----------
        types : Array (int8 type)
            Must not be None.
        value_offsets : Array (int32 type)
            Must not be None.
        children : list
            Child arrays; entries must be Array instances, not None.

        Returns
        -------
        union_array : UnionArray

        Notes
        -----
        Errors reported by the C++ factory are surfaced through
        ``check_status``.
        """
        cdef shared_ptr[CArray] out
        cdef vector[shared_ptr[CArray]] c
        cdef Array child
        for child in children:
            # A typed loop variable still admits None; reading sp_array from
            # None would access invalid memory, so reject it explicitly.
            if child is None:
                raise TypeError("children must not contain None")
            c.push_back(child.sp_array)
        with nogil:
            check_status(CUnionArray.MakeDense(
                deref(types.ap), deref(value_offsets.ap), c, &out))
        return pyarrow_wrap_array(out)

    @staticmethod
    def from_sparse(Array types not None, list children):
        """
        Construct sparse UnionArray from arrays of int8 types and children
        arrays

        Parameters
        ----------
        types : Array (int8 type)
            Must not be None.
        children : list
            Child arrays; entries must be Array instances, not None.

        Returns
        -------
        union_array : UnionArray

        Notes
        -----
        Errors reported by the C++ factory are surfaced through
        ``check_status``.
        """
        cdef shared_ptr[CArray] out
        cdef vector[shared_ptr[CArray]] c
        cdef Array child
        for child in children:
            # A typed loop variable still admits None; reading sp_array from
            # None would access invalid memory, so reject it explicitly.
            if child is None:
                raise TypeError("children must not contain None")
            c.push_back(child.sp_array)
        with nogil:
            check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out))
        return pyarrow_wrap_array(out)


cdef class StringArray(Array):
pass

Expand Down Expand Up @@ -789,6 +841,7 @@ cdef dict _array_classes = {
_Type_FLOAT: FloatArray,
_Type_DOUBLE: DoubleArray,
_Type_LIST: ListArray,
_Type_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
_Type_DICTIONARY: DictionaryArray,
Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_DICTIONARY" arrow::Type::DICTIONARY"
_Type_MAP" arrow::Type::MAP"

enum UnionMode" arrow::UnionMode::type":
_UnionMode_SPARSE" arrow::UnionMode::SPARSE"
_UnionMode_DENSE" arrow::UnionMode::DENSE"

enum TimeUnit" arrow::TimeUnit::type":
TimeUnit_SECOND" arrow::TimeUnit::SECOND"
TimeUnit_MILLI" arrow::TimeUnit::MILLI"
Expand Down Expand Up @@ -222,6 +226,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CStructType" arrow::StructType"(CDataType):
CStructType(const vector[shared_ptr[CField]]& fields)

cdef cppclass CUnionType" arrow::UnionType"(CDataType):
CUnionType(const vector[shared_ptr[CField]]& fields,
const vector[uint8_t]& type_codes, UnionMode mode)
UnionMode mode()

cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
CSchema(const vector[shared_ptr[CField]]& fields,
Expand Down Expand Up @@ -317,6 +326,22 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] values()
shared_ptr[CDataType] value_type()

# Cython declaration of the C++ class arrow::UnionArray: the two static
# factories plus a few accessors.
cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
# Sparse factory: type_ids and children in, resulting array written to *out.
@staticmethod
CStatus MakeSparse(const CArray& type_ids,
const vector[shared_ptr[CArray]]& children,
shared_ptr[CArray]* out)

# Dense factory: additionally takes the value_offsets array.
@staticmethod
CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets,
const vector[shared_ptr[CArray]]& children,
shared_ptr[CArray]* out)
# NOTE(review): the C++ header declares `const type_id_t* raw_type_ids() const`;
# the pointer is declared here as non-const uint8_t* -- confirm this is intended.
uint8_t* raw_type_ids()
# NOTE(review): the C++ header declares `value_offset(int64_t i)`; the index
# is declared as plain int here -- confirm.
int32_t value_offset(int i)
shared_ptr[CArray] child(int pos)
# NOTE(review): UnsafeChild is not visible in the C++ header hunk of this
# change -- verify it exists on arrow::UnionArray.
const CArray* UnsafeChild(int pos)
UnionMode mode()

cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray):
const uint8_t* GetValue(int i, int32_t* length)

Expand Down
Loading