Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ struct DictionaryBuilderCase {
return CreateFor<ValueType>();
}

Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); }
Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); }
Status Visit(const BinaryType&) { return CreateFor<BinaryType>(); }
Status Visit(const StringType&) { return CreateFor<StringType>(); }
Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); }

Status Visit(const DataType& value_type) { return NotImplemented(value_type); }
Expand Down
198 changes: 157 additions & 41 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> {
};

// ----------------------------------------------------------------------
// Sequence converter base and CRTP "middle" subclasses
// Sequence converter base

class SeqConverter;

Expand All @@ -414,24 +414,46 @@ class SeqConverter {
// arrow::MakeBuilder which also creates child builders for nested types, so
// we have to pass in the child builders to child SeqConverter in the case of
// converting Python objects to Arrow nested types
virtual Status Init(ArrayBuilder* builder) = 0;
virtual Status Init(ArrayBuilder* builder) {
builder_ = builder;
DCHECK_NE(builder_, nullptr);
return Status::OK();
}

// Append a single null value to the builder
virtual Status AppendNull() = 0;
virtual Status AppendNull() { return this->builder_->AppendNull(); }

// Append a valid value
virtual Status AppendValue(PyObject* seq) = 0;
virtual Status AppendValue(PyObject* obj) = 0;

// Append a single python object handling Null values
virtual Status Append(PyObject* seq) = 0;
virtual Status Append(PyObject* obj) = 0;

// Append the contents of a Python sequence to the underlying builder,
// virtual version
virtual Status Extend(PyObject* seq, int64_t size) = 0;
// Append the contents of a Python sequence to the underlying builder
virtual Status Extend(PyObject* obj, int64_t size) {
/// Ensure we've allocated enough space
RETURN_NOT_OK(builder_->Reserve(size));
// Iterate over the items adding each one
return internal::VisitSequence(
obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
}

// Append the contents of a Python sequence to the underlying builder,
// virtual version
virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0;
// Append the contents of a Python sequence to the underlying builder
virtual Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) {
/// Ensure we've allocated enough space
RETURN_NOT_OK(builder_->Reserve(size));
// Iterate over the items adding each one
return internal::VisitSequenceMasked(
obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) {
if (is_masked) {
return this->AppendNull();
} else {
// This will also apply the null-checking convention in the event
// that the value is not masked
return this->Append(item); // perhaps use AppendValue instead?
}
});
}

virtual Status Close() {
if (chunks_.size() == 0 || builder_->length() > 0) {
Expand Down Expand Up @@ -467,44 +489,16 @@ class TypedConverter : public SeqConverter {
using BuilderType = typename TypeTraits<Type>::BuilderType;

Status Init(ArrayBuilder* builder) override {
builder_ = builder;
DCHECK_NE(builder_, nullptr);
RETURN_NOT_OK(SeqConverter::Init(builder));
typed_builder_ = checked_cast<BuilderType*>(builder);
return Status::OK();
}

// Append a missing item (default implementation)
Status AppendNull() override { return this->typed_builder_->AppendNull(); }

// Append null if the obj is None or pandas null otherwise the valid value
Status Append(PyObject* obj) override {
// Append null if the obj is None or pandas null otherwise the valid value
return NullChecker<null_coding>::Check(obj) ? AppendNull() : AppendValue(obj);
}

Status Extend(PyObject* obj, int64_t size) override {
/// Ensure we've allocated enough space
RETURN_NOT_OK(typed_builder_->Reserve(size));
// Iterate over the items adding each one
return internal::VisitSequence(
obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
}

Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override {
/// Ensure we've allocated enough space
RETURN_NOT_OK(typed_builder_->Reserve(size));
// Iterate over the items adding each one
return internal::VisitSequenceMasked(
obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) {
if (is_masked) {
return this->AppendNull();
} else {
// This will also apply the null-checking convention in the event
// that the value is not masked
return this->Append(item); // perhaps use AppendValue instead?
}
});
}

protected:
BuilderType* typed_builder_;
};
Expand Down Expand Up @@ -903,6 +897,75 @@ class FixedSizeListConverter : public BaseListConverter<FixedSizeListType, null_
int64_t list_size_;
};

// ----------------------------------------------------------------------
// Convert dictionary

template <typename ValueType, NullCoding null_coding>
class DictionaryConverter : public SeqConverter {
public:
using BuilderType = DictionaryBuilder<ValueType>;

Status Init(ArrayBuilder* builder) override {
RETURN_NOT_OK(SeqConverter::Init(builder));
typed_builder_ = checked_cast<BuilderType*>(builder);
return Status::OK();
}

Status Append(PyObject* obj) override {
// Append null if the obj is None or pandas null otherwise the valid value
return NullChecker<null_coding>::Check(obj) ? AppendNull() : AppendValue(obj);
}

protected:
BuilderType* typed_builder_;
};

template <typename ValueType, NullCoding null_coding>
class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType, null_coding> {
public:
Status AppendValue(PyObject* obj) override {
ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter<ValueType>::FromPython(obj));
return this->typed_builder_->Append(value);
}
};

template <typename ValueType, NullCoding null_coding>
class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType, null_coding> {
public:
Status AppendValue(PyObject* obj) override {
ARROW_ASSIGN_OR_RAISE(string_view_, ValueConverter<ValueType>::FromPython(obj));
// DCHECK_GE(string_view_.size, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why?

RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there a TypedBuilder::Append(string_view)? If not, can you please add it?

static_cast<int32_t>(string_view_.size)));
return Status::OK();
}

protected:
// Create a single instance of PyBytesView here to prevent unnecessary object
// creation/destruction
PyBytesView string_view_;
};

template <NullCoding null_coding>
class FixedSizeBinaryDictionaryConverter
: public DictionaryConverter<FixedSizeBinaryType, null_coding> {
public:
explicit FixedSizeBinaryDictionaryConverter(int32_t byte_width)
: byte_width_(byte_width) {}

Status AppendValue(PyObject* obj) override {
ARROW_ASSIGN_OR_RAISE(
string_view_, ValueConverter<FixedSizeBinaryType>::FromPython(obj, byte_width_));
RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: it would be good to have TypedBuilder::Append(string_view).

static_cast<int32_t>(string_view_.size)));
return Status::OK();
}

protected:
int32_t byte_width_;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add const

PyBytesView string_view_;
};

// ----------------------------------------------------------------------
// Convert maps

Expand Down Expand Up @@ -1123,6 +1186,53 @@ class DecimalConverter : public TypedConverter<arrow::Decimal128Type, null_codin
std::shared_ptr<DecimalType> decimal_type_;
};

#define DICTIONARY_PRIMITIVE(TYPE_ENUM, TYPE_CLASS) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please #undef at the end.

case Type::TYPE_ENUM: \
*out = std::unique_ptr<SeqConverter>( \
new PrimitiveDictionaryConverter<TYPE_CLASS, null_coding>); \
break;

#define DICTIONARY_BINARY_LIKE(TYPE_ENUM, TYPE_CLASS) \
case Type::TYPE_ENUM: \
*out = std::unique_ptr<SeqConverter>( \
new BinaryLikeDictionaryConverter<TYPE_CLASS, null_coding>); \
break;

template <NullCoding null_coding>
Status GetDictionaryConverter(const std::shared_ptr<DataType>& type,
std::unique_ptr<SeqConverter>* out) {
const auto& dict_type = checked_cast<const DictionaryType&>(*type);
const auto& value_type = dict_type.value_type();

switch (value_type->id()) {
DICTIONARY_PRIMITIVE(BOOL, BooleanType);
DICTIONARY_PRIMITIVE(INT8, Int8Type);
DICTIONARY_PRIMITIVE(INT16, Int16Type);
DICTIONARY_PRIMITIVE(INT32, Int32Type);
DICTIONARY_PRIMITIVE(INT64, Int64Type);
DICTIONARY_PRIMITIVE(UINT8, UInt8Type);
DICTIONARY_PRIMITIVE(UINT16, UInt16Type);
DICTIONARY_PRIMITIVE(UINT32, UInt32Type);
DICTIONARY_PRIMITIVE(UINT64, UInt64Type);
DICTIONARY_PRIMITIVE(HALF_FLOAT, HalfFloatType);
DICTIONARY_PRIMITIVE(FLOAT, FloatType);
DICTIONARY_PRIMITIVE(DOUBLE, DoubleType);
DICTIONARY_PRIMITIVE(DATE32, Date32Type);
DICTIONARY_PRIMITIVE(DATE64, Date64Type);
DICTIONARY_BINARY_LIKE(BINARY, BinaryType);
DICTIONARY_BINARY_LIKE(STRING, StringType);
case Type::FIXED_SIZE_BINARY:
*out = std::unique_ptr<SeqConverter>(
new FixedSizeBinaryDictionaryConverter<null_coding>(
checked_cast<const FixedSizeBinaryType&>(*value_type).byte_width()));
break;
default:
return Status::NotImplemented("Sequence converter for type ", type->ToString(),
" not implemented");
}
return Status::OK();
}

#define PRIMITIVE(TYPE_ENUM, TYPE) \
case Type::TYPE_ENUM: \
*out = std::unique_ptr<SeqConverter>(new PrimitiveConverter<TYPE, null_coding>); \
Expand Down Expand Up @@ -1279,6 +1389,12 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
from_pandas, strict_conversions, ignore_timezone));
}
return Status::OK();
case Type::DICTIONARY:
if (from_pandas) {
return GetDictionaryConverter<NullCoding::PANDAS_SENTINELS>(type, out);
} else {
return GetDictionaryConverter<NullCoding::NONE_ONLY>(type, out);
}
default:
break;
}
Expand Down
8 changes: 4 additions & 4 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -969,11 +969,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
vector[shared_ptr[CScalar]] value
CResult[shared_ptr[CScalar]] field(CFieldRef ref) const

cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
shared_ptr[CScalar] index
shared_ptr[CArray] dictionary
cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
shared_ptr[CScalar] index
shared_ptr[CArray] dictionary

cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
CDictionaryValue value
CResult[shared_ptr[CScalar]] GetEncodedValue()

Expand Down
41 changes: 41 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,44 @@ cdef class DictionaryScalar(Scalar):
Concrete class for dictionary-encoded scalars.
"""

def __init__(self, index, dictionary, type):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can put this in some from_.. class method? (to keep it consistent with the other scalars that __init__ raises)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(and can you also add some tests for this construction method?)

cdef:
CDictionaryValue value
shared_ptr[CDictionaryScalar] wrapped
DataType type_
Scalar index_
Array dictionary_

type_ = ensure_type(type, allow_none=False)
if not isinstance(type_, DictionaryType):
raise TypeError('Must pass a DictionaryType instance')

if isinstance(index, Scalar):
if not index.type.equals(type.index_type):
raise TypeError("The Scalar value passed as index must have "
"identical type to the dictionary type's "
"index_type")
index_ = index
else:
index_ = scalar(index, type=type_.index_type)

if isinstance(dictionary, Array):
if not dictionary.type.equals(type.value_type):
raise TypeError("The Array passed as dictionary must have "
"identical type to the dictionary type's "
"value_type")
dictionary_ = dictionary
else:
dictionary_ = array(dictionary, type=type_.value_type)

value.index = pyarrow_unwrap_scalar(index_)
value.dictionary = pyarrow_unwrap_array(dictionary_)

wrapped = make_shared[CDictionaryScalar](
value, pyarrow_unwrap_data_type(type_)
)
self.init(<shared_ptr[CScalar]> wrapped)

@property
def index(self):
"""
Expand Down Expand Up @@ -727,6 +765,9 @@ cdef class DictionaryScalar(Scalar):
"please use the `value` property instead", FutureWarning)
return self.value

def __reduce__(self):
return DictionaryScalar, (self.index, self.dictionary, self.type)


cdef class UnionScalar(Scalar):
"""
Expand Down
Loading