-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-9369: [Python] Support conversion from python sequence to dictionary type #8008
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -395,7 +395,7 @@ struct ValueConverter<Type, enable_if_fixed_size_binary<Type>> { | |
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // Sequence converter base and CRTP "middle" subclasses | ||
| // Sequence converter base | ||
|
|
||
| class SeqConverter; | ||
|
|
||
|
|
@@ -414,24 +414,46 @@ class SeqConverter { | |
| // arrow::MakeBuilder which also creates child builders for nested types, so | ||
| // we have to pass in the child builders to child SeqConverter in the case of | ||
| // converting Python objects to Arrow nested types | ||
| virtual Status Init(ArrayBuilder* builder) = 0; | ||
| virtual Status Init(ArrayBuilder* builder) { | ||
| builder_ = builder; | ||
| DCHECK_NE(builder_, nullptr); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| // Append a single null value to the builder | ||
| virtual Status AppendNull() = 0; | ||
| virtual Status AppendNull() { return this->builder_->AppendNull(); } | ||
|
|
||
| // Append a valid value | ||
| virtual Status AppendValue(PyObject* seq) = 0; | ||
| virtual Status AppendValue(PyObject* obj) = 0; | ||
|
|
||
| // Append a single python object handling Null values | ||
| virtual Status Append(PyObject* seq) = 0; | ||
| virtual Status Append(PyObject* obj) = 0; | ||
|
|
||
| // Append the contents of a Python sequence to the underlying builder, | ||
| // virtual version | ||
| virtual Status Extend(PyObject* seq, int64_t size) = 0; | ||
| // Append the contents of a Python sequence to the underlying builder | ||
| virtual Status Extend(PyObject* obj, int64_t size) { | ||
| /// Ensure we've allocated enough space | ||
| RETURN_NOT_OK(builder_->Reserve(size)); | ||
| // Iterate over the items adding each one | ||
| return internal::VisitSequence( | ||
| obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); | ||
| } | ||
|
|
||
| // Append the contents of a Python sequence to the underlying builder, | ||
| // virtual version | ||
| virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; | ||
| // Append the contents of a Python sequence to the underlying builder | ||
| virtual Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) { | ||
| /// Ensure we've allocated enough space | ||
| RETURN_NOT_OK(builder_->Reserve(size)); | ||
| // Iterate over the items adding each one | ||
| return internal::VisitSequenceMasked( | ||
| obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { | ||
| if (is_masked) { | ||
| return this->AppendNull(); | ||
| } else { | ||
| // This will also apply the null-checking convention in the event | ||
| // that the value is not masked | ||
| return this->Append(item); // perhaps use AppendValue instead? | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| virtual Status Close() { | ||
| if (chunks_.size() == 0 || builder_->length() > 0) { | ||
|
|
@@ -467,44 +489,16 @@ class TypedConverter : public SeqConverter { | |
| using BuilderType = typename TypeTraits<Type>::BuilderType; | ||
|
|
||
| Status Init(ArrayBuilder* builder) override { | ||
| builder_ = builder; | ||
| DCHECK_NE(builder_, nullptr); | ||
| RETURN_NOT_OK(SeqConverter::Init(builder)); | ||
| typed_builder_ = checked_cast<BuilderType*>(builder); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| // Append a missing item (default implementation) | ||
| Status AppendNull() override { return this->typed_builder_->AppendNull(); } | ||
|
|
||
| // Append null if the obj is None or pandas null otherwise the valid value | ||
| Status Append(PyObject* obj) override { | ||
| // Append null if the obj is None or pandas null otherwise the valid value | ||
| return NullChecker<null_coding>::Check(obj) ? AppendNull() : AppendValue(obj); | ||
| } | ||
|
|
||
| Status Extend(PyObject* obj, int64_t size) override { | ||
| /// Ensure we've allocated enough space | ||
| RETURN_NOT_OK(typed_builder_->Reserve(size)); | ||
| // Iterate over the items adding each one | ||
| return internal::VisitSequence( | ||
| obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); | ||
| } | ||
|
|
||
| Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { | ||
| /// Ensure we've allocated enough space | ||
| RETURN_NOT_OK(typed_builder_->Reserve(size)); | ||
| // Iterate over the items adding each one | ||
| return internal::VisitSequenceMasked( | ||
| obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { | ||
| if (is_masked) { | ||
| return this->AppendNull(); | ||
| } else { | ||
| // This will also apply the null-checking convention in the event | ||
| // that the value is not masked | ||
| return this->Append(item); // perhaps use AppendValue instead? | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| protected: | ||
| BuilderType* typed_builder_; | ||
| }; | ||
|
|
@@ -903,6 +897,75 @@ class FixedSizeListConverter : public BaseListConverter<FixedSizeListType, null_ | |
| int64_t list_size_; | ||
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // Convert dictionary | ||
|
|
||
| template <typename ValueType, NullCoding null_coding> | ||
| class DictionaryConverter : public SeqConverter { | ||
| public: | ||
| using BuilderType = DictionaryBuilder<ValueType>; | ||
|
|
||
| Status Init(ArrayBuilder* builder) override { | ||
| RETURN_NOT_OK(SeqConverter::Init(builder)); | ||
| typed_builder_ = checked_cast<BuilderType*>(builder); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| Status Append(PyObject* obj) override { | ||
| // Append null if the obj is None or pandas null otherwise the valid value | ||
| return NullChecker<null_coding>::Check(obj) ? AppendNull() : AppendValue(obj); | ||
| } | ||
|
|
||
| protected: | ||
| BuilderType* typed_builder_; | ||
| }; | ||
|
|
||
| template <typename ValueType, NullCoding null_coding> | ||
| class PrimitiveDictionaryConverter : public DictionaryConverter<ValueType, null_coding> { | ||
| public: | ||
| Status AppendValue(PyObject* obj) override { | ||
| ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter<ValueType>::FromPython(obj)); | ||
| return this->typed_builder_->Append(value); | ||
| } | ||
| }; | ||
|
|
||
| template <typename ValueType, NullCoding null_coding> | ||
| class BinaryLikeDictionaryConverter : public DictionaryConverter<ValueType, null_coding> { | ||
| public: | ||
| Status AppendValue(PyObject* obj) override { | ||
| ARROW_ASSIGN_OR_RAISE(string_view_, ValueConverter<ValueType>::FromPython(obj)); | ||
| // DCHECK_GE(string_view_.size, 0); | ||
| RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't there a |
||
| static_cast<int32_t>(string_view_.size))); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| protected: | ||
| // Create a single instance of PyBytesView here to prevent unnecessary object | ||
| // creation/destruction | ||
| PyBytesView string_view_; | ||
| }; | ||
|
|
||
| template <NullCoding null_coding> | ||
| class FixedSizeBinaryDictionaryConverter | ||
| : public DictionaryConverter<FixedSizeBinaryType, null_coding> { | ||
| public: | ||
| explicit FixedSizeBinaryDictionaryConverter(int32_t byte_width) | ||
| : byte_width_(byte_width) {} | ||
|
|
||
| Status AppendValue(PyObject* obj) override { | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| string_view_, ValueConverter<FixedSizeBinaryType>::FromPython(obj, byte_width_)); | ||
| RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here: it would be good to have |
||
| static_cast<int32_t>(string_view_.size))); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| protected: | ||
| int32_t byte_width_; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add |
||
| PyBytesView string_view_; | ||
| }; | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // Convert maps | ||
|
|
||
|
|
@@ -1123,6 +1186,53 @@ class DecimalConverter : public TypedConverter<arrow::Decimal128Type, null_codin | |
| std::shared_ptr<DecimalType> decimal_type_; | ||
| }; | ||
|
|
||
| #define DICTIONARY_PRIMITIVE(TYPE_ENUM, TYPE_CLASS) \ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please |
||
| case Type::TYPE_ENUM: \ | ||
| *out = std::unique_ptr<SeqConverter>( \ | ||
| new PrimitiveDictionaryConverter<TYPE_CLASS, null_coding>); \ | ||
| break; | ||
|
|
||
| #define DICTIONARY_BINARY_LIKE(TYPE_ENUM, TYPE_CLASS) \ | ||
| case Type::TYPE_ENUM: \ | ||
| *out = std::unique_ptr<SeqConverter>( \ | ||
| new BinaryLikeDictionaryConverter<TYPE_CLASS, null_coding>); \ | ||
| break; | ||
|
|
||
| template <NullCoding null_coding> | ||
| Status GetDictionaryConverter(const std::shared_ptr<DataType>& type, | ||
| std::unique_ptr<SeqConverter>* out) { | ||
| const auto& dict_type = checked_cast<const DictionaryType&>(*type); | ||
| const auto& value_type = dict_type.value_type(); | ||
|
|
||
| switch (value_type->id()) { | ||
| DICTIONARY_PRIMITIVE(BOOL, BooleanType); | ||
| DICTIONARY_PRIMITIVE(INT8, Int8Type); | ||
| DICTIONARY_PRIMITIVE(INT16, Int16Type); | ||
| DICTIONARY_PRIMITIVE(INT32, Int32Type); | ||
| DICTIONARY_PRIMITIVE(INT64, Int64Type); | ||
| DICTIONARY_PRIMITIVE(UINT8, UInt8Type); | ||
| DICTIONARY_PRIMITIVE(UINT16, UInt16Type); | ||
| DICTIONARY_PRIMITIVE(UINT32, UInt32Type); | ||
| DICTIONARY_PRIMITIVE(UINT64, UInt64Type); | ||
| DICTIONARY_PRIMITIVE(HALF_FLOAT, HalfFloatType); | ||
| DICTIONARY_PRIMITIVE(FLOAT, FloatType); | ||
| DICTIONARY_PRIMITIVE(DOUBLE, DoubleType); | ||
| DICTIONARY_PRIMITIVE(DATE32, Date32Type); | ||
| DICTIONARY_PRIMITIVE(DATE64, Date64Type); | ||
| DICTIONARY_BINARY_LIKE(BINARY, BinaryType); | ||
| DICTIONARY_BINARY_LIKE(STRING, StringType); | ||
| case Type::FIXED_SIZE_BINARY: | ||
| *out = std::unique_ptr<SeqConverter>( | ||
| new FixedSizeBinaryDictionaryConverter<null_coding>( | ||
| checked_cast<const FixedSizeBinaryType&>(*value_type).byte_width())); | ||
| break; | ||
| default: | ||
| return Status::NotImplemented("Sequence converter for type ", type->ToString(), | ||
| " not implemented"); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| #define PRIMITIVE(TYPE_ENUM, TYPE) \ | ||
| case Type::TYPE_ENUM: \ | ||
| *out = std::unique_ptr<SeqConverter>(new PrimitiveConverter<TYPE, null_coding>); \ | ||
|
|
@@ -1279,6 +1389,12 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas, | |
| from_pandas, strict_conversions, ignore_timezone)); | ||
| } | ||
| return Status::OK(); | ||
| case Type::DICTIONARY: | ||
| if (from_pandas) { | ||
| return GetDictionaryConverter<NullCoding::PANDAS_SENTINELS>(type, out); | ||
| } else { | ||
| return GetDictionaryConverter<NullCoding::NONE_ONLY>(type, out); | ||
| } | ||
| default: | ||
| break; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -687,6 +687,44 @@ cdef class DictionaryScalar(Scalar): | |
| Concrete class for dictionary-encoded scalars. | ||
| """ | ||
|
|
||
| def __init__(self, index, dictionary, type): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can put this in some
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (and can you also add some tests for this construction method?) |
||
| cdef: | ||
| CDictionaryValue value | ||
| shared_ptr[CDictionaryScalar] wrapped | ||
| DataType type_ | ||
| Scalar index_ | ||
| Array dictionary_ | ||
|
|
||
| type_ = ensure_type(type, allow_none=False) | ||
| if not isinstance(type_, DictionaryType): | ||
| raise TypeError('Must pass a DictionaryType instance') | ||
|
|
||
| if isinstance(index, Scalar): | ||
| if not index.type.equals(type.index_type): | ||
| raise TypeError("The Scalar value passed as index must have " | ||
| "identical type to the dictionary type's " | ||
| "index_type") | ||
| index_ = index | ||
| else: | ||
| index_ = scalar(index, type=type_.index_type) | ||
|
|
||
| if isinstance(dictionary, Array): | ||
| if not dictionary.type.equals(type.value_type): | ||
| raise TypeError("The Array passed as dictionary must have " | ||
| "identical type to the dictionary type's " | ||
| "value_type") | ||
| dictionary_ = dictionary | ||
| else: | ||
| dictionary_ = array(dictionary, type=type_.value_type) | ||
|
|
||
| value.index = pyarrow_unwrap_scalar(index_) | ||
| value.dictionary = pyarrow_unwrap_array(dictionary_) | ||
|
|
||
| wrapped = make_shared[CDictionaryScalar]( | ||
| value, pyarrow_unwrap_data_type(type_) | ||
| ) | ||
| self.init(<shared_ptr[CScalar]> wrapped) | ||
|
|
||
| @property | ||
| def index(self): | ||
| """ | ||
|
|
@@ -727,6 +765,9 @@ cdef class DictionaryScalar(Scalar): | |
| "please use the `value` property instead", FutureWarning) | ||
| return self.value | ||
|
|
||
| def __reduce__(self): | ||
| return DictionaryScalar, (self.index, self.dictionary, self.type) | ||
|
|
||
|
|
||
| cdef class UnionScalar(Scalar): | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why?