diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 6be07d6ca75..43988d256b5 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -40,8 +40,8 @@ struct DictionaryBuilderCase {
     return CreateFor<ValueType>();
   }
 
-  Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); }
-  Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); }
+  Status Visit(const BinaryType&) { return CreateFor<BinaryType>(); }
+  Status Visit(const StringType&) { return CreateFor<StringType>(); }
   Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); }
   Status Visit(const DataType& value_type) { return NotImplemented(value_type); }
 
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 098f29896c2..39f7226cdbd 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -395,7 +395,7 @@ struct ValueConverter> {
 };
 
 // ----------------------------------------------------------------------
-// Sequence converter base and CRTP "middle" subclasses
+// Sequence converter base
 
 class SeqConverter;
 
@@ -414,24 +414,46 @@ class SeqConverter {
   // arrow::MakeBuilder which also creates child builders for nested types, so
   // we have to pass in the child builders to child SeqConverter in the case of
   // converting Python objects to Arrow nested types
-  virtual Status Init(ArrayBuilder* builder) = 0;
+  virtual Status Init(ArrayBuilder* builder) {
+    builder_ = builder;
+    DCHECK_NE(builder_, nullptr);
+    return Status::OK();
+  }
 
   // Append a single null value to the builder
-  virtual Status AppendNull() = 0;
+  virtual Status AppendNull() { return this->builder_->AppendNull(); }
 
   // Append a valid value
-  virtual Status AppendValue(PyObject* seq) = 0;
+  virtual Status AppendValue(PyObject* obj) = 0;
 
   // Append a single python object handling Null values
-  virtual Status Append(PyObject* seq) = 0;
+  virtual Status Append(PyObject* obj) = 0;
 
-  // Append the contents of a Python sequence to the underlying builder,
-  // virtual version
-  virtual Status Extend(PyObject* seq, int64_t size) = 0;
+  // Append the contents of a Python sequence to the underlying builder
+  virtual Status Extend(PyObject* obj, int64_t size) {
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(builder_->Reserve(size));
+    // Iterate over the items adding each one
+    return internal::VisitSequence(
+        obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
+  }
 
-  // Append the contents of a Python sequence to the underlying builder,
-  // virtual version
-  virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0;
+  // Append the contents of a Python sequence to the underlying builder
+  virtual Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) {
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(builder_->Reserve(size));
+    // Iterate over the items adding each one
+    return internal::VisitSequenceMasked(
+        obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) {
+          if (is_masked) {
+            return this->AppendNull();
+          } else {
+            // This will also apply the null-checking convention in the event
+            // that the value is not masked
+            return this->Append(item);  // perhaps use AppendValue instead?
+ } + }); + } virtual Status Close() { if (chunks_.size() == 0 || builder_->length() > 0) { @@ -467,44 +489,16 @@ class TypedConverter : public SeqConverter { using BuilderType = typename TypeTraits::BuilderType; Status Init(ArrayBuilder* builder) override { - builder_ = builder; - DCHECK_NE(builder_, nullptr); + RETURN_NOT_OK(SeqConverter::Init(builder)); typed_builder_ = checked_cast(builder); return Status::OK(); } - // Append a missing item (default implementation) - Status AppendNull() override { return this->typed_builder_->AppendNull(); } - - // Append null if the obj is None or pandas null otherwise the valid value Status Append(PyObject* obj) override { + // Append null if the obj is None or pandas null otherwise the valid value return NullChecker::Check(obj) ? AppendNull() : AppendValue(obj); } - Status Extend(PyObject* obj, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence( - obj, [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); - } - - Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequenceMasked( - obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { - if (is_masked) { - return this->AppendNull(); - } else { - // This will also apply the null-checking convention in the event - // that the value is not masked - return this->Append(item); // perhaps use AppendValue instead? - } - }); - } - protected: BuilderType* typed_builder_; }; @@ -903,6 +897,75 @@ class FixedSizeListConverter : public BaseListConverter +class DictionaryConverter : public SeqConverter { + public: + using BuilderType = DictionaryBuilder; + + Status Init(ArrayBuilder* builder) override { + RETURN_NOT_OK(SeqConverter::Init(builder)); + typed_builder_ = checked_cast(builder); + return Status::OK(); + } + + Status Append(PyObject* obj) override { + // Append null if the obj is None or pandas null otherwise the valid value + return NullChecker::Check(obj) ? 
+  }
+
+ protected:
+  BuilderType* typed_builder_;
+};
+
+template <typename Type, NullCoding null_coding>
+class PrimitiveDictionaryConverter : public DictionaryConverter<Type, null_coding> {
+ public:
+  Status AppendValue(PyObject* obj) override {
+    ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter<Type>::FromPython(obj));
+    return this->typed_builder_->Append(value);
+  }
+};
+
+template <typename Type, NullCoding null_coding>
+class BinaryLikeDictionaryConverter : public DictionaryConverter<Type, null_coding> {
+ public:
+  Status AppendValue(PyObject* obj) override {
+    ARROW_ASSIGN_OR_RAISE(string_view_, ValueConverter<Type>::FromPython(obj));
+    // DCHECK_GE(string_view_.size, 0);
+    RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
+                                               static_cast<int32_t>(string_view_.size)));
+    return Status::OK();
+  }
+
+ protected:
+  // Create a single instance of PyBytesView here to prevent unnecessary object
+  // creation/destruction
+  PyBytesView string_view_;
+};
+
+template <NullCoding null_coding>
+class FixedSizeBinaryDictionaryConverter
+    : public DictionaryConverter<FixedSizeBinaryType, null_coding> {
+ public:
+  explicit FixedSizeBinaryDictionaryConverter(int32_t byte_width)
+      : byte_width_(byte_width) {}
+
+  Status AppendValue(PyObject* obj) override {
+    ARROW_ASSIGN_OR_RAISE(
+        string_view_, ValueConverter<FixedSizeBinaryType>::FromPython(obj, byte_width_));
+    RETURN_NOT_OK(this->typed_builder_->Append(string_view_.bytes,
+                                               static_cast<int32_t>(string_view_.size)));
+    return Status::OK();
+  }
+
+ protected:
+  int32_t byte_width_;
+  PyBytesView string_view_;
+};
+
 // ----------------------------------------------------------------------
 // Convert maps
 
@@ -1123,6 +1186,53 @@ class DecimalConverter : public TypedConverter
   std::shared_ptr<DecimalType> decimal_type_;
 };
 
+#define DICTIONARY_PRIMITIVE(TYPE_ENUM, TYPE_CLASS)                 \
+  case Type::TYPE_ENUM:                                             \
+    *out = std::unique_ptr<SeqConverter>(                           \
+        new PrimitiveDictionaryConverter<TYPE_CLASS, null_coding>); \
+    break;
+
+#define DICTIONARY_BINARY_LIKE(TYPE_ENUM, TYPE_CLASS)                \
+  case Type::TYPE_ENUM:                                               \
+    *out = std::unique_ptr<SeqConverter>(                             \
+        new BinaryLikeDictionaryConverter<TYPE_CLASS, null_coding>);  \
+    break;
+
+template <NullCoding null_coding>
+Status GetDictionaryConverter(const std::shared_ptr<DataType>& type,
+                              std::unique_ptr<SeqConverter>* out) {
+  const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+  const auto& value_type = dict_type.value_type();
+
+  switch (value_type->id()) {
+    DICTIONARY_PRIMITIVE(BOOL, BooleanType);
+    DICTIONARY_PRIMITIVE(INT8, Int8Type);
+    DICTIONARY_PRIMITIVE(INT16, Int16Type);
+    DICTIONARY_PRIMITIVE(INT32, Int32Type);
+    DICTIONARY_PRIMITIVE(INT64, Int64Type);
+    DICTIONARY_PRIMITIVE(UINT8, UInt8Type);
+    DICTIONARY_PRIMITIVE(UINT16, UInt16Type);
+    DICTIONARY_PRIMITIVE(UINT32, UInt32Type);
+    DICTIONARY_PRIMITIVE(UINT64, UInt64Type);
+    DICTIONARY_PRIMITIVE(HALF_FLOAT, HalfFloatType);
+    DICTIONARY_PRIMITIVE(FLOAT, FloatType);
+    DICTIONARY_PRIMITIVE(DOUBLE, DoubleType);
+    DICTIONARY_PRIMITIVE(DATE32, Date32Type);
+    DICTIONARY_PRIMITIVE(DATE64, Date64Type);
+    DICTIONARY_BINARY_LIKE(BINARY, BinaryType);
+    DICTIONARY_BINARY_LIKE(STRING, StringType);
+    case Type::FIXED_SIZE_BINARY:
+      *out = std::unique_ptr<SeqConverter>(
+          new FixedSizeBinaryDictionaryConverter<null_coding>(
+              checked_cast<const FixedSizeBinaryType&>(*value_type).byte_width()));
+      break;
+    default:
+      return Status::NotImplemented("Sequence converter for type ", type->ToString(),
+                                    " not implemented");
+  }
+  return Status::OK();
+}
+
 #define PRIMITIVE(TYPE_ENUM, TYPE)                                       \
   case Type::TYPE_ENUM:                                                  \
     *out = std::unique_ptr<SeqConverter>(new PrimitiveConverter<TYPE, null_coding>); \
@@ -1279,6 +1389,12 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
                                  from_pandas, strict_conversions, ignore_timezone));
       }
       return Status::OK();
+    case Type::DICTIONARY:
+      if (from_pandas) {
+        return GetDictionaryConverter<NullCoding::PANDAS_SENTINELS>(type, out);
+      } else {
+        return GetDictionaryConverter<NullCoding::NONE_ONLY>(type, out);
+      }
     default:
       break;
   }
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index f25e376946e..654860e3957 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -969,11 +969,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         vector[shared_ptr[CScalar]] value
         CResult[shared_ptr[CScalar]] field(CFieldRef ref) const
 
-    cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
-        cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
-            shared_ptr[CScalar] index
-            shared_ptr[CArray] dictionary
+    cdef cppclass CDictionaryValue "arrow::DictionaryScalar::ValueType":
+        shared_ptr[CScalar] index
+        shared_ptr[CArray] dictionary
 
+    cdef cppclass CDictionaryScalar" arrow::DictionaryScalar"(CScalar):
         CDictionaryValue value
 
         CResult[shared_ptr[CScalar]] GetEncodedValue()
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index cc06ca6a2f9..564a9848ac1 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -687,6 +687,44 @@ cdef class DictionaryScalar(Scalar):
     Concrete class for dictionary-encoded scalars.
     """
 
+    def __init__(self, index, dictionary, type):
+        cdef:
+            CDictionaryValue value
+            shared_ptr[CDictionaryScalar] wrapped
+            DataType type_
+            Scalar index_
+            Array dictionary_
+
+        type_ = ensure_type(type, allow_none=False)
+        if not isinstance(type_, DictionaryType):
+            raise TypeError('Must pass a DictionaryType instance')
+
+        if isinstance(index, Scalar):
+            if not index.type.equals(type.index_type):
+                raise TypeError("The Scalar value passed as index must have "
+                                "identical type to the dictionary type's "
+                                "index_type")
+            index_ = index
+        else:
+            index_ = scalar(index, type=type_.index_type)
+
+        if isinstance(dictionary, Array):
+            if not dictionary.type.equals(type.value_type):
+                raise TypeError("The Array passed as dictionary must have "
+                                "identical type to the dictionary type's "
+                                "value_type")
+            dictionary_ = dictionary
+        else:
+            dictionary_ = array(dictionary, type=type_.value_type)
+
+        value.index = pyarrow_unwrap_scalar(index_)
+        value.dictionary = pyarrow_unwrap_array(dictionary_)
+
+        wrapped = make_shared[CDictionaryScalar](
+            value, pyarrow_unwrap_data_type(type_)
+        )
+        self.init(<shared_ptr[CScalar]> wrapped)
+
     @property
     def index(self):
         """
@@ -727,6 +765,9 @@ cdef class DictionaryScalar(Scalar):
             "please use the `value` property instead", FutureWarning)
         return self.value
 
+    def __reduce__(self):
+        return DictionaryScalar, (self.index, self.dictionary, self.type)
+
 
 cdef class UnionScalar(Scalar):
     """
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index d50d283265b..1e92892d6b8 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -1640,3 +1640,85 @@ def test_map_from_tuples():
     for entry in [[(5,)], [()], [('5', 'foo', True)]]:
         with pytest.raises(ValueError, match="(?i)tuple size"):
             pa.array([entry], type=pa.map_('i4', 'i4'))
+
+
+def test_dictionary_from_boolean():
+    typ = pa.dictionary(pa.int8(), value_type=pa.bool_())
+    a = pa.array([False, False, True, False, True], type=typ)
+    assert isinstance(a.type, pa.DictionaryType)
+    assert a.type.equals(typ)
+
+    expected_indices = pa.array([0, 0, 1, 0, 1], type=pa.int8())
+    expected_dictionary = pa.array([False, True], type=pa.bool_())
+    assert a.indices.equals(expected_indices)
+    assert a.dictionary.equals(expected_dictionary)
+
+
+@pytest.mark.parametrize('value_type', [
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64(),
+    pa.uint8(),
+    pa.uint16(),
+    pa.uint32(),
+    pa.uint64(),
+    pa.float32(),
+    pa.float64(),
+    pa.date32(),
+    pa.date64(),
+])
+def test_dictionary_from_integers(value_type):
+    typ = pa.dictionary(pa.int8(), value_type=value_type)
+    a = pa.array([1, 2, 1, 1, 2, 3], type=typ)
+    assert isinstance(a.type, pa.DictionaryType)
+    assert a.type.equals(typ)
+
+    expected_indices = pa.array([0, 1, 0, 0, 1, 2], type=pa.int8())
+    expected_dictionary = pa.array([1, 2, 3], type=value_type)
+    assert a.indices.equals(expected_indices)
+    assert a.dictionary.equals(expected_dictionary)
+
+
+@pytest.mark.parametrize('input_index_type', [
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64()
+])
+def test_dictionary_is_always_adaptive(input_index_type):
+    # dictionary array is constructed using adaptive index type builder,
+    # meaning that the input index type is ignored since the output index
+    # type depends on the input data
+    typ = pa.dictionary(input_index_type, value_type=pa.int64())
+
+    a = pa.array(range(2**7), type=typ)
+    expected = pa.dictionary(pa.int8(), pa.int64())
+    assert a.type.equals(expected)
+
+    a = pa.array(range(2**7 + 1), type=typ)
+    expected = pa.dictionary(pa.int16(), pa.int64())
+    assert a.type.equals(expected)
+
+
+def test_dictionary_from_strings():
+    for value_type in [pa.binary(), pa.string()]:
+        typ = pa.dictionary(pa.int8(), value_type)
+        a = pa.array(["", "a", "bb", "a", "bb", "ccc"], type=typ)
+        assert isinstance(a.type, pa.DictionaryType)
+
+        expected_indices = pa.array([0, 1, 2, 1, 2, 3], type=pa.int8())
+        expected_dictionary = pa.array(["", "a", "bb", "ccc"], type=value_type)
+        assert a.indices.equals(expected_indices)
+        assert a.dictionary.equals(expected_dictionary)
+
+    # fixed size binary type
+    typ = pa.dictionary(pa.int8(), pa.binary(3))
+    a = pa.array(["aaa", "aaa", "bbb", "ccc", "bbb"], type=typ)
+    assert isinstance(a.type, pa.DictionaryType)
+
+    expected_indices = pa.array([0, 0, 1, 2, 1], type=pa.int8())
+    expected_dictionary = pa.array(["aaa", "bbb", "ccc"], type=pa.binary(3))
+    assert a.indices.equals(expected_indices)
+    assert a.dictionary.equals(expected_dictionary)
+
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index e98c36d07fd..59af2eacd1e 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -132,7 +132,6 @@ def test_build_info():
     pa.FixedSizeListScalar,
     pa.UnionScalar,
     pa.StructScalar,
-    pa.DictionaryScalar,
     pa.ipc.Message,
     pa.ipc.MessageReader,
     pa.MemoryPool,
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 091ae38e6e4..297ea14b750 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -550,8 +550,8 @@ def test_dictionary():
         with pytest.warns(FutureWarning):
            assert s.dictionary_value.as_py() == v
 
-        with pytest.raises(pa.ArrowNotImplementedError):
-            pickle.loads(pickle.dumps(s))
+        restored = pickle.loads(pickle.dumps(s))
+        assert restored.equals(s)
 
 
 def test_union():