diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 32ea77f114e..14f04ea4921 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -918,15 +918,6 @@ cdef class BinaryArray(Array):
 
 cdef class DictionaryArray(Array):
 
-    cdef getitem(self, int64_t i):
-        cdef Array dictionary = self.dictionary
-        index = self.indices[i]
-        if index is NA:
-            return index
-        else:
-            return box_scalar(dictionary.type, dictionary.sp_array,
-                              index.as_py())
-
     def dictionary_encode(self):
         return self
 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 04ecc9cf46e..adb5fa6c2af 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -402,6 +402,29 @@ cdef class StructValue(ArrayValue):
             zip(child_names, wrapped_arrays)
         }
 
+cdef class DictionaryValue(ArrayValue):
+
+    def as_py(self):
+        return self.dictionary_value.as_py()
+
+    property index_value:
+
+        def __get__(self):
+            cdef CDictionaryArray* darr
+
+            darr = <CDictionaryArray*>(self.sp_array.get())
+            indices = pyarrow_wrap_array(darr.indices())
+            return indices[self.index]
+
+    property dictionary_value:
+
+        def __get__(self):
+            cdef CDictionaryArray* darr
+
+            darr = <CDictionaryArray*>(self.sp_array.get())
+            dictionary = pyarrow_wrap_array(darr.dictionary())
+            return dictionary[self.index_value.as_py()]
+
 
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanValue,
@@ -428,6 +451,7 @@ cdef dict _scalar_classes = {
     _Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue,
     _Type_DECIMAL: DecimalValue,
     _Type_STRUCT: StructValue,
+    _Type_DICTIONARY: DictionaryValue,
 }
 
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 7ed2cc9b7e5..6d7892a47a1 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -841,3 +841,13 @@ def test_struct_array_flatten():
     xs, ys = a[1:].flatten()
     assert xs.to_pylist() == [None, None]
     assert ys.to_pylist() == [None, 2.5]
+
+
+def test_nested_dictionary_array():
+    dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
+    list_arr = pa.ListArray.from_arrays([0, 2, 3], dict_arr)
+    assert list_arr.to_pylist() == [['a', 'b'], ['a']]
+
+    dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
+    dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr)
+    assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 9c86270c454..0a5c72a427b 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -180,6 +180,7 @@ def test_timestamp(self):
 
     def test_dictionary(self):
         colors = ['red', 'green', 'blue']
+        colors_dict = {'red': 0, 'green': 1, 'blue': 2}
         values = pd.Series(colors * 4)
 
         categorical = pd.Categorical(values, categories=colors)
@@ -188,6 +189,8 @@ def test_dictionary(self):
                                            categorical.categories)
         for i, c in enumerate(values):
             assert v[i].as_py() == c
+            assert v[i].dictionary_value == c
+            assert v[i].index_value == colors_dict[c]
 
     def test_int_hash(self):
         # ARROW-640