Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 61 additions & 9 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import warnings


cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
Expand Down Expand Up @@ -84,6 +86,19 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
return pyarrow_wrap_array(chunked_out.get().chunk(0))


cdef _codes_to_indices(object codes, object mask, DataType type,
                       MemoryPool memory_pool):
    """
    Build the indices of a pyarrow DictionaryArray from the codes of a
    pandas Categorical.

    Positions whose code equals -1 are treated as missing and are masked
    out, in addition to any positions already flagged by ``mask``.
    """
    missing = codes == -1
    if mask is not None:
        # combine the caller-supplied mask with the missing-value mask
        missing = mask | missing
    return array(codes, mask=missing, type=type, memory_pool=memory_pool)


def _handle_arrow_array_protocol(obj, type, mask, size):
if mask is not None or size is not None:
raise ValueError(
Expand Down Expand Up @@ -199,11 +214,50 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
if hasattr(values, '__arrow_array__'):
return _handle_arrow_array_protocol(values, type, mask, size)
elif pandas_api.is_categorical(values):
if type is not None:
if type.id != Type_DICTIONARY:
return _ndarray_to_array(
np.asarray(values), mask, type, c_from_pandas, safe,
pool)
index_type = type.index_type
value_type = type.value_type
if values.ordered != type.ordered:
warnings.warn(
"The 'ordered' flag of the passed categorical values "
"does not match the 'ordered' of the specified type. "
"Using the flag of the values, but in the future this "
"mismatch will raise a ValueError.",
FutureWarning, stacklevel=2)
else:
index_type = None
value_type = None

indices = _codes_to_indices(
values.codes, mask, index_type, memory_pool)
try:
dictionary = array(
values.categories.values, type=value_type,
memory_pool=memory_pool)
except TypeError:
# TODO when removing the deprecation warning, this whole
# try/except can be removed (to bubble the TypeError of
# the first array(..) call)
if value_type is not None:
warnings.warn(
"The dtype of the 'categories' of the passed "
"categorical values ({0}) does not match the "
"specified type ({1}). For now ignoring the specified "
"type, but in the future this mismatch will raise a "
"TypeError".format(
values.categories.dtype, value_type),
FutureWarning, stacklevel=2)
dictionary = array(
values.categories.values, memory_pool=memory_pool)
else:
raise

return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
from_pandas=True, safe=safe,
memory_pool=memory_pool)
indices, dictionary, ordered=values.ordered, safe=safe)
else:
if pandas_api.have_pandas:
values, type = pandas_api.compat.get_datetimetz_type(
Expand Down Expand Up @@ -1543,11 +1597,9 @@ cdef class DictionaryArray(Array):
_indices = indices
else:
if from_pandas:
if mask is None:
mask = indices == -1
else:
mask = mask | (indices == -1)
_indices = array(indices, mask=mask, memory_pool=memory_pool)
_indices = _codes_to_indices(indices, mask, None, memory_pool)
else:
_indices = array(indices, mask=mask, memory_pool=memory_pool)

if isinstance(dictionary, Array):
_dictionary = dictionary
Expand Down
74 changes: 74 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3212,6 +3212,80 @@ def test_variable_dictionary_to_pandas():
tm.assert_series_equal(result_dense, expected_dense)


def test_dictionary_from_pandas():
    """
    Convert pandas Categoricals with ``pa.array`` and check both the
    resulting values and the resulting dictionary type.
    """
    dict_type = pa.dictionary(pa.int8(), pa.string())
    complete = pd.Categorical([u'a', u'b', u'a'])
    with_missing = pd.Categorical([u'a', u'b', None, u'a'])
    extra_mask = np.array([False, False, False, True])

    # (categorical, mask, expected python values)
    cases = [
        # plain categorical
        (complete, None, ['a', 'b', 'a']),
        # with missing values in categorical
        (with_missing, None, ['a', 'b', None, 'a']),
        # with additional mask
        (with_missing, extra_mask, ['a', 'b', None, None]),
    ]
    for categorical, mask, expected in cases:
        converted = pa.array(categorical, mask=mask)
        assert converted.to_pylist() == expected
        assert converted.type.equals(dict_type)


def test_dictionary_from_pandas_specified_type():
    """
    Check how ``pa.array`` treats an explicitly passed ``type`` when the
    input is a pandas Categorical.

    Covers a differing index type, a mismatching value type and a
    mismatching ``ordered`` flag (both currently a FutureWarning), an
    additional mask, an empty categorical and a non-dictionary type.
    """
    # ARROW-7168 - ensure specified type is always respected

    # the same as cat = pd.Categorical(['a', 'b']) but explicit about dtypes
    cat = pd.Categorical.from_codes(
        np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object))

    # different index type -> allow this
    # (the type of the 'codes' in pandas is not part of the data type)
    typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string())
    result = pa.array(cat, type=typ)
    assert result.type.equals(typ)
    assert result.to_pylist() == ['a', 'b']

    # mismatching values type -> raise error (for now a deprecation warning)
    # match on the message so an unrelated FutureWarning cannot satisfy
    # this check
    typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
    with pytest.warns(FutureWarning,
                      match="The dtype of the 'categories' of the passed"):
        result = pa.array(cat, type=typ)
    assert result.to_pylist() == ['a', 'b']

    # mismatching order -> raise error (for now a deprecation warning)
    typ = pa.dictionary(
        index_type=pa.int8(), value_type=pa.string(), ordered=True)
    with pytest.warns(FutureWarning, match="The 'ordered' flag of the passed"):
        result = pa.array(cat, type=typ)
    assert result.to_pylist() == ['a', 'b']
    # the warning states that the flag of the values is used for now
    assert not result.type.ordered

    # with mask
    typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string())
    result = pa.array(cat, type=typ, mask=np.array([False, True]))
    assert result.type.equals(typ)
    assert result.to_pylist() == ['a', None]

    # empty categorical -> be flexible in values type to allow
    cat = pd.Categorical([])

    typ = pa.dictionary(index_type=pa.int8(), value_type=pa.string())
    result = pa.array(cat, type=typ)
    assert result.type.equals(typ)
    assert result.to_pylist() == []
    typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64())
    result = pa.array(cat, type=typ)
    assert result.type.equals(typ)
    assert result.to_pylist() == []

    # passing non-dictionary type
    cat = pd.Categorical(['a', 'b'])
    result = pa.array(cat, type=pa.string())
    expected = pa.array(['a', 'b'], type=pa.string())
    assert result.equals(expected)
    assert result.to_pylist() == ['a', 'b']


# ----------------------------------------------------------------------
# Array protocol in pandas conversions tests

Expand Down