diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9a6e9d1c4d6..b6d436ffa2b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import warnings + cdef _sequence_to_array(object sequence, object mask, object size, DataType type, CMemoryPool* pool, c_bool from_pandas): @@ -84,6 +86,19 @@ cdef _ndarray_to_array(object values, object mask, DataType type, return pyarrow_wrap_array(chunked_out.get().chunk(0)) +cdef _codes_to_indices(object codes, object mask, DataType type, + MemoryPool memory_pool): + """ + Convert the codes of a pandas Categorical to indices for a pyarrow + DictionaryArray, taking into account missing values + mask + """ + if mask is None: + mask = codes == -1 + else: + mask = mask | (codes == -1) + return array(codes, mask=mask, type=type, memory_pool=memory_pool) + + def _handle_arrow_array_protocol(obj, type, mask, size): if mask is not None or size is not None: raise ValueError( @@ -199,11 +214,50 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if hasattr(values, '__arrow_array__'): return _handle_arrow_array_protocol(values, type, mask, size) elif pandas_api.is_categorical(values): + if type is not None: + if type.id != Type_DICTIONARY: + return _ndarray_to_array( + np.asarray(values), mask, type, c_from_pandas, safe, + pool) + index_type = type.index_type + value_type = type.value_type + if values.ordered != type.ordered: + warnings.warn( + "The 'ordered' flag of the passed categorical values " + "does not match the 'ordered' of the specified type. 
" + "Using the flag of the values, but in the future this " + "mismatch will raise a ValueError.", + FutureWarning, stacklevel=2) + else: + index_type = None + value_type = None + + indices = _codes_to_indices( + values.codes, mask, index_type, memory_pool) + try: + dictionary = array( + values.categories.values, type=value_type, + memory_pool=memory_pool) + except TypeError: + # TODO when removing the deprecation warning, this whole + # try/except can be removed (to bubble the TypeError of + # the first array(..) call) + if value_type is not None: + warnings.warn( + "The dtype of the 'categories' of the passed " + "categorical values ({0}) does not match the " + "specified type ({1}). For now ignoring the specified " + "type, but in the future this mismatch will raise a " + "TypeError".format( + values.categories.dtype, value_type), + FutureWarning, stacklevel=2) + dictionary = array( + values.categories.values, memory_pool=memory_pool) + else: + raise + return DictionaryArray.from_arrays( - values.codes, values.categories.values, - mask=mask, ordered=values.ordered, - from_pandas=True, safe=safe, - memory_pool=memory_pool) + indices, dictionary, ordered=values.ordered, safe=safe) else: if pandas_api.have_pandas: values, type = pandas_api.compat.get_datetimetz_type( @@ -1543,11 +1597,9 @@ cdef class DictionaryArray(Array): _indices = indices else: if from_pandas: - if mask is None: - mask = indices == -1 - else: - mask = mask | (indices == -1) - _indices = array(indices, mask=mask, memory_pool=memory_pool) + _indices = _codes_to_indices(indices, mask, None, memory_pool) + else: + _indices = array(indices, mask=mask, memory_pool=memory_pool) if isinstance(dictionary, Array): _dictionary = dictionary diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 441395cac9f..3476c7448d2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3212,6 +3212,80 @@ def 
test_variable_dictionary_to_pandas(): tm.assert_series_equal(result_dense, expected_dense) +def test_dictionary_from_pandas(): + cat = pd.Categorical([u'a', u'b', u'a']) + expected_type = pa.dictionary(pa.int8(), pa.string()) + + result = pa.array(cat) + assert result.to_pylist() == ['a', 'b', 'a'] + assert result.type.equals(expected_type) + + # with missing values in categorical + cat = pd.Categorical([u'a', u'b', None, u'a']) + + result = pa.array(cat) + assert result.to_pylist() == ['a', 'b', None, 'a'] + assert result.type.equals(expected_type) + + # with additional mask + result = pa.array(cat, mask=np.array([False, False, False, True])) + assert result.to_pylist() == ['a', 'b', None, None] + assert result.type.equals(expected_type) + + +def test_dictionary_from_pandas_specified_type(): + # ARROW-7168 - ensure specified type is always respected + + # the same as cat = pd.Categorical(['a', 'b']) but explicit about dtypes + cat = pd.Categorical.from_codes( + np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object)) + + # different index type -> allow this + # (the type of the 'codes' in pandas is not part of the data type) + typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + assert result.to_pylist() == ['a', 'b'] + + # mismatching values type -> raise error (for now a deprecation warning) + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) + with pytest.warns(FutureWarning): + result = pa.array(cat, type=typ) + assert result.to_pylist() == ['a', 'b'] + + # mismatching order -> raise error (for now a deprecation warning) + typ = pa.dictionary( + index_type=pa.int8(), value_type=pa.string(), ordered=True) + with pytest.warns(FutureWarning, match="The 'ordered' flag of the passed"): + result = pa.array(cat, type=typ) + assert result.to_pylist() == ['a', 'b'] + + # with mask + typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) + result = 
pa.array(cat, type=typ, mask=np.array([False, True])) + assert result.type.equals(typ) + assert result.to_pylist() == ['a', None] + + # empty categorical -> be flexible and allow any values type + cat = pd.Categorical([]) + + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.string()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + assert result.to_pylist() == [] + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + assert result.to_pylist() == [] + + # passing non-dictionary type + cat = pd.Categorical(['a', 'b']) + result = pa.array(cat, type=pa.string()) + expected = pa.array(['a', 'b'], type=pa.string()) + assert result.equals(expected) + assert result.to_pylist() == ['a', 'b'] + + # ---------------------------------------------------------------------- # Array protocol in pandas conversions tests