From 3535a5699e508cdc9578c81a7a35516cd54bd7c8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 19 Nov 2019 16:54:54 +0100 Subject: [PATCH 1/5] ARROW-7168: [Python] Respect the specified dictionary type when converting pd.Categorical --- python/pyarrow/array.pxi | 47 +++++++++++++++++++++++------ python/pyarrow/tests/test_pandas.py | 40 ++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 9a6e9d1c4d6..441e1c2fbad 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -84,6 +84,19 @@ cdef _ndarray_to_array(object values, object mask, DataType type, return pyarrow_wrap_array(chunked_out.get().chunk(0)) +cdef _codes_to_indices(object codes, object mask, DataType type, + MemoryPool memory_pool): + """ + Convert the codes of a pandas Categorical to indices for a pyarrow + DictionaryArray, taking into account missing values + mask + """ + if mask is None: + mask = codes == -1 + else: + mask = mask | (codes == -1) + return array(codes, mask=mask, type=type, memory_pool=memory_pool) + + def _handle_arrow_array_protocol(obj, type, mask, size): if mask is not None or size is not None: raise ValueError( @@ -199,11 +212,29 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, if hasattr(values, '__arrow_array__'): return _handle_arrow_array_protocol(values, type, mask, size) elif pandas_api.is_categorical(values): - return DictionaryArray.from_arrays( - values.codes, values.categories.values, - mask=mask, ordered=values.ordered, - from_pandas=True, safe=safe, + if type is not None: + if type.id != Type_DICTIONARY: + return _ndarray_to_array( + np.asarray(values), mask, type, c_from_pandas, safe, + pool) + index_type = type.index_type + value_type = type.value_type + if values.ordered != type.ordered: + raise ValueError( + "The 'ordered' flag of the passed categorical values " + "does not match the 'ordered' of the specified type") + else: + index_type = None + value_type = None + + indices = _codes_to_indices( + values.codes, mask, index_type, memory_pool) + dictionary = array( + values.categories.values, type=value_type, memory_pool=memory_pool) + + return DictionaryArray.from_arrays( + indices, dictionary, ordered=values.ordered, safe=safe) else: if pandas_api.have_pandas: values, type = pandas_api.compat.get_datetimetz_type( @@ -1543,11 +1574,9 @@ cdef class DictionaryArray(Array): _indices = indices else: if from_pandas: - if mask is None: - mask = indices == -1 - else: - mask = mask | (indices == -1) - _indices = array(indices, mask=mask, memory_pool=memory_pool) + _indices = _codes_to_indices(indices, mask, None, memory_pool) + else: + _indices = array(indices, mask=mask, memory_pool=memory_pool) if isinstance(dictionary, Array): _dictionary = dictionary diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 441395cac9f..dd7d8bba3dd 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3212,6 +3212,46 @@ def test_variable_dictionary_to_pandas(): tm.assert_series_equal(result_dense, expected_dense) +def test_dictionary_from_pandas_specified_type(): + # ARROW-7168 + # the same as cat = pd.Categorical(['a', 'b']) + cat = pd.Categorical.from_codes( + np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object)) + + # different index type -> allow this + # (the type of the 'codes' in pandas is not part of the data type) + typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + + # mismatching values type -> raise error + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) + with pytest.raises(TypeError): + pa.array(cat, type=typ) + + # mismatching order -> raise error + typ = pa.dictionary( + index_type=pa.int8(), value_type=pa.string(), ordered=True) + with pytest.raises(ValueError): + pa.array(cat, type=typ) + + # empty categorical -> be flexible in values type to allow + cat = pd.Categorical([]) + + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.string()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) + result = pa.array(cat, type=typ) + assert result.type.equals(typ) + + # passing non-dictionary type + cat = pd.Categorical(['a', 'b']) + result = pa.array(cat, type=pa.string()) + expected = pa.array(['a', 'b'], type=pa.string()) + assert result.equals(expected) + + # ---------------------------------------------------------------------- # Array protocol in pandas conversions tests From bfb82372badc8d8c5e81659f664f68f461c5350c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 20 Nov 2019 11:02:38 +0100 Subject: [PATCH 2/5] additional tests --- python/pyarrow/tests/test_pandas.py | 36 +++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index dd7d8bba3dd..cae6f7ea74c 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3212,9 +3212,31 @@ def test_variable_dictionary_to_pandas(): tm.assert_series_equal(result_dense, expected_dense) +def test_dictionary_from_pandas(): + cat = pd.Categorical(['a', 'b', 'a']) + expected_type = pa.dictionary(pa.int8(), pa.string()) + + result = pa.array(cat) + assert result.to_pylist() == ['a', 'b', 'a'] + assert result.type.equals(expected_type) + + # with missing values in categorical + cat = pd.Categorical(['a', 'b', None, 'a']) + + result = pa.array(cat) + assert result.to_pylist() == ['a', 'b', None, 'a'] + assert result.type.equals(expected_type) + + # with additional mask + result = pa.array(cat, mask=np.array([False, False, False, True])) + assert result.to_pylist() == ['a', 'b', None, None] + assert result.type.equals(expected_type) + + def test_dictionary_from_pandas_specified_type(): - # ARROW-7168 - # the same as cat = pd.Categorical(['a', 'b']) + # ARROW-7168 - ensure specified type is always respected + + # the same as cat = pd.Categorical(['a', 'b']) but explicit about dtypes cat = pd.Categorical.from_codes( np.array([0, 1], dtype='int8'), np.array(['a', 'b'], dtype=object)) @@ -3223,6 +3245,7 @@ def test_dictionary_from_pandas_specified_type(): typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) result = pa.array(cat, type=typ) assert result.type.equals(typ) + assert result.to_pylist() == ['a', 'b'] # mismatching values type -> raise error typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) @@ -3235,21 +3258,30 @@ def test_dictionary_from_pandas_specified_type(): with pytest.raises(ValueError): pa.array(cat, type=typ) + # with mask + typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) + result = pa.array(cat, type=typ, mask=np.array([False, True])) + assert result.type.equals(typ) + assert result.to_pylist() == ['a', None] + # empty categorical -> be flexible in values type to allow cat = pd.Categorical([]) typ = pa.dictionary(index_type=pa.int8(), value_type=pa.string()) result = pa.array(cat, type=typ) assert result.type.equals(typ) + assert result.to_pylist() == [] typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) result = pa.array(cat, type=typ) assert result.type.equals(typ) + assert result.to_pylist() == [] # passing non-dictionary type cat = pd.Categorical(['a', 'b']) result = pa.array(cat, type=pa.string()) expected = pa.array(['a', 'b'], type=pa.string()) assert result.equals(expected) + assert result.to_pylist() == ['a', 'b'] # ---------------------------------------------------------------------- From 003e6532b36ee2f83a7d58c8012f8037845c9b45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 20 Nov 2019 11:22:59 +0100 Subject: [PATCH 3/5] for now use deprecation warnings instead of error --- python/pyarrow/array.pxi | 33 ++++++++++++++++++++++++----- python/pyarrow/tests/test_pandas.py | 14 ++++++------ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 441e1c2fbad..b6d436ffa2b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import warnings + cdef _sequence_to_array(object sequence, object mask, object size, DataType type, CMemoryPool* pool, c_bool from_pandas): @@ -220,18 +222,39 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, index_type = type.index_type value_type = type.value_type if values.ordered != type.ordered: - raise ValueError( + warnings.warn( "The 'ordered' flag of the passed categorical values " - "does not match the 'ordered' of the specified type") + "does not match the 'ordered' of the specified type. " + "Using the flag of the values, but in the future this " + "mismatch will raise a ValueError.", + FutureWarning, stacklevel=2) else: index_type = None value_type = None indices = _codes_to_indices( values.codes, mask, index_type, memory_pool) - dictionary = array( - values.categories.values, type=value_type, - memory_pool=memory_pool) + try: + dictionary = array( + values.categories.values, type=value_type, + memory_pool=memory_pool) + except TypeError: + # TODO when removing the deprecation warning, this whole + # try/except can be removed (to bubble the TypeError of + # the first array(..) call) + if value_type is not None: + warnings.warn( + "The dtype of the 'categories' of the passed " + "categorical values ({0}) does not match the " + "specified type ({1}). For now ignoring the specified " + "type, but in the future this mismatch will raise a " + "TypeError".format( + values.categories.dtype, value_type), + FutureWarning, stacklevel=2) + dictionary = array( + values.categories.values, memory_pool=memory_pool) + else: + raise return DictionaryArray.from_arrays( indices, dictionary, ordered=values.ordered, safe=safe) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index cae6f7ea74c..93e337a4522 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3247,16 +3247,18 @@ def test_dictionary_from_pandas_specified_type(): assert result.type.equals(typ) assert result.to_pylist() == ['a', 'b'] - # mismatching values type -> raise error + # mismatching values type -> raise error (for now a deprecation warning) typ = pa.dictionary(index_type=pa.int8(), value_type=pa.int64()) - with pytest.raises(TypeError): - pa.array(cat, type=typ) + with pytest.warns(FutureWarning): + result = pa.array(cat, type=typ) + assert result.to_pylist() == ['a', 'b'] - # mismatching order -> raise error + # mismatching order -> raise error (for now a deprecation warning) typ = pa.dictionary( index_type=pa.int8(), value_type=pa.string(), ordered=True) - with pytest.raises(ValueError): - pa.array(cat, type=typ) + with pytest.warns(FutureWarning, match="The 'ordered' flag of the passed"): + result = pa.array(cat, type=typ) + assert result.to_pylist() == ['a', 'b'] # with mask typ = pa.dictionary(index_type=pa.int16(), value_type=pa.string()) From e4dbb2c4f7540e84953cf88d43b09f95ffad02c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 20 Nov 2019 11:24:08 +0100 Subject: [PATCH 4/5] try fix python 2 --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 93e337a4522..5ac9f555cd0 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3213,7 +3213,7 @@ def test_variable_dictionary_to_pandas(): def test_dictionary_from_pandas(): - cat = pd.Categorical(['a', 'b', 'a']) + cat = pd.Categorical([u'a', u'b', u'a']) expected_type = pa.dictionary(pa.int8(), pa.string()) result = pa.array(cat) From 39ff8e82c805681c3677888b423a4a90c5dbaee1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 20 Nov 2019 11:49:44 +0100 Subject: [PATCH 5/5] more python 2 --- python/pyarrow/tests/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 5ac9f555cd0..3476c7448d2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3221,7 +3221,7 @@ def test_dictionary_from_pandas(): assert result.type.equals(expected_type) # with missing values in categorical - cat = pd.Categorical(['a', 'b', None, 'a']) + cat = pd.Categorical([u'a', u'b', None, u'a']) result = pa.array(cat) assert result.to_pylist() == ['a', 'b', None, 'a']