From f5cbf8f8fbc7ddc2bd47f13b6548edd8d5e2581c Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 30 Sep 2020 00:23:36 -0700 Subject: [PATCH 1/9] Add test for pandas conversion --- python/pyarrow/tests/test_pandas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 6eae419f05a..93803ebb3e9 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2132,6 +2132,16 @@ def test_auto_chunking_on_list_overflow(self): assert len(column_a.chunk(0)) == 2**24 - 1 assert len(column_a.chunk(1)) == 1 + def test_map_array(self): + data = [[(b'a', 1), (b'b', 2)], + None, + [(b'd', 4), (b'e', 5), (b'f', None)], + [(b'g', 7)]] + + arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32())) + s = arr.to_pandas() + tm.assert_series_equal(s, pd.Series(data), check_names=False) + class TestConvertStructTypes: """ From 6c8558a71895f37a3b1c1ef0f8601aae1fe41ae9 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 30 Sep 2020 00:24:07 -0700 Subject: [PATCH 2/9] Started pandas map converter --- cpp/src/arrow/python/arrow_to_pandas.cc | 85 ++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index e4d3e793d25..1951d0f19ab 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -791,6 +791,84 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, return Status::OK(); } + +inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, + PyObject** out_values) { + // Get column of underlying map arrays + std::vector> map_arrays; + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + map_arrays.emplace_back(arr.values()); + } + const auto& map_type = checked_cast(*data.type()); + auto entry_type = map_type.value_type(); + const auto& key_type = map_type.key_type(); + /*using ListArrayType = typename ListArrayT::TypeClass; + const auto& map_type = checked_cast(*data.type()); + auto value_type = list_type.value_type(); + + if (value_type->id() == Type::DICTIONARY) { + // ARROW-6899: Convert dictionary-encoded children to dense instead of + // failing below. A more efficient conversion than this could be done later + auto dense_type = checked_cast(*value_type).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &value_arrays)); + value_type = dense_type; + }*/ + + auto flat_column = std::make_shared(map_arrays, entry_type); + // TODO(ARROW-489): Currently we don't have a Python reference for single columns. + // Storing a reference to the whole Array would be too expensive. + + // ARROW-3789(wesm): During refactoring I found that unit tests assumed that + // timestamp units would be preserved on list conversions in + // Table.to_pandas. So we set the option here to not coerce things to + // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy + // the existing unit tests + PandasOptions modified_options = options; + modified_options.coerce_temporal_nanoseconds = false; + + OwnedRefNoGIL owned_numpy_array; + RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_column, nullptr, + owned_numpy_array.ref())); + + PyObject* numpy_array = owned_numpy_array.obj(); + + int64_t chunk_offset = 0; + for (int c = 0; c < data.num_chunks(); c++) { + auto arr = std::static_pointer_cast(data.chunk(c)); + + /*const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + if (has_nulls && arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + OwnedRef start(PyLong_FromLongLong(arr->value_offset(i) + chunk_offset)); + OwnedRef end(PyLong_FromLongLong(arr->value_offset(i + 1) + chunk_offset)); + OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr)); + + if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } + *out_values = PyObject_GetItem(numpy_array, slice.obj()); + + if (*out_values == nullptr) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } + } + ++out_values; + }*/ + RETURN_IF_PYERROR(); + + chunk_offset += arr->values()->length(); + } + + return Status::OK(); +} + + template inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value, OutType* out_values) { @@ -1027,6 +1105,10 @@ struct ObjectWriterVisitor { return ConvertListsLike(options, data, out_values); } + Status Visit(const MapType& type) { + return ConvertMap(options, data, out_values); + } + Status Visit(const StructType& type) { return ConvertStruct(options, data, out_values); } @@ -1801,7 +1883,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& } break; case Type::FIXED_SIZE_LIST: case Type::LIST: - case Type::LARGE_LIST: { + case Type::LARGE_LIST: + case Type::MAP: { auto list_type = std::static_pointer_cast(data.type()); if (!ListTypeSupported(*list_type->value_type())) { return Status::NotImplemented("Not implemented type for Arrow list to pandas: ", From e85014ab43977e7b692dd294166f3251faa812e4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Fri, 2 Oct 2020 16:45:52 -0700 Subject: [PATCH 3/9] Conversion working except when item has a NULL --- cpp/src/arrow/python/arrow_to_pandas.cc | 126 +++++++++++++++++++----- python/pyarrow/tests/test_pandas.py | 2 + 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 1951d0f19ab..8b62a269a65 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -794,15 +794,20 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - // Get column of underlying map arrays - std::vector> map_arrays; - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = checked_cast(*data.chunk(c)); - map_arrays.emplace_back(arr.values()); + std::cout << "*** CONVERT_MAP" << std::endl; + // Get columns of underlying key/item arrays + std::vector> key_arrays; + std::vector> item_arrays; + for (int c = 0; c < data.num_chunks(); ++c) { + const auto& map_arr = checked_cast(*data.chunk(c)); + key_arrays.emplace_back(map_arr.keys()); + item_arrays.emplace_back(map_arr.items()); } + const auto& map_type = checked_cast(*data.type()); - auto entry_type = map_type.value_type(); + //auto entry_type = map_type.value_type(); const auto& key_type = map_type.key_type(); + const auto& item_type = map_type.item_type(); /*using ListArrayType = typename ListArrayT::TypeClass; const auto& map_type = checked_cast(*data.type()); auto value_type = list_type.value_type(); @@ -815,7 +820,8 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, value_type = dense_type; }*/ - auto flat_column = std::make_shared(map_arrays, entry_type); + auto flat_keys = std::make_shared(key_arrays, key_type); + auto flat_items = std::make_shared(item_arrays, item_type); // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be too expensive. @@ -827,39 +833,111 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; - OwnedRefNoGIL owned_numpy_array; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_column, nullptr, - owned_numpy_array.ref())); - - PyObject* numpy_array = owned_numpy_array.obj(); + std::cout << "*** starting key item conversion" << std::endl; + + OwnedRef list_item; + OwnedRef tuple_item; + OwnedRefNoGIL owned_numpy_keys; + RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr, + owned_numpy_keys.ref())); + OwnedRefNoGIL owned_numpy_items; + RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr, + owned_numpy_items.ref())); + std::cout << "*** done key item conversion" << std::endl; + PyArrayObject* keys_array = reinterpret_cast(owned_numpy_keys.obj()); + PyArrayObject* items_array = reinterpret_cast(owned_numpy_items.obj()); + + std::cout << "*** starting chunk loop" << std::endl; int64_t chunk_offset = 0; - for (int c = 0; c < data.num_chunks(); c++) { + for (int c = 0; c < data.num_chunks(); ++c) { auto arr = std::static_pointer_cast(data.chunk(c)); + const bool has_nulls = data.null_count() > 0; - /*const bool has_nulls = data.null_count() > 0; + // Make a list of key/item pairs for each row in array for (int64_t i = 0; i < arr->length(); ++i) { if (has_nulls && arr->IsNull(i)) { Py_INCREF(Py_None); *out_values = Py_None; } else { - OwnedRef start(PyLong_FromLongLong(arr->value_offset(i) + chunk_offset)); - OwnedRef end(PyLong_FromLongLong(arr->value_offset(i + 1) + chunk_offset)); - OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr)); + int64_t entry_offset = arr->value_offset(i); + int64_t num_maps = arr->value_offset(i + 1) - entry_offset; + OwnedRef key_value; + OwnedRef item_value; + + std::cout << "*** row loop " << i << std::endl; + // Build the new list object for the row of maps + list_item.reset(PyList_New(0)); //num_maps)); + RETURN_IF_PYERROR(); + + // Add each key/item pair in the row + for (int64_t j = 0; j < num_maps; ++j) { - if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) { - // Fall out of loop, will return from RETURN_IF_PYERROR - break; + std::cout << "*** map loop " << j << std::endl; + + // Build the new tuple object for key and item + tuple_item.reset(PyTuple_New(2)); + RETURN_IF_PYERROR(); + + // Get key value, key is non-nullable for a valid row + auto ptr_key = reinterpret_cast( + PyArray_GETPTR1(keys_array, chunk_offset + entry_offset + j)); + key_value.reset(PyArray_GETITEM(keys_array, ptr_key)); + RETURN_IF_PYERROR(); + + std::cout << "*** set key" << std::endl; + //PyObject* int_key = PyLong_FromLong(1L); + //PyTuple_SetItem(tuple_item.obj(), 0, int_key); + PyTuple_SetItem(tuple_item.obj(), 0, key_value.obj()); + RETURN_IF_PYERROR(); + + std::cout << "*** item len: " << flat_items->length() << ", item index: " << (entry_offset + j) << std::endl; + + if (!item_arrays[c]->IsNull(entry_offset + j)) { + // Get valid value from item array + auto ptr_item = reinterpret_cast( + PyArray_GETPTR1(items_array, chunk_offset + entry_offset + j)); + item_value.reset(PyArray_GETITEM(items_array, ptr_item)); + RETURN_IF_PYERROR(); + } else { + std::cout << "*** item is null: " << entry_offset + j << std::endl; + // Translate the Null to a None + Py_INCREF(Py_None); + item_value.reset(Py_None); + std::cout << "*** item set to null" << std::endl; + } + + std::cout << "*** set item" << std::endl; + //PyObject* int_item = PyLong_FromLong(2L); + //PyTuple_SetItem(tuple_item.obj(), 1, int_item); + PyTuple_SetItem(tuple_item.obj(), 1, item_value.obj()); + RETURN_IF_PYERROR(); + + std::cout << "*** set list" << std::endl; + //Py_INCREF(tuple_item.obj()); + // Add the key/item pair to the list for the row + PyList_Append(list_item.obj(), tuple_item.obj()); + RETURN_IF_PYERROR(); + + std::cout << "*** end map loop" << std::endl; } - *out_values = PyObject_GetItem(numpy_array, slice.obj()); - if (*out_values == nullptr) { + + std::cout << "*** assign out_values" << std::endl; + + *out_values = list_item.obj(); + // Grant ownership to the resulting array + Py_INCREF(*out_values); + + /*if (*out_values == nullptr) { // Fall out of loop, will return from RETURN_IF_PYERROR break; - } + }*/ } ++out_values; - }*/ + } + + std::cout << "*** end chunk loop" << std::endl; RETURN_IF_PYERROR(); chunk_offset += arr->values()->length(); diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 93803ebb3e9..e83a59b4a9e 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2138,6 +2138,8 @@ def test_map_array(self): [(b'd', 4), (b'e', 5), (b'f', None)], [(b'g', 7)]] + data = [[(b'a', 1)], [(b'b', 2)]] + arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32())) s = arr.to_pandas() tm.assert_series_equal(s, pd.Series(data), check_names=False) From c5feb6ca5558d17d170fb26c245bddb2ae713e56 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Sat, 3 Oct 2020 22:34:46 -0700 Subject: [PATCH 4/9] Fixed setting tuples with nulls --- cpp/src/arrow/python/arrow_to_pandas.cc | 63 ++++--------------------- 1 file changed, 10 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 8b62a269a65..dafd29711db 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -794,7 +794,6 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - std::cout << "*** CONVERT_MAP" << std::endl; // Get columns of underlying key/item arrays std::vector> key_arrays; std::vector> item_arrays; @@ -833,22 +832,18 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; - std::cout << "*** starting key item conversion" << std::endl; - OwnedRef list_item; - OwnedRef tuple_item; + OwnedRef key_value; + OwnedRef item_value; OwnedRefNoGIL owned_numpy_keys; RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr, owned_numpy_keys.ref())); OwnedRefNoGIL owned_numpy_items; RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr, owned_numpy_items.ref())); - std::cout << "*** done key item conversion" << std::endl; PyArrayObject* keys_array = reinterpret_cast(owned_numpy_keys.obj()); PyArrayObject* items_array = reinterpret_cast(owned_numpy_items.obj()); - std::cout << "*** starting chunk loop" << std::endl; - int64_t chunk_offset = 0; for (int c = 0; c < data.num_chunks(); ++c) { auto arr = std::static_pointer_cast(data.chunk(c)); @@ -862,82 +857,44 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, } else { int64_t entry_offset = arr->value_offset(i); int64_t num_maps = arr->value_offset(i + 1) - entry_offset; - OwnedRef key_value; - OwnedRef item_value; - std::cout << "*** row loop " << i << std::endl; // Build the new list object for the row of maps - list_item.reset(PyList_New(0)); //num_maps)); + list_item.reset(PyList_New(0)); RETURN_IF_PYERROR(); // Add each key/item pair in the row for (int64_t j = 0; j < num_maps; ++j) { - std::cout << "*** map loop " << j << std::endl; - - // Build the new tuple object for key and item - tuple_item.reset(PyTuple_New(2)); - RETURN_IF_PYERROR(); - // Get key value, key is non-nullable for a valid row auto ptr_key = reinterpret_cast( PyArray_GETPTR1(keys_array, chunk_offset + entry_offset + j)); key_value.reset(PyArray_GETITEM(keys_array, ptr_key)); RETURN_IF_PYERROR(); - std::cout << "*** set key" << std::endl; - //PyObject* int_key = PyLong_FromLong(1L); - //PyTuple_SetItem(tuple_item.obj(), 0, int_key); - PyTuple_SetItem(tuple_item.obj(), 0, key_value.obj()); - RETURN_IF_PYERROR(); - - std::cout << "*** item len: " << flat_items->length() << ", item index: " << (entry_offset + j) << std::endl; - - if (!item_arrays[c]->IsNull(entry_offset + j)) { + if (item_arrays[c]->IsNull(entry_offset + j)) { + // Translate the Null to a None + Py_INCREF(Py_None); + item_value.reset(Py_None); + } else { // Get valid value from item array auto ptr_item = reinterpret_cast( PyArray_GETPTR1(items_array, chunk_offset + entry_offset + j)); item_value.reset(PyArray_GETITEM(items_array, ptr_item)); RETURN_IF_PYERROR(); - } else { - std::cout << "*** item is null: " << entry_offset + j << std::endl; - // Translate the Null to a None - Py_INCREF(Py_None); - item_value.reset(Py_None); - std::cout << "*** item set to null" << std::endl; } - std::cout << "*** set item" << std::endl; - //PyObject* int_item = PyLong_FromLong(2L); - //PyTuple_SetItem(tuple_item.obj(), 1, int_item); - PyTuple_SetItem(tuple_item.obj(), 1, item_value.obj()); - RETURN_IF_PYERROR(); - - std::cout << "*** set list" << std::endl; - //Py_INCREF(tuple_item.obj()); // Add the key/item pair to the list for the row - PyList_Append(list_item.obj(), tuple_item.obj()); + PyList_Append(list_item.obj(), + PyTuple_Pack(2, key_value.obj(), item_value.obj())); RETURN_IF_PYERROR(); - - std::cout << "*** end map loop" << std::endl; } - - std::cout << "*** assign out_values" << std::endl; - *out_values = list_item.obj(); // Grant ownership to the resulting array Py_INCREF(*out_values); - - /*if (*out_values == nullptr) { - // Fall out of loop, will return from RETURN_IF_PYERROR - break; - }*/ } ++out_values; } - - std::cout << "*** end chunk loop" << std::endl; RETURN_IF_PYERROR(); chunk_offset += arr->values()->length(); From 82ead951259e22dc70b4725e4e4dfcd27a0fed22 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Sun, 4 Oct 2020 23:01:41 -0700 Subject: [PATCH 5/9] Added roundtrip and chunked tests --- python/pyarrow/tests/test_pandas.py | 37 +++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index e83a59b4a9e..70999d52704 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2132,17 +2132,46 @@ def test_auto_chunking_on_list_overflow(self): assert len(column_a.chunk(0)) == 2**24 - 1 assert len(column_a.chunk(1)) == 1 - def test_map_array(self): + def test_map_array_roundtrip(self): + data = [[(b'a', 1), (b'b', 2)], + [(b'c', 3)], + [(b'd', 4), (b'e', 5), (b'f', 6)], + [(b'g', 7)]] + + df = pd.DataFrame({"map": data}) + schema = pa.schema([("map", pa.map_(pa.binary(), pa.int32()))]) + + _check_pandas_roundtrip(df, schema=schema) + + def test_map_array_chunked(self): + data1 = [[(b'a', 1), (b'b', 2)], + [(b'c', 3)], + [(b'd', 4), (b'e', 5), (b'f', 6)], + [(b'g', 7)]] + data2 = [[(k, v * 2) for k, v in row] for row in data1] + + arr1 = pa.array(data1, type=pa.map_(pa.binary(), pa.int32())) + arr2 = pa.array(data2, type=pa.map_(pa.binary(), pa.int32())) + arr = pa.chunked_array([arr1, arr2]) + + expected = pd.Series(data1 + data2) + actual = arr.to_pandas() + tm.assert_series_equal(actual, expected, check_names=False) + + def test_map_array_with_nulls(self): data = [[(b'a', 1), (b'b', 2)], None, [(b'd', 4), (b'e', 5), (b'f', None)], [(b'g', 7)]] - data = [[(b'a', 1)], [(b'b', 2)]] + # None value in item array causes upcast to float + expected = [[(k, float(v) if v is not None else None) for k, v in row] + if row is not None else None for row in data] + expected = pd.Series(expected) arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32())) - s = arr.to_pandas() - tm.assert_series_equal(s, pd.Series(data), check_names=False) + actual = arr.to_pandas() + tm.assert_series_equal(actual, expected, check_names=False) class TestConvertStructTypes: From 43263a7007e8248114725333cc2300273299b119 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 5 Oct 2020 00:00:16 -0700 Subject: [PATCH 6/9] Fix lint errors --- cpp/src/arrow/python/arrow_to_pandas.cc | 33 ++++++++++--------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index dafd29711db..2535333e1f2 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -791,7 +791,6 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, return Status::OK(); } - inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { // Get columns of underlying key/item arrays @@ -804,13 +803,11 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, } const auto& map_type = checked_cast(*data.type()); - //auto entry_type = map_type.value_type(); const auto& key_type = map_type.key_type(); const auto& item_type = map_type.item_type(); - /*using ListArrayType = typename ListArrayT::TypeClass; - const auto& map_type = checked_cast(*data.type()); - auto value_type = list_type.value_type(); + // TODO: Is this needed for key/item? + /* if (value_type->id() == Type::DICTIONARY) { // ARROW-6899: Convert dictionary-encoded children to dense instead of // failing below. A more efficient conversion than this could be done later @@ -819,8 +816,6 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, value_type = dense_type; }*/ - auto flat_keys = std::make_shared(key_arrays, key_type); - auto flat_items = std::make_shared(item_arrays, item_type); // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be too expensive. @@ -832,6 +827,8 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, PandasOptions modified_options = options; modified_options.coerce_temporal_nanoseconds = false; + auto flat_keys = std::make_shared(key_arrays, key_type); + auto flat_items = std::make_shared(item_arrays, item_type); OwnedRef list_item; OwnedRef key_value; OwnedRef item_value; @@ -841,8 +838,8 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, OwnedRefNoGIL owned_numpy_items; RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr, owned_numpy_items.ref())); - PyArrayObject* keys_array = reinterpret_cast(owned_numpy_keys.obj()); - PyArrayObject* items_array = reinterpret_cast(owned_numpy_items.obj()); + PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); + PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); int64_t chunk_offset = 0; for (int c = 0; c < data.num_chunks(); ++c) { @@ -861,14 +858,13 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, // Build the new list object for the row of maps list_item.reset(PyList_New(0)); RETURN_IF_PYERROR(); - + // Add each key/item pair in the row for (int64_t j = 0; j < num_maps; ++j) { - // Get key value, key is non-nullable for a valid row auto ptr_key = reinterpret_cast( - PyArray_GETPTR1(keys_array, chunk_offset + entry_offset + j)); - key_value.reset(PyArray_GETITEM(keys_array, ptr_key)); + PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j)); + key_value.reset(PyArray_GETITEM(py_keys, ptr_key)); RETURN_IF_PYERROR(); if (item_arrays[c]->IsNull(entry_offset + j)) { @@ -878,8 +874,8 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, } else { // Get valid value from item array auto ptr_item = reinterpret_cast( - PyArray_GETPTR1(items_array, chunk_offset + entry_offset + j)); - item_value.reset(PyArray_GETITEM(items_array, ptr_item)); + PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j)); + item_value.reset(PyArray_GETITEM(py_items, ptr_item)); RETURN_IF_PYERROR(); } @@ -903,7 +899,6 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, return Status::OK(); } - template inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value, OutType* out_values) { @@ -1140,9 +1135,7 @@ struct ObjectWriterVisitor { return ConvertListsLike(options, data, out_values); } - Status Visit(const MapType& type) { - return ConvertMap(options, data, out_values); - } + Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); } Status Visit(const StructType& type) { return ConvertStruct(options, data, out_values); @@ -1918,7 +1911,7 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& } break; case Type::FIXED_SIZE_LIST: case Type::LIST: - case Type::LARGE_LIST: + case Type::LARGE_LIST: case Type::MAP: { auto list_type = std::static_pointer_cast(data.type()); if (!ListTypeSupported(*list_type->value_type())) { From c469dcb264265fe7f74dd2e88c7c84ba23fad4ab Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 5 Oct 2020 10:44:17 -0700 Subject: [PATCH 7/9] Handle dictionary encoded arrays --- cpp/src/arrow/python/arrow_to_pandas.cc | 27 ++++++++++++++----------- python/pyarrow/tests/test_pandas.py | 13 ++++++++++++ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 2535333e1f2..012e72fd07c 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -803,18 +803,21 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, } const auto& map_type = checked_cast(*data.type()); - const auto& key_type = map_type.key_type(); - const auto& item_type = map_type.item_type(); - - // TODO: Is this needed for key/item? - /* - if (value_type->id() == Type::DICTIONARY) { - // ARROW-6899: Convert dictionary-encoded children to dense instead of - // failing below. A more efficient conversion than this could be done later - auto dense_type = checked_cast(*value_type).value_type(); - RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &value_arrays)); - value_type = dense_type; - }*/ + auto key_type = map_type.key_type(); + auto item_type = map_type.item_type(); + + // ARROW-6899: Convert dictionary-encoded children to dense instead of + // failing below. A more efficient conversion than this could be done later + if (key_type->id() == Type::DICTIONARY) { + auto dense_type = checked_cast(*key_type).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays)); + key_type = dense_type; + } + if (item_type->id() == Type::DICTIONARY) { + auto dense_type = checked_cast(*item_type).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays)); + item_type = dense_type; + } // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be too expensive. diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 70999d52704..0e01dc08ef6 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -2173,6 +2173,19 @@ def test_map_array_with_nulls(self): actual = arr.to_pandas() tm.assert_series_equal(actual, expected, check_names=False) + def test_map_array_dictionary_encoded(self): + offsets = pa.array([0, 3, 5]) + items = pa.array(['a', 'b', 'c', 'a', 'd']).dictionary_encode() + keys = pa.array(list(range(len(items)))) + arr = pa.MapArray.from_arrays(offsets, keys, items) + + # Dictionary encoded values converted to dense + expected = pd.Series( + [[(0, 'a'), (1, 'b'), (2, 'c')], [(3, 'a'), (4, 'd')]]) + + actual = arr.to_pandas() + tm.assert_series_equal(actual, expected, check_names=False) + class TestConvertStructTypes: """ From c2a804ef9525bc0c16c364ce2035895fe53e0d81 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 7 Oct 2020 00:14:06 -0700 Subject: [PATCH 8/9] Address feedback, rebased with recent changes --- cpp/src/arrow/python/arrow_to_pandas.cc | 44 +++++++++++-------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 012e72fd07c..910feb8b23d 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -791,8 +791,8 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, - PyObject** out_values) { +Status ConvertMap(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { // Get columns of underlying key/item arrays std::vector> key_arrays; std::vector> item_arrays; @@ -819,16 +819,10 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, item_type = dense_type; } - // TODO(ARROW-489): Currently we don't have a Python reference for single columns. - // Storing a reference to the whole Array would be too expensive. - - // ARROW-3789(wesm): During refactoring I found that unit tests assumed that - // timestamp units would be preserved on list conversions in - // Table.to_pandas. So we set the option here to not coerce things to - // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy - // the existing unit tests - PandasOptions modified_options = options; - modified_options.coerce_temporal_nanoseconds = false; + // See notes in MakeInnerOptions. + options = MakeInnerOptions(std::move(options)); + // Don't blindly convert because timestamps in lists are handled differently. + options.timestamp_as_object = true; auto flat_keys = std::make_shared(key_arrays, key_type); auto flat_items = std::make_shared(item_arrays, item_type); @@ -836,30 +830,30 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, OwnedRef key_value; OwnedRef item_value; OwnedRefNoGIL owned_numpy_keys; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_keys, nullptr, + RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref())); OwnedRefNoGIL owned_numpy_items; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_items, nullptr, + RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref())); PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); int64_t chunk_offset = 0; for (int c = 0; c < data.num_chunks(); ++c) { - auto arr = std::static_pointer_cast(data.chunk(c)); + const auto& arr = checked_cast(*data.chunk(c)); const bool has_nulls = data.null_count() > 0; // Make a list of key/item pairs for each row in array - for (int64_t i = 0; i < arr->length(); ++i) { - if (has_nulls && arr->IsNull(i)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { Py_INCREF(Py_None); *out_values = Py_None; } else { - int64_t entry_offset = arr->value_offset(i); - int64_t num_maps = arr->value_offset(i + 1) - entry_offset; + int64_t entry_offset = arr.value_offset(i); + int64_t num_maps = arr.value_offset(i + 1) - entry_offset; // Build the new list object for the row of maps - list_item.reset(PyList_New(0)); + list_item.reset(PyList_New(num_maps)); RETURN_IF_PYERROR(); // Add each key/item pair in the row @@ -883,20 +877,20 @@ inline Status ConvertMap(const PandasOptions& options, const ChunkedArray& data, } // Add the key/item pair to the list for the row - PyList_Append(list_item.obj(), - PyTuple_Pack(2, key_value.obj(), item_value.obj())); + PyList_SET_ITEM(list_item.obj(), j, + PyTuple_Pack(2, key_value.obj(), item_value.obj())); RETURN_IF_PYERROR(); } *out_values = list_item.obj(); - // Grant ownership to the resulting array - Py_INCREF(*out_values); + // Release ownership to the resulting array + list_item.detach(); } ++out_values; } RETURN_IF_PYERROR(); - chunk_offset += arr->values()->length(); + chunk_offset += arr.values()->length(); } return Status::OK(); From 090177e4ebe7c7c71f3460ad598c7ec5bd661ac5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 7 Oct 2020 11:32:02 +0200 Subject: [PATCH 9/9] Nits & lint --- cpp/src/arrow/python/arrow_to_pandas.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 910feb8b23d..be27a108dd1 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -830,11 +830,11 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data, OwnedRef key_value; OwnedRef item_value; OwnedRefNoGIL owned_numpy_keys; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_keys, nullptr, - owned_numpy_keys.ref())); + RETURN_NOT_OK( + ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref())); OwnedRefNoGIL owned_numpy_items; - RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_items, nullptr, - owned_numpy_items.ref())); + RETURN_NOT_OK( + ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref())); PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); @@ -882,9 +882,8 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data, RETURN_IF_PYERROR(); } - *out_values = list_item.obj(); - // Release ownership to the resulting array - list_item.detach(); + // Pass ownership to the resulting array + *out_values = list_item.detach(); } ++out_values; }