diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index 63af4234802..c9343fce233 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -76,3 +76,43 @@ def dataframe_with_arrays(): schema = pa.Schema.from_fields(fields) return df, schema + +def dataframe_with_lists(): + """ + Dataframe with list columns of every possible primtive type. + + Returns + ------- + df: pandas.DataFrame + schema: pyarrow.Schema + Arrow schema definition that is in line with the constructed df. + """ + arrays = OrderedDict() + fields = [] + + fields.append(pa.field('int64', pa.list_(pa.int64()))) + arrays['int64'] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [0, 1, 2, 3, 4], + None, + [0] + ] + fields.append(pa.field('double', pa.list_(pa.double()))) + arrays['double'] = [ + [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], + [0., 1., 2., 3., 4.], + None, + [0.] + ] + fields.append(pa.field('str_list', pa.list_(pa.string()))) + arrays['str_list'] = [ + [u"1", u"รค"], + None, + [u"1"], + [u"1", u"2", u"3"] + ] + + df = pd.DataFrame(arrays) + schema = pa.Schema.from_fields(fields) + + return df, schema diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 960653dca27..9821c224bc1 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -30,7 +30,7 @@ from pyarrow.compat import u import pyarrow as A -from .pandas_examples import dataframe_with_arrays +from .pandas_examples import dataframe_with_arrays, dataframe_with_lists def _alltypes_example(size=100): @@ -329,7 +329,7 @@ def test_date(self): expected['date'] = pd.to_datetime(df['date']) tm.assert_frame_equal(result, expected) - def test_column_of_lists(self): + def test_column_of_arrays(self): df, schema = dataframe_with_arrays() self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) table = A.Table.from_pandas(df, schema=schema) @@ -339,6 +339,16 @@ def test_column_of_lists(self): field = schema.field_by_name(column) self._check_array_roundtrip(df[column], field=field) + def test_column_of_lists(self): + df, schema = dataframe_with_lists() + self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) + table = A.Table.from_pandas(df, schema=schema) + assert table.schema.equals(schema) + + for column in df.columns: + field = schema.field_by_name(column) + self._check_array_roundtrip(df[column], field=field) + def test_threaded_conversion(self): df = _alltypes_example() self._check_pandas_roundtrip(df, nthreads=2, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 96f2d15e312..c72ff9e862b 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -23,7 +23,7 @@ from pyarrow.compat import guid import pyarrow as pa import pyarrow.io as paio -from .pandas_examples import dataframe_with_arrays +from .pandas_examples import dataframe_with_arrays, dataframe_with_lists import numpy as np import pandas as pd @@ -322,7 +322,7 @@ def test_compare_schemas(): @parquet -def test_column_of_lists(tmpdir): +def test_column_of_arrays(tmpdir): df, schema = dataframe_with_arrays() filename = tmpdir.join('pandas_rountrip.parquet') @@ -334,6 +334,19 @@ def test_column_of_lists(tmpdir): pdt.assert_frame_equal(df, df_read) +@parquet +def test_column_of_lists(tmpdir): + df, schema = dataframe_with_lists() + + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, + schema=schema) + pq.write_table(arrow_table, filename.strpath, version="2.0") + table_read = pq.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) + + @parquet def test_multithreaded_read(): df = alltypes_sample(size=10000) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index c125cc078af..4f7b2cb09e1 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -206,8 +206,7 @@ class SeqVisitor { }; // Non-exhaustive type inference -static Status InferArrowType( - PyObject* obj, int64_t* size, std::shared_ptr* out_type) { +Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr* out_type) { *size = PySequence_Size(obj); if (PyErr_Occurred()) { // Not a sequence @@ -496,6 +495,19 @@ Status ListConverter::Init(const std::shared_ptr& builder) { return Status::OK(); } +Status AppendPySequence(PyObject* obj, const std::shared_ptr& type, + const std::shared_ptr& builder) { + std::shared_ptr converter = GetConverter(type); + if (converter == nullptr) { + std::stringstream ss; + ss << "No type converter implemented for " << type->ToString(); + return Status::NotImplemented(ss.str()); + } + converter->Init(builder); + + return converter->AppendData(obj); +} + Status ConvertPySequence( PyObject* obj, MemoryPool* pool, std::shared_ptr* out) { std::shared_ptr type; @@ -509,19 +521,10 @@ Status ConvertPySequence( return Status::OK(); } - std::shared_ptr converter = GetConverter(type); - if (converter == nullptr) { - std::stringstream ss; - ss << "No type converter implemented for " << type->ToString(); - return Status::NotImplemented(ss.str()); - } - // Give the sequence converter an array builder std::shared_ptr builder; RETURN_NOT_OK(arrow::MakeBuilder(pool, type, &builder)); - converter->Init(builder); - - RETURN_NOT_OK(converter->AppendData(obj)); + RETURN_NOT_OK(AppendPySequence(obj, type, builder)); return builder->Finish(out); } diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 667298e3c5c..0c863a5631a 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -37,6 +37,13 @@ class Status; namespace pyarrow { +PYARROW_EXPORT arrow::Status InferArrowType( + PyObject* obj, int64_t* size, std::shared_ptr* out_type); + +PYARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, + const std::shared_ptr& type, + const std::shared_ptr& builder); + PYARROW_EXPORT arrow::Status ConvertPySequence( PyObject* obj, arrow::MemoryPool* pool, std::shared_ptr* out); diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index cadb53e0d2a..0715742ab7a 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -19,6 +19,7 @@ #include +#include "pyarrow/adapters/builtin.h" #include "pyarrow/adapters/pandas.h" #include "pyarrow/numpy_interop.h" @@ -1621,6 +1622,7 @@ inline Status ArrowSerializer::ConvertTypedLists( typedef npy_traits traits; typedef typename traits::value_type T; typedef typename traits::BuilderClass BuilderT; + PyAcquireGIL lock; auto value_builder = std::make_shared(pool_, field->type); ListBuilder list_builder(pool_, value_builder); @@ -1648,7 +1650,16 @@ inline Status ArrowSerializer::ConvertTypedLists( RETURN_NOT_OK(value_builder->Append(data, size)); } } else if (PyList_Check(objects[i])) { - return Status::TypeError("Python lists are not yet supported"); + int64_t size; + std::shared_ptr type; + RETURN_NOT_OK(list_builder.Append(true)); + RETURN_NOT_OK(InferArrowType(objects[i], &size, &type)); + if (type->type != field->type->type) { + std::stringstream ss; + ss << type->ToString() << " cannot be converted to " << field->type->ToString(); + return Status::TypeError(ss.str()); + } + RETURN_NOT_OK(AppendPySequence(objects[i], field->type, value_builder)); } else { return Status::TypeError("Unsupported Python type for list items"); } @@ -1662,6 +1673,7 @@ inline Status ArrowSerializer::ConvertTypedLists( const std::shared_ptr& field, std::shared_ptr* out) { // TODO: If there are bytes involed, convert to Binary representation + PyAcquireGIL lock; bool have_bytes = false; auto value_builder = std::make_shared(pool_); @@ -1681,7 +1693,16 @@ ArrowSerializer::ConvertTypedLists( auto data = reinterpret_cast(PyArray_DATA(numpy_array)); RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes)); } else if (PyList_Check(objects[i])) { - return Status::TypeError("Python lists are not yet supported"); + int64_t size; + std::shared_ptr type; + RETURN_NOT_OK(list_builder.Append(true)); + RETURN_NOT_OK(InferArrowType(objects[i], &size, &type)); + if (type->type != Type::STRING) { + std::stringstream ss; + ss << type->ToString() << " cannot be converted to STRING."; + return Status::TypeError(ss.str()); + } + RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder)); } else { return Status::TypeError("Unsupported Python type for list items"); }