Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions python/pyarrow/tests/pandas_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,43 @@ def dataframe_with_arrays():
schema = pa.Schema.from_fields(fields)

return df, schema

def dataframe_with_lists():
    """
    Dataframe with list columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    # Keep insertion order so the DataFrame columns line up with `fields`.
    arrays = OrderedDict()
    fields = []

    # list<int64> column; a None entry exercises null-list handling.
    fields.append(pa.field('int64', pa.list_(pa.int64())))
    arrays['int64'] = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4],
        None,
        [0]
    ]
    # list<double> column with the same null pattern.
    fields.append(pa.field('double', pa.list_(pa.double())))
    arrays['double'] = [
        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
        [0., 1., 2., 3., 4.],
        None,
        [0.]
    ]
    # list<string> column; includes a non-ASCII value to cover unicode.
    fields.append(pa.field('str_list', pa.list_(pa.string())))
    arrays['str_list'] = [
        [u"1", u"ä"],
        None,
        [u"1"],
        [u"1", u"2", u"3"]
    ]

    df = pd.DataFrame(arrays)
    schema = pa.Schema.from_fields(fields)

    return df, schema
14 changes: 12 additions & 2 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from pyarrow.compat import u
import pyarrow as A

from .pandas_examples import dataframe_with_arrays
from .pandas_examples import dataframe_with_arrays, dataframe_with_lists


def _alltypes_example(size=100):
Expand Down Expand Up @@ -329,7 +329,7 @@ def test_date(self):
expected['date'] = pd.to_datetime(df['date'])
tm.assert_frame_equal(result, expected)

def test_column_of_lists(self):
def test_column_of_arrays(self):
df, schema = dataframe_with_arrays()
self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
table = A.Table.from_pandas(df, schema=schema)
Expand All @@ -339,6 +339,16 @@ def test_column_of_lists(self):
field = schema.field_by_name(column)
self._check_array_roundtrip(df[column], field=field)

def test_column_of_lists(self):
    """Round-trip a DataFrame whose columns hold Python lists."""
    df, schema = dataframe_with_lists()
    self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
    table = A.Table.from_pandas(df, schema=schema)
    assert table.schema.equals(schema)

    # Each column must also survive an individual array round trip.
    for name in df.columns:
        self._check_array_roundtrip(df[name],
                                    field=schema.field_by_name(name))

def test_threaded_conversion(self):
df = _alltypes_example()
self._check_pandas_roundtrip(df, nthreads=2,
Expand Down
17 changes: 15 additions & 2 deletions python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from pyarrow.compat import guid
import pyarrow as pa
import pyarrow.io as paio
from .pandas_examples import dataframe_with_arrays
from .pandas_examples import dataframe_with_arrays, dataframe_with_lists

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -322,7 +322,7 @@ def test_compare_schemas():


@parquet
def test_column_of_lists(tmpdir):
def test_column_of_arrays(tmpdir):
df, schema = dataframe_with_arrays()

filename = tmpdir.join('pandas_rountrip.parquet')
Expand All @@ -334,6 +334,19 @@ def test_column_of_lists(tmpdir):
pdt.assert_frame_equal(df, df_read)


@parquet
def test_column_of_lists(tmpdir):
df, schema = dataframe_with_lists()

filename = tmpdir.join('pandas_rountrip.parquet')
arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
schema=schema)
pq.write_table(arrow_table, filename.strpath, version="2.0")
table_read = pq.read_table(filename.strpath)
df_read = table_read.to_pandas()
pdt.assert_frame_equal(df, df_read)


@parquet
def test_multithreaded_read():
df = alltypes_sample(size=10000)
Expand Down
27 changes: 15 additions & 12 deletions python/src/pyarrow/adapters/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,7 @@ class SeqVisitor {
};

// Non-exhaustive type inference
static Status InferArrowType(
PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
*size = PySequence_Size(obj);
if (PyErr_Occurred()) {
// Not a sequence
Expand Down Expand Up @@ -496,6 +495,19 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
return Status::OK();
}

// Append the elements of the Python sequence `obj` to `builder`, converting
// each element to the Arrow `type`.
//
// Returns NotImplemented when no converter is registered for `type`, and
// propagates any failure from converter initialization or appending.
Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
    const std::shared_ptr<ArrayBuilder>& builder) {
  std::shared_ptr<SeqConverter> converter = GetConverter(type);
  if (converter == nullptr) {
    std::stringstream ss;
    ss << "No type converter implemented for " << type->ToString();
    return Status::NotImplemented(ss.str());
  }
  // Init returns a Status; the original dropped it on the floor, which could
  // hide a converter/builder mismatch and crash later in AppendData.
  RETURN_NOT_OK(converter->Init(builder));

  return converter->AppendData(obj);
}

Status ConvertPySequence(
PyObject* obj, MemoryPool* pool, std::shared_ptr<arrow::Array>* out) {
std::shared_ptr<DataType> type;
Expand All @@ -509,19 +521,10 @@ Status ConvertPySequence(
return Status::OK();
}

std::shared_ptr<SeqConverter> converter = GetConverter(type);
if (converter == nullptr) {
std::stringstream ss;
ss << "No type converter implemented for " << type->ToString();
return Status::NotImplemented(ss.str());
}

// Give the sequence converter an array builder
std::shared_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(arrow::MakeBuilder(pool, type, &builder));
converter->Init(builder);

RETURN_NOT_OK(converter->AppendData(obj));
RETURN_NOT_OK(AppendPySequence(obj, type, builder));

return builder->Finish(out);
}
Expand Down
7 changes: 7 additions & 0 deletions python/src/pyarrow/adapters/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ class Status;

namespace pyarrow {

PYARROW_EXPORT arrow::Status InferArrowType(
PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type);

PYARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj,
const std::shared_ptr<arrow::DataType>& type,
const std::shared_ptr<arrow::ArrayBuilder>& builder);

PYARROW_EXPORT
arrow::Status ConvertPySequence(
PyObject* obj, arrow::MemoryPool* pool, std::shared_ptr<arrow::Array>* out);
Expand Down
25 changes: 23 additions & 2 deletions python/src/pyarrow/adapters/pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <Python.h>

#include "pyarrow/adapters/builtin.h"
#include "pyarrow/adapters/pandas.h"
#include "pyarrow/numpy_interop.h"

Expand Down Expand Up @@ -1621,6 +1622,7 @@ inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
typedef npy_traits<ITEM_TYPE> traits;
typedef typename traits::value_type T;
typedef typename traits::BuilderClass BuilderT;
PyAcquireGIL lock;

auto value_builder = std::make_shared<BuilderT>(pool_, field->type);
ListBuilder list_builder(pool_, value_builder);
Expand Down Expand Up @@ -1648,7 +1650,16 @@ inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
RETURN_NOT_OK(value_builder->Append(data, size));
}
} else if (PyList_Check(objects[i])) {
return Status::TypeError("Python lists are not yet supported");
int64_t size;
std::shared_ptr<arrow::DataType> type;
RETURN_NOT_OK(list_builder.Append(true));
RETURN_NOT_OK(InferArrowType(objects[i], &size, &type));
if (type->type != field->type->type) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to allow implicit conversion (like int32 to int64)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may allow this in the future, but currently we are missing a matching converter, or alternatively a convenient cast function for other integer types.

std::stringstream ss;
ss << type->ToString() << " cannot be converted to " << field->type->ToString();
return Status::TypeError(ss.str());
}
RETURN_NOT_OK(AppendPySequence(objects[i], field->type, value_builder));
} else {
return Status::TypeError("Unsupported Python type for list items");
}
Expand All @@ -1662,6 +1673,7 @@ inline Status
ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
// TODO: If there are bytes involved, convert to Binary representation
PyAcquireGIL lock;
bool have_bytes = false;

auto value_builder = std::make_shared<arrow::StringBuilder>(pool_);
Expand All @@ -1681,7 +1693,16 @@ ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
} else if (PyList_Check(objects[i])) {
return Status::TypeError("Python lists are not yet supported");
int64_t size;
std::shared_ptr<arrow::DataType> type;
RETURN_NOT_OK(list_builder.Append(true));
RETURN_NOT_OK(InferArrowType(objects[i], &size, &type));
if (type->type != Type::STRING) {
std::stringstream ss;
ss << type->ToString() << " cannot be converted to STRING.";
return Status::TypeError(ss.str());
}
RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder));
} else {
return Status::TypeError("Unsupported Python type for list items");
}
Expand Down