diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 9b2a5c4c60d..cd7ad477826 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -155,7 +155,7 @@ def index_level_name(index, i): return '__index_level_{:d}__'.format(i) -def construct_metadata(df, index_levels, preserve_index, types): +def construct_metadata(df, column_names, index_levels, preserve_index, types): """Returns a dictionary containing enough metadata to reconstruct a pandas DataFrame as an Arrow Table, including index columns. @@ -170,41 +170,77 @@ def construct_metadata(df, index_levels, preserve_index, types): ------- dict """ - ncolumns = len(df.columns) + ncolumns = len(column_names) df_types = types[:ncolumns] index_types = types[ncolumns:ncolumns + len(index_levels)] + + column_metadata = [ + get_column_metadata(df[col_name], name=sanitized_name, + arrow_type=arrow_type) + for col_name, sanitized_name, arrow_type in + zip(df.columns, column_names, df_types) + ] + + if preserve_index: + index_column_names = [index_level_name(level, i) + for i, level in enumerate(index_levels)] + index_column_metadata = [ + get_column_metadata(level, name=index_level_name(level, i), + arrow_type=arrow_type) + for i, (level, arrow_type) in enumerate(zip(index_levels, + index_types)) + ] + else: + index_column_names = index_column_metadata = [] + return { - b'pandas': json.dumps( - { - 'index_columns': [ - index_level_name(level, i) - for i, level in enumerate(index_levels) - ] if preserve_index else [], - 'columns': [ - get_column_metadata( - df[name], - name=name, - arrow_type=arrow_type - ) - for name, arrow_type in zip(df.columns, df_types) - ] + ( - [ - get_column_metadata( - level, - name=index_level_name(level, i), - arrow_type=arrow_type - ) - for i, (level, arrow_type) in enumerate( - zip(index_levels, index_types) - ) - ] if preserve_index else [] - ), - 'pandas_version': pd.__version__, - } - ).encode('utf8') + b'pandas': json.dumps({ + 'index_columns': index_column_names, + 'columns': column_metadata + index_column_metadata, + 'pandas_version': pd.__version__ + }).encode('utf8') } +def dataframe_to_arrays(df, timestamps_to_ms, schema, preserve_index): + names = [] + arrays = [] + index_columns = [] + types = [] + type = None + + if preserve_index: + n = len(getattr(df.index, 'levels', [df.index])) + index_columns.extend(df.index.get_level_values(i) for i in range(n)) + + for name in df.columns: + col = df[name] + if not isinstance(name, six.string_types): + name = str(name) + + if schema is not None: + field = schema.field_by_name(name) + type = getattr(field, "type", None) + + array = pa.Array.from_pandas( + col, type=type, timestamps_to_ms=timestamps_to_ms + ) + arrays.append(array) + names.append(name) + types.append(array.type) + + for i, column in enumerate(index_columns): + array = pa.Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms) + arrays.append(array) + names.append(index_level_name(column, i)) + types.append(array.type) + + metadata = construct_metadata( + df, names, index_columns, preserve_index, types + ) + return names, arrays, metadata + + def table_to_blockmanager(table, nthreads=1): import pandas.core.internals as _int from pyarrow.compat import DatetimeTZDtype diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index a9cb06480cd..c1d5a50d487 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -317,51 +317,6 @@ cdef int _schema_from_arrays( return 0 -cdef tuple _dataframe_to_arrays( - df, - bint timestamps_to_ms, - Schema schema, - bint preserve_index -): - cdef: - list names = [] - list arrays = [] - list index_columns = [] - list types = [] - DataType type = None - dict metadata - Py_ssize_t i - Py_ssize_t n - - if preserve_index: - n = len(getattr(df.index, 'levels', [df.index])) - index_columns.extend(df.index.get_level_values(i) for i in range(n)) - - for name in df.columns: - col = df[name] - if schema is not None: - field = schema.field_by_name(name) - type = getattr(field, "type", None) - - array = Array.from_pandas( - col, type=type, timestamps_to_ms=timestamps_to_ms - ) - arrays.append(array) - names.append(name) - types.append(array.type) - - for i, column in enumerate(index_columns): - array = Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms) - arrays.append(array) - names.append(pdcompat.index_level_name(column, i)) - types.append(array.type) - - metadata = pdcompat.construct_metadata( - df, index_columns, preserve_index, types - ) - return names, arrays, metadata - - cdef class RecordBatch: """ Batch of rows of columns of equal length @@ -570,7 +525,7 @@ cdef class RecordBatch: ------- pyarrow.RecordBatch """ - names, arrays, metadata = _dataframe_to_arrays( + names, arrays, metadata = pdcompat.dataframe_to_arrays( df, False, schema, preserve_index ) return cls.from_arrays(arrays, names, metadata) @@ -748,7 +703,7 @@ cdef class Table: >>> pa.Table.from_pandas(df) """ - names, arrays, metadata = _dataframe_to_arrays( + names, arrays, metadata = pdcompat.dataframe_to_arrays( df, timestamps_to_ms=timestamps_to_ms, schema=schema, diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 43e0bad5e3d..d4886585633 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -109,6 +109,11 @@ def test_all_none_category(self): df['a'] = df['a'].astype('category') self._check_pandas_roundtrip(df) + def test_non_string_columns(self): + df = pd.DataFrame({0: [1, 2, 3]}) + table = pa.Table.from_pandas(df) + assert table.column(0).name == '0' + def test_float_no_nulls(self): data = {} fields = [] diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index bcaca6df777..3ad369c31f4 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -360,7 +360,7 @@ def test_pandas_serialize_round_trip_multi_index(): @pytest.mark.xfail( - raises=TypeError, + raises=AssertionError, reason='Non string columns are not supported', ) def test_pandas_serialize_round_trip_not_string_columns():