# Patch summary (commentary only; text before the first "diff --git" header
# is not part of any hunk).
#
# python/pyarrow/table.pxi, _dataframe_to_arrays:
#   * The list `index_levels` is renamed to `index_columns`.
#   * When `preserve_index` is set, the list used to be filled from
#     `df.index.levels` (falling back to `[df.index]` when the attribute is
#     missing). It is now filled with `df.index.get_level_values(i)` for each
#     of the `n` levels, where `n` is the length of that same
#     levels-or-fallback list.
#   * Two `Py_ssize_t` locals, `i` and `n`, are declared for this.
#   * The later loop and the `pdcompat.construct_metadata` call are updated to
#     use `index_columns`.
#   NOTE(review): presumably the motivation is that `MultiIndex.levels` holds
#   only the distinct labels of each level, whereas `get_level_values` yields
#   one label per row -- confirm against the pandas documentation.
#
# python/pyarrow/tests/test_convert_pandas.py:
#   * Adds `test_multiindex_duplicate_values`, which builds a DataFrame whose
#     two-level MultiIndex repeats 'foo' in its first level, converts it with
#     `pa.Table.from_pandas`, converts back with `to_pandas`, and asserts the
#     frames are equal.
#
# python/pyarrow/tests/test_parquet.py:
#   * Adds a function of the same name that additionally writes the table
#     with `_write_table`, reads it with `_read_table`, and asserts both table
#     equality and frame equality after the round trip.
#
# NOTE(review): the line structure below was restored from a
# whitespace-collapsed copy of the patch. Indentation of unchanged context
# lines was reconstructed from syntax -- verify against the target files
# before applying.
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 3f67ba40c73..7d44f2e46ee 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -320,12 +320,15 @@ cdef tuple _dataframe_to_arrays(
     cdef:
         list names = []
         list arrays = []
-        list index_levels = []
+        list index_columns = []
         DataType type = None
         dict metadata
+        Py_ssize_t i
+        Py_ssize_t n
 
     if preserve_index:
-        index_levels.extend(getattr(df.index, 'levels', [df.index]))
+        n = len(getattr(df.index, 'levels', [df.index]))
+        index_columns.extend(df.index.get_level_values(i) for i in range(n))
 
     for name in df.columns:
         col = df[name]
@@ -339,13 +342,13 @@ cdef tuple _dataframe_to_arrays(
         )
         names.append(name)
 
-    for i, level in enumerate(index_levels):
+    for i, column in enumerate(index_columns):
         arrays.append(
-            Array.from_pandas(level, timestamps_to_ms=timestamps_to_ms)
+            Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
         )
-        names.append(pdcompat.index_level_name(level, i))
+        names.append(pdcompat.index_level_name(column, i))
 
-    metadata = pdcompat.construct_metadata(df, index_levels, preserve_index)
+    metadata = pdcompat.construct_metadata(df, index_columns, preserve_index)
     return names, arrays, metadata
 
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index f6ada09cc2c..49b7eb73df0 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -640,3 +640,17 @@ def _check_series(s):
         _check_series(pd.Series([None] * 3, dtype=object))
         _check_series(pd.Series([np.nan] * 3, dtype=object))
         _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))
+
+    def test_multiindex_duplicate_values(self):
+        num_rows = 3
+        numbers = list(range(num_rows))
+        index = pd.MultiIndex.from_arrays(
+            [['foo', 'foo', 'bar'], numbers],
+            names=['foobar', 'some_numbers'],
+        )
+
+        df = pd.DataFrame({'numbers': numbers}, index=index)
+
+        table = pa.Table.from_pandas(df)
+        result_df = table.to_pandas()
+        tm.assert_frame_equal(result_df, df)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 1cf54959da0..4e8f0db046c 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -847,3 +847,24 @@ def read_multiple_files(paths, columns=None, nthreads=None, **kwargs):
 
     with pytest.raises(ValueError):
         read_multiple_files(mixed_paths)
+
+
+def test_multiindex_duplicate_values(tmpdir):
+    num_rows = 3
+    numbers = list(range(num_rows))
+    index = pd.MultiIndex.from_arrays(
+        [['foo', 'foo', 'bar'], numbers],
+        names=['foobar', 'some_numbers'],
+    )
+
+    df = pd.DataFrame({'numbers': numbers}, index=index)
+    table = pa.Table.from_pandas(df)
+
+    filename = tmpdir.join('dup_multi_index_levels.parquet').strpath
+
+    _write_table(table, filename)
+    result_table = _read_table(filename)
+    assert table.equals(result_table)
+
+    result_df = result_table.to_pandas()
+    tm.assert_frame_equal(result_df, df)