diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8c02785647861..121c2805be8ca 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -536,6 +536,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) +- Bug in :func:`concat` with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) - Sparse diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 72f3b402d49e3..864aa97df0587 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -705,7 +705,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde names = [None] if levels is None: - levels = [ensure_index(keys)] + levels = [ensure_index(keys).unique()] else: levels = [ensure_index(x) for x in levels] diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index aad56a1dccedc..50fee28669c58 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( DataFrame, @@ -323,3 +325,49 @@ def test_concat_multiindex_(self): {"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables) ) tm.assert_frame_equal(result_df, expected_df) + + def test_concat_with_key_not_unique(self): + # GitHub #46519 + df1 = DataFrame({"name": [1]}) + df2 = DataFrame({"name": [2]}) + df3 = DataFrame({"name": [3]}) + df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) + # the warning is caused by indexing unsorted multi-index + with tm.assert_produces_warning( + PerformanceWarning, match="indexing past lexsort depth" + ): + out_a = df_a.loc[("x", 0), :] + + df_b = DataFrame( + {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) + ) + with tm.assert_produces_warning( + PerformanceWarning, match="indexing past lexsort depth" + ): + out_b = df_b.loc[("x", 0)] + + tm.assert_frame_equal(out_a, out_b) + + df1 = DataFrame({"name": ["a", "a", "b"]}) + df2 = DataFrame({"name": ["a", "b"]}) + df3 = DataFrame({"name": ["c", "d"]}) + df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) + with tm.assert_produces_warning( + PerformanceWarning, match="indexing past lexsort depth" + ): + out_a = df_a.loc[("x", 0), :] + + df_b = DataFrame( + { + "a": ["x", "x", "x", "y", "y", "x", "x"], + "b": [0, 1, 2, 0, 1, 0, 1], + "name": list("aababcd"), + } + ).set_index(["a", "b"]) + df_b.index.names = [None, None] + with tm.assert_produces_warning( + PerformanceWarning, match="indexing past lexsort depth" + ): + out_b = df_b.loc[("x", 0), :] + + tm.assert_frame_equal(out_a, out_b)