From c624df52b95e24deb911c6287daaf5cbe72e0751 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 14:11:12 -0500 Subject: [PATCH 01/15] Update concat.py --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 72f3b402d49e3..864aa97df0587 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -705,7 +705,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde names = [None] if levels is None: - levels = [ensure_index(keys)] + levels = [ensure_index(keys).unique()] else: levels = [ensure_index(x) for x in levels] From 58b2bb47fee7b1a1bcbf06710165554b62bd3fab Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 14:17:31 -0500 Subject: [PATCH 02/15] Update v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8c02785647861..71929b6314d02 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -536,7 +536,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) -- +- Bug in concanenation with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) Sparse ^^^^^^ From 2e4317c9470a3f500d8469ee13878c8445a86d6b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 14:45:48 -0500 Subject: [PATCH 
03/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index aad56a1dccedc..813f156c71976 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -323,3 +323,27 @@ def test_concat_multiindex_(self): {"col": ["a", "b", "c"]}, index=MultiIndex.from_product(iterables) ) tm.assert_frame_equal(result_df, expected_df) + + @pytest.mark.parametrize( + "keys", + [["x", "y", "x"]], + ) + def test_concat_with_key_not_unique( + self, keys: list, + ): + # GitHub #46519 + df1 = DataFrame({'name': [1]}) + df2 = DataFrame({'name': [2]}) + df3 = DataFrame({"name": [3]}) + df_a = pd.concat([df1, df2, df3], keys=keys) + out_a = df_a.loc[("x", 0), :] + + df_b = pd.DataFrame( + { + "name": [1, 2, 3] + }, + index=Index([("x", 0), ("y", 0), ("x", 0)]) + ) + out_b = df_b.loc[("x", 0)] + + tm.assert_frame_equal(out_a, out_b) From ef8337a3234b2689e80d569d325c65c1cdc09d91 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 18:04:53 -0500 Subject: [PATCH 04/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 813f156c71976..e5a857687afb8 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -329,21 +329,21 @@ def test_concat_multiindex_(self): [["x", "y", "x"]], ) def test_concat_with_key_not_unique( - self, keys: list, + self, + keys: list, ): # GitHub #46519 - df1 = DataFrame({'name': [1]}) - df2 = DataFrame({'name': [2]}) + df1 = DataFrame({"name": [1]}) + df2 = DataFrame({"name": [2]}) df3 = DataFrame({"name": [3]}) - df_a = pd.concat([df1, df2, df3], 
keys=keys) - out_a = df_a.loc[("x", 0), :] + df_a = concat([df1, df2, df3], keys=keys) + with tm.assert_produces_warning(PerformanceWarning): + out_a = df_a.loc[("x", 0), :] - df_b = pd.DataFrame( - { - "name": [1, 2, 3] - }, - index=Index([("x", 0), ("y", 0), ("x", 0)]) + df_b = DataFrame( + {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) ) - out_b = df_b.loc[("x", 0)] + with tm.assert_produces_warning(PerformanceWarning): + out_b = df_b.loc[("x", 0)] tm.assert_frame_equal(out_a, out_b) From 5c8e7dbc05637821ae21c574a7ff6ba2bb65cbfe Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 19:01:19 -0500 Subject: [PATCH 05/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index e5a857687afb8..408c46e384440 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -9,6 +9,7 @@ Series, concat, ) +from pandas.errors import PerformanceWarning import pandas._testing as tm From 7330ddbef51ae9d601146675d5ea26e8894576a6 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 19:57:15 -0500 Subject: [PATCH 06/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 408c46e384440..b5e7955839f01 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -348,3 +348,23 @@ def test_concat_with_key_not_unique( out_b = df_b.loc[("x", 0)] tm.assert_frame_equal(out_a, out_b) + + df1 = DataFrame({"name": ["a", "a", "b"]}) + df2 = DataFrame({"name": ["a", "b"]}) + df3 = DataFrame({"name": ["c", "d"]}) + df_a = concat([df1, df2, df3], 
keys=keys) + with tm.assert_produces_warning(PerformanceWarning): + out_a = df_a.loc[("x", 0), :] + + df_b = DataFrame( + { + "a": ["x", "x", "x", "y", "y", "x", "x"], + "b": [0, 1, 2, 0, 1, 0, 1], + "name": list("aababcd"), + } + ).set_index(["a", "b"]) + df_b.index.names = [None, None] + with tm.assert_produces_warning(PerformanceWarning): + out_b = df_b.loc[("x", 0), :] + + tm.assert_frame_equal(out_a, out_b) From 1e1f525830e89824cbd9bdfb09299f3a45aaa188 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 19:59:12 -0500 Subject: [PATCH 07/15] Update concat.py --- pandas/core/reshape/concat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 864aa97df0587..7c6ace51027e1 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -709,6 +709,10 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde else: levels = [ensure_index(x) for x in levels] + for level in levels: + if not level.is_unique: + raise ValueError(f"Level values not unique: {level.tolist()}") + if not all_indexes_same(indexes) or not all(level.is_unique for level in levels): codes_list = [] From 6d059b45569a736ecb6bfccc9c4f237193a17401 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 20:00:42 -0500 Subject: [PATCH 08/15] Update v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 71929b6314d02..121c2805be8ca 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -536,7 +536,8 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another 
:class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) -- Bug in concanenation with identical key leads to error when indexing :class:`MultiIndex` (:issue:`46519`) +- Bug in :func:`concat` where duplicate ``keys`` led to an error when indexing the resulting :class:`MultiIndex` (:issue:`46519`) +- Sparse ^^^^^^ From 3c43cea41b5c24eb433a8544a5254b25e31a317f Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 20:08:04 -0500 Subject: [PATCH 09/15] Update concat.py --- pandas/core/reshape/concat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 7c6ace51027e1..864aa97df0587 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -709,10 +709,6 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde else: levels = [ensure_index(x) for x in levels] - for level in levels: - if not level.is_unique: - raise ValueError(f"Level values not unique: {level.tolist()}") - if not all_indexes_same(indexes) or not all(level.is_unique for level in levels): codes_list = [] From 4c91709d6996d3b50da85afb5a94ab4b64d1e0b0 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 28 Mar 2022 20:38:07 -0500 Subject: [PATCH 10/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index b5e7955839f01..3f11110f0baef 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( DataFrame, @@ -9,7
+11,6 @@ Series, concat, ) -from pandas.errors import PerformanceWarning import pandas._testing as tm From 742921728683e56e69b2313b11e9c57a93534fd7 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 29 Mar 2022 10:17:51 -0500 Subject: [PATCH 11/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 3f11110f0baef..1b6a29b01d0e2 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -326,10 +326,6 @@ def test_concat_multiindex_(self): ) tm.assert_frame_equal(result_df, expected_df) - @pytest.mark.parametrize( - "keys", - [["x", "y", "x"]], - ) def test_concat_with_key_not_unique( self, keys: list, @@ -338,7 +334,7 @@ def test_concat_with_key_not_unique( df1 = DataFrame({"name": [1]}) df2 = DataFrame({"name": [2]}) df3 = DataFrame({"name": [3]}) - df_a = concat([df1, df2, df3], keys=keys) + df_a = concat([df1, df2, df3], keys=[["x", "y", "x"]]) with tm.assert_produces_warning(PerformanceWarning): out_a = df_a.loc[("x", 0), :] @@ -353,7 +349,7 @@ def test_concat_with_key_not_unique( df1 = DataFrame({"name": ["a", "a", "b"]}) df2 = DataFrame({"name": ["a", "b"]}) df3 = DataFrame({"name": ["c", "d"]}) - df_a = concat([df1, df2, df3], keys=keys) + df_a = concat([df1, df2, df3], keys=[["x", "y", "x"]]) with tm.assert_produces_warning(PerformanceWarning): out_a = df_a.loc[("x", 0), :] From 3cfad33b47da2bc70b32b50a7f4f8a91d2a3f1d4 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:04:57 -0500 Subject: [PATCH 12/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py 
index 1b6a29b01d0e2..26187a76846f8 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -326,10 +326,7 @@ def test_concat_multiindex_(self): ) tm.assert_frame_equal(result_df, expected_df) - def test_concat_with_key_not_unique( - self, - keys: list, - ): + def test_concat_with_key_not_unique(self): # GitHub #46519 df1 = DataFrame({"name": [1]}) df2 = DataFrame({"name": [2]}) From 6ff64391256cc98d3918de63115d034266de1a1b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:40:25 -0500 Subject: [PATCH 13/15] Update test_index.py --- pandas/tests/reshape/concat/test_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 26187a76846f8..6ddbd943262ac 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -331,7 +331,7 @@ def test_concat_with_key_not_unique(self): df1 = DataFrame({"name": [1]}) df2 = DataFrame({"name": [2]}) df3 = DataFrame({"name": [3]}) - df_a = concat([df1, df2, df3], keys=[["x", "y", "x"]]) + df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) with tm.assert_produces_warning(PerformanceWarning): out_a = df_a.loc[("x", 0), :] @@ -346,7 +346,7 @@ def test_concat_with_key_not_unique(self): df1 = DataFrame({"name": ["a", "a", "b"]}) df2 = DataFrame({"name": ["a", "b"]}) df3 = DataFrame({"name": ["c", "d"]}) - df_a = concat([df1, df2, df3], keys=[["x", "y", "x"]]) + df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) with tm.assert_produces_warning(PerformanceWarning): out_a = df_a.loc[("x", 0), :] From f2fe384ebf5e7a3c088cd4271320653a6fd3e02b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 4 Apr 2022 15:21:31 -0500 Subject: [PATCH 14/15] add warning message check --- pandas/tests/reshape/concat/test_index.py | 21 
+++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 6ddbd943262ac..04969a1b13646 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -332,13 +332,20 @@ def test_concat_with_key_not_unique(self): df2 = DataFrame({"name": [2]}) df3 = DataFrame({"name": [3]}) df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) - with tm.assert_produces_warning(PerformanceWarning): + # the warning is caused by indexing unsorted multi-index + with tm.assert_produces_warning( + PerformanceWarning, + match="indexing past lexsort depth" + ): out_a = df_a.loc[("x", 0), :] df_b = DataFrame( {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) ) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, + match="indexing past lexsort depth" + ): out_b = df_b.loc[("x", 0)] tm.assert_frame_equal(out_a, out_b) @@ -347,7 +354,10 @@ def test_concat_with_key_not_unique(self): df2 = DataFrame({"name": ["a", "b"]}) df3 = DataFrame({"name": ["c", "d"]}) df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, + match="indexing past lexsort depth" + ): out_a = df_a.loc[("x", 0), :] df_b = DataFrame( @@ -358,7 +368,10 @@ def test_concat_with_key_not_unique(self): } ).set_index(["a", "b"]) df_b.index.names = [None, None] - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, + match="indexing past lexsort depth" + ): out_b = df_b.loc[("x", 0), :] tm.assert_frame_equal(out_a, out_b) From 330187bda3e05cde366819437b828cbffe67e48a Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Mon, 4 Apr 2022 15:33:42 -0500 Subject: [PATCH 15/15] fix format --- 
pandas/tests/reshape/concat/test_index.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index 04969a1b13646..50fee28669c58 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -334,8 +334,7 @@ def test_concat_with_key_not_unique(self): df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) # the warning is caused by indexing unsorted multi-index with tm.assert_produces_warning( - PerformanceWarning, - match="indexing past lexsort depth" + PerformanceWarning, match="indexing past lexsort depth" ): out_a = df_a.loc[("x", 0), :] @@ -343,8 +342,7 @@ def test_concat_with_key_not_unique(self): {"name": [1, 2, 3]}, index=Index([("x", 0), ("y", 0), ("x", 0)]) ) with tm.assert_produces_warning( - PerformanceWarning, - match="indexing past lexsort depth" + PerformanceWarning, match="indexing past lexsort depth" ): out_b = df_b.loc[("x", 0)] @@ -355,8 +353,7 @@ def test_concat_with_key_not_unique(self): df3 = DataFrame({"name": ["c", "d"]}) df_a = concat([df1, df2, df3], keys=["x", "y", "x"]) with tm.assert_produces_warning( - PerformanceWarning, - match="indexing past lexsort depth" + PerformanceWarning, match="indexing past lexsort depth" ): out_a = df_a.loc[("x", 0), :] @@ -369,8 +366,7 @@ def test_concat_with_key_not_unique(self): ).set_index(["a", "b"]) df_b.index.names = [None, None] with tm.assert_produces_warning( - PerformanceWarning, - match="indexing past lexsort depth" + PerformanceWarning, match="indexing past lexsort depth" ): out_b = df_b.loc[("x", 0), :]