From 8e3fc2885008f58de77ab18734b9a994c5b7997f Mon Sep 17 00:00:00 2001 From: taytzehao Date: Tue, 29 Aug 2023 02:58:22 +0800 Subject: [PATCH 01/11] autoformat --- pandas/core/methods/to_dict.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index e89f641e17296..59864151e54f2 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -107,14 +107,15 @@ def to_dict( elif orient == "list": object_dtype_indices_as_set = set(box_native_indices) + values = df.values return into_c( ( - k, - list(map(maybe_box_native, v.tolist())) + col, + list(map(maybe_box_native, values[:, i].tolist())) if i in object_dtype_indices_as_set - else v.tolist(), + else values[:, i].tolist(), ) - for i, (k, v) in enumerate(df.items()) + for i, col in enumerate(df.columns) ) elif orient == "split": From e7e0560085661435cae1df280823104e08f9193a Mon Sep 17 00:00:00 2001 From: taytzehao Date: Tue, 29 Aug 2023 03:02:46 +0800 Subject: [PATCH 02/11] autoformat --- pandas/core/methods/to_dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 59864151e54f2..e13737fe1298b 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -106,8 +106,8 @@ def to_dict( return into_c((k, v.to_dict(into)) for k, v in df.items()) elif orient == "list": - object_dtype_indices_as_set = set(box_native_indices) - values = df.values + object_dtype_indices_as_set: set[int] = set(box_native_indices) + values: np.ndarray = df.values return into_c( ( col, From 3440f2e9392dba29e45fff3b72bc0bea18eccb7b Mon Sep 17 00:00:00 2001 From: taytzehao Date: Tue, 29 Aug 2023 03:06:16 +0800 Subject: [PATCH 03/11] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d1a689dc60830..6904648917205 
100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -599,6 +599,7 @@ Performance improvements - Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`) - Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) +- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) From b77df9ebddac0836531f49c1f8a288d873b522f4 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Wed, 30 Aug 2023 00:13:12 +0800 Subject: [PATCH 04/11] reformat --- pandas/core/frame.py | 1 + pandas/core/methods/to_dict.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9939daadd9237..19c69ec72e5d4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2037,6 +2037,7 @@ def to_dict( [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] """ + from pandas.core.methods.to_dict import to_dict return to_dict(self, orient, into, index) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index e13737fe1298b..bcc48411fc560 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -72,6 +72,7 @@ def to_dict( Return a collections.abc.Mapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. 
""" + if not df.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", @@ -107,13 +108,13 @@ def to_dict( elif orient == "list": object_dtype_indices_as_set: set[int] = set(box_native_indices) - values: np.ndarray = df.values + return into_c( ( col, - list(map(maybe_box_native, values[:, i].tolist())) + list(map(maybe_box_native, df[col].values.tolist())) if i in object_dtype_indices_as_set - else values[:, i].tolist(), + else df[col].values.tolist(), ) for i, col in enumerate(df.columns) ) From e6d1dd9f86e1a1f8581ee5bc8888a708a0c141f1 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Thu, 31 Aug 2023 22:44:58 +0800 Subject: [PATCH 05/11] rm values --- pandas/core/methods/to_dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index bcc48411fc560..82678da8de65e 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -112,9 +112,9 @@ def to_dict( return into_c( ( col, - list(map(maybe_box_native, df[col].values.tolist())) + list(map(maybe_box_native, df[col].tolist())) if i in object_dtype_indices_as_set - else df[col].values.tolist(), + else df[col].tolist(), ) for i, col in enumerate(df.columns) ) From 19c4c313dd742e89fa456014af4a55318b96bcb3 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Thu, 31 Aug 2023 22:54:42 +0800 Subject: [PATCH 06/11] to_numpy() --- pandas/core/methods/to_dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 82678da8de65e..f1545b4fee0e5 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -112,9 +112,9 @@ def to_dict( return into_c( ( col, - list(map(maybe_box_native, df[col].tolist())) + list(map(maybe_box_native, df[col].to_numpy().tolist())) if i in object_dtype_indices_as_set - else df[col].tolist(), + else df[col].to_numpy().tolist(), ) for i, col in 
enumerate(df.columns) ) From c8c33e51445d8a7e82964f088f1fab59d061bc06 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Thu, 31 Aug 2023 23:07:47 +0800 Subject: [PATCH 07/11] to_numpy() --- doc/source/whatsnew/v2.1.0.rst | 1 - pandas/core/frame.py | 1 - pandas/core/methods/to_dict.py | 1 - 3 files changed, 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6904648917205..d1a689dc60830 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -599,7 +599,6 @@ Performance improvements - Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`) - Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) -- Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19c69ec72e5d4..9939daadd9237 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2037,7 +2037,6 @@ def to_dict( [defaultdict(, {'col1': 1, 'col2': 0.5}), defaultdict(, {'col1': 2, 'col2': 0.75})] """ - from pandas.core.methods.to_dict import to_dict return to_dict(self, orient, into, index) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index f1545b4fee0e5..164fe65b13c90 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -108,7 +108,6 @@ def to_dict( elif orient == "list": 
object_dtype_indices_as_set: set[int] = set(box_native_indices) - return into_c( ( col, From 035d1764bf4edb06732d0349e73fc98034cf7f78 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Thu, 31 Aug 2023 23:17:33 +0800 Subject: [PATCH 08/11] rm newline --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/methods/to_dict.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 621c9159a5fe8..3c9810598331e 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -157,9 +157,9 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :meth:`DataFrame.to_dict` when converting a DataFrame to a dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.bug_fixes: diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 164fe65b13c90..1f83cdc427999 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -72,7 +72,6 @@ def to_dict( Return a collections.abc.Mapping object representing the DataFrame. The resulting transformation depends on the `orient` parameter. 
""" - if not df.columns.is_unique: warnings.warn( "DataFrame columns are not unique, some columns will be omitted.", From 034af069bc07c399d7c6c2250e9870b6d4a9685d Mon Sep 17 00:00:00 2001 From: taytzehao Date: Fri, 1 Sep 2023 12:16:01 +0800 Subject: [PATCH 09/11] .items --- pandas/core/methods/to_dict.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 1f83cdc427999..f4e0dcddcd34a 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -109,12 +109,12 @@ def to_dict( object_dtype_indices_as_set: set[int] = set(box_native_indices) return into_c( ( - col, - list(map(maybe_box_native, df[col].to_numpy().tolist())) + k, + list(map(maybe_box_native, v.to_numpy().tolist())) if i in object_dtype_indices_as_set - else df[col].to_numpy().tolist(), + else v.to_numpy().tolist(), ) - for i, col in enumerate(df.columns) + for i, (k, v) in enumerate(df.items()) ) elif orient == "split": From 44ac8481186ac6efdca9d48bc2d5bfcc3be10562 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Fri, 1 Sep 2023 13:24:50 +0800 Subject: [PATCH 10/11] unit test --- pandas/tests/frame/methods/test_to_dict.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 7bb9518f9b0f9..a166c4b8fdfee 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -166,6 +166,19 @@ def test_to_dict_not_unique_warning(self): with tm.assert_produces_warning(UserWarning): df.to_dict() + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert 
result == expected + # orient - orient argument to to_dict function # item_getter - function for extracting value from # the resulting dict using column name and index From d5f485a1095612af57665b0be4a484d31d5cae69 Mon Sep 17 00:00:00 2001 From: taytzehao Date: Fri, 1 Sep 2023 13:28:37 +0800 Subject: [PATCH 11/11] comment --- pandas/tests/frame/methods/test_to_dict.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index a166c4b8fdfee..61f0ad30b4519 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -175,6 +175,8 @@ def test_to_dict_not_unique_warning(self): ], ) def test_to_dict_not_unique(self, orient, expected): + # GH#54824: Make sure that DataFrames with non-unique columns + # behave uniformly across the different orients df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) result = df.to_dict(orient) assert result == expected