From 20c98114bfe47c9bd92c988f3d849d297f69ee7a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:39:53 +0100 Subject: [PATCH 1/6] ENH: Add use_nullable_dtypes to read_clipboard --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/io/clipboards.py | 15 ++++++- pandas/tests/io/test_clipboard.py | 66 +++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..f418dc61efdbe 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -36,6 +36,7 @@ Configuration option, ``mode.dtype_backend``, to return pyarrow-backed dtypes The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`) * :func:`read_csv` +* :func:`read_clipboard` * :func:`read_fwf` * :func:`read_excel` * :func:`read_html` @@ -47,6 +48,7 @@ Additionally a new global configuration, ``mode.dtype_backend`` can now be used to select the nullable dtypes implementation. * :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``) +* :func:`read_clipboard` (with ``engine="python"``) * :func:`read_excel` * :func:`read_html` * :func:`read_parquet` diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index a3e778e552439..4fa39c83f9e76 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -14,7 +14,9 @@ ) -def read_clipboard(sep: str = r"\s+", **kwargs): # pragma: no cover +def read_clipboard( + sep: str = r"\s+", use_nullable_dtypes: bool = False, **kwargs +): # pragma: no cover r""" Read text from clipboard and pass to read_csv. @@ -24,6 +26,13 @@ def read_clipboard(sep: str = r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 + **kwargs See read_csv for the full argument list. @@ -85,7 +94,9 @@ def read_clipboard(sep: str = r"\s+", **kwargs): # pragma: no cover stacklevel=find_stack_level(), ) - return read_csv(StringIO(text), sep=sep, **kwargs) + return read_csv( + StringIO(text), sep=sep, use_nullable_dtypes=use_nullable_dtypes, **kwargs + ) def to_clipboard( diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index c47a963e0fa3c..54362717a63e3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -10,12 +10,19 @@ PyperclipWindowsException, ) +import pandas as pd from pandas import ( + NA, DataFrame, + Series, get_option, read_clipboard, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) from pandas.io.clipboard import ( CheckedCall, @@ -402,3 +409,62 @@ def test_raw_roundtrip(self, data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) assert data == clipboard_get() + + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_read_clipboard_nullable_dtypes( + self, request, mock_clipboard, string_storage, dtype_backend, engine + ): + # GH# + if string_storage == "pyarrow" or dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + + if dtype_backend == "pyarrow" and engine == "c": + pytest.skip(reason="c engine not yet supported") + + if string_storage == "python": + string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) + string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + + else: + string_array = ArrowStringArray(pa.array(["x", "y"])) + string_array_na = ArrowStringArray(pa.array(["x", None])) + + text = """a,b,c,d,e,f,g,h,i +x,1,4.0,x,2,4.0,,True,False +y,2,5.0,,,,,False,""" + mock_clipboard[request.node.name] = text + + with pd.option_context("mode.string_storage", string_storage): + with pd.option_context("mode.dtype_backend", dtype_backend): + result = read_clipboard( + sep=",", use_nullable_dtypes=True, engine=engine + ) + + expected = DataFrame( + { + "a": string_array, + "b": Series([1, 2], dtype="Int64"), + "c": Series([4.0, 5.0], dtype="Float64"), + "d": string_array_na, + "e": Series([2, NA], dtype="Int64"), + "f": Series([4.0, NA], dtype="Float64"), + "g": Series([NA, NA], dtype="Int64"), + "h": Series([True, False], dtype="boolean"), + "i": Series([False, NA], dtype="boolean"), + } + ) + if dtype_backend == "pyarrow": + import pyarrow as pa + + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + expected["g"] = ArrowExtensionArray(pa.array([None, None])) + + tm.assert_frame_equal(result, expected) From 73178ad13d4cffa46b9738fb8a21dabe56a44c5a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:41:02 +0100 Subject: [PATCH 2/6] Adjust whatsnew --- pandas/io/clipboards.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 4fa39c83f9e76..51d5f5cf123ca 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -31,6 +31,12 @@ def read_clipboard( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 **kwargs From a39326194f0d18787c6058e70cae8ae60db81102 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 13:41:21 +0100 Subject: [PATCH 3/6] Add gh ref --- pandas/tests/io/test_clipboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 54362717a63e3..f36ef0f72786f 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -415,7 +415,7 @@ def test_raw_roundtrip(self, data): def test_read_clipboard_nullable_dtypes( self, request, mock_clipboard, string_storage, dtype_backend, engine ): - # GH# + # GH#50502 if string_storage == "pyarrow" or dtype_backend == "pyarrow": pa = pytest.importorskip("pyarrow") From d76f73f6b5ed725b2e1384aa9cb2992fc3eedf47 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Jan 2023 22:45:15 +0100 Subject: [PATCH 4/6] Remove import --- pandas/tests/io/test_clipboard.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index f36ef0f72786f..ae9c5aacf6e6b 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -455,8 +455,6 @@ def test_read_clipboard_nullable_dtypes( } ) if dtype_backend == "pyarrow": - import pyarrow as pa - from pandas.arrays import ArrowExtensionArray expected = DataFrame( From 02d847f4fedc284fa9645d0c7160959c01a2c354 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Jan 2023 21:31:58 +0100 Subject: [PATCH 5/6] Add comment --- pandas/io/clipboards.py | 2 ++ pandas/io/parsers/readers.py | 2 ++ pandas/tests/io/test_clipboard.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 51d5f5cf123ca..44bee11518cd3 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -36,6 +36,8 @@ def read_clipboard( numpy-backed nullable dtypes or ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + This is only implemented for the ``python`` + engine. .. versionadded:: 2.0 diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ccfefa59c65b8..9aa927ffe447c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -403,6 +403,8 @@ numpy-backed nullable dtypes or ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + This is only implemented for the ``pyarrow`` or ``python`` + engines. .. versionadded:: 2.0 diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index ae9c5aacf6e6b..d6d4cc92e43d3 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -411,7 +411,7 @@ def test_raw_roundtrip(self, data): assert data == clipboard_get() @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) - @pytest.mark.parametrize("engine", ["c", "python"]) + @pytest.mark.parametrize("engine", ["c", "python", "pyarrow"]) def test_read_clipboard_nullable_dtypes( self, request, mock_clipboard, string_storage, dtype_backend, engine ): From 17f398130c3e576234226df3eb7db4b60639d3b4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 5 Jan 2023 23:49:07 +0100 Subject: [PATCH 6/6] Remove engine --- pandas/tests/io/test_clipboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index d6d4cc92e43d3..ae9c5aacf6e6b 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -411,7 +411,7 @@ def test_raw_roundtrip(self, data): assert data == clipboard_get() @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) - @pytest.mark.parametrize("engine", ["c", "python", "pyarrow"]) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_nullable_dtypes( self, request, mock_clipboard, string_storage, dtype_backend, engine ):