From cc951ca4fc870bb947a891fa6b4bf3d6838e9c1f Mon Sep 17 00:00:00 2001 From: "snitish.iitk@gmail.com" Date: Mon, 3 Mar 2025 14:49:41 -0600 Subject: [PATCH 1/4] BUG: Recognize chained fsspec URLs --- pandas/io/common.py | 2 +- pandas/tests/io/json/test_pandas.py | 1 + pandas/tests/io/test_common.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e0076eb486976..8ac0b5e109749 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -71,7 +71,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") +_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)?://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 144b36166261b..e64fab21b85a5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1753,6 +1753,7 @@ def test_read_timezone_information(self): [ "s3://example-fsspec/", "gcs://another-fsspec/file.json", + "filecache::s3://yet-another-fsspec/file.json", "https://example-site.com/data", "some-protocol://data.txt", ], diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e162815271ab3..1a20b11e6345a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -501,6 +501,15 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") +def test_is_fsspec_url_chained(): + # GH#48978 Support chained fsspec URLs + # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. + assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv") + assert not icom.is_fsspec_url("filecache::://pandas/test.csv") + + @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): From 12d1609c27f6dd218b3218d57355e91568215d3a Mon Sep 17 00:00:00 2001 From: "snitish.iitk@gmail.com" Date: Mon, 3 Mar 2025 14:58:44 -0600 Subject: [PATCH 2/4] Add whatsnew note --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 912b658cffdb6..4bd098897dda4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -717,6 +717,7 @@ I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) +- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`) - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) From 85a5c38318eaebb848cc7dd61b93d33639e406e8 Mon Sep 17 00:00:00 2001 From: "snitish.iitk@gmail.com" Date: Tue, 4 Mar 2025 03:15:03 -0600 Subject: [PATCH 3/4] Rename regex variable appropriately and allow more complex chaining --- pandas/io/common.py | 4 ++-- pandas/tests/io/test_common.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8ac0b5e109749..1a9e6b472463d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -71,7 +71,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)?://") +_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -291,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and bool(_RFC_3986_PATTERN.match(url)) + and bool(_FSSPEC_URL_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 1a20b11e6345a..f0d636be7ca8a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -505,6 +505,9 @@ def test_is_fsspec_url_chained(): # GH#48978 Support chained fsspec URLs # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") + assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/afile.zip") + assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/afile.zip") + assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv") assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv") assert not icom.is_fsspec_url("filecache::://pandas/test.csv") From bf6bcd6c326f227b39877ad9cb46bfd8cd16e3ac Mon Sep 17 00:00:00 2001 From: "snitish.iitk@gmail.com" Date: Tue, 4 Mar 2025 17:42:26 -0600 Subject: [PATCH 4/4] Fix pre-commit --- pandas/tests/io/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f0d636be7ca8a..99af421d5aa48 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -505,8 +505,8 @@ def test_is_fsspec_url_chained(): # GH#48978 Support chained fsspec URLs # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining. assert icom.is_fsspec_url("filecache::s3://pandas/test.csv") - assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/afile.zip") - assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/afile.zip") + assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip") + assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip") assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv") assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv") assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv")