From 284eaac0199998a33182cbd80bec253d52161b9b Mon Sep 17 00:00:00 2001 From: Kira Sidhu Date: Thu, 2 Nov 2023 14:34:22 -0700 Subject: [PATCH 1/7] chore: refactor version checks for pandas library --- google/cloud/bigquery/__init__.py | 1 + google/cloud/bigquery/_versions_helpers.py | 69 ++++++++++++++++++++++ google/cloud/bigquery/client.py | 2 +- google/cloud/bigquery/exceptions.py | 4 ++ tests/unit/test__versions_helpers.py | 66 +++++++++++++++++++++ 5 files changed, 141 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 40e3a1578..72576e608 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -202,6 +202,7 @@ # Custom exceptions "LegacyBigQueryStorageError", "LegacyPyarrowError", + "LegacyPandasError", ] diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index ce529b76e..f86882d3d 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -24,6 +24,8 @@ _MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") +_MIN_PANDAS_VERSION = packaging.version.Version("1.1.0") +_MAX_PANDAS_VERSION = packaging.version.Version("2.0.0") class PyarrowVersions: @@ -171,3 +173,70 @@ def try_import(self, raise_if_error: bool = False) -> Any: BQ_STORAGE_VERSIONS = BQStorageVersions() + + +class PandasVersions: + """Version comparisons for pandas package.""" + + def __init__(self): + self._installed_version = None + + @property + def installed_version(self) -> packaging.version.Version: + """Return the parsed version of pandas""" + if self._installed_version is None: + import pandas # type: ignore + + self._installed_version = packaging.version.parse( + # Use 0.0.0, since it is earlier than any released version. 
+ # Legacy versions also have the same property, but + # creating a LegacyVersion has been deprecated. + # https://github.com/pypa/packaging/issues/321 + getattr(pandas, "__version__", "0.0.0") + ) + + return self._installed_version + + def try_import(self, raise_if_error: bool = False) -> Any: + """Verify that a recent enough version of pandas extra is installed. + The function assumes that pandas extra is installed, and should thus + be used in places where this assumption holds. + Because `pip` can install an outdated version of this extra despite + the constraints in `setup.py`, the calling code can use this helper + to verify the version compatibility at runtime. + Returns: + The ``pandas`` module or ``None``. + Raises: + exceptions.LegacyPandasError: + If the pandas package is outdated and ``raise_if_error`` is + ``True``. + """ + try: + import pandas + except ImportError as exc: # pragma: NO COVER + if raise_if_error: + raise exceptions.LegacyPandasError( + "pandas package not found. Install pandas version >=" + f" {_MIN_PANDAS_VERSION}" + "and <" + f" {_MAX_PANDAS_VERSION}" + ) from exc + return None + + if ( + self.installed_version < _MIN_PANDAS_VERSION + or self.installed_version >= _MAX_PANDAS_VERSION + ): + if raise_if_error: + msg = ( + "Dependency pandas is outdated, please upgrade" + f" it to version >= {_MIN_PANDAS_VERSION} and < {_MAX_PANDAS_VERSION}" + f" (version found: {self.installed_version})." 
+ ) + raise exceptions.LegacyPandasError(msg) + return None + + return pandas + + +PANDAS_VERSIONS = PandasVersions() diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 496015b21..759e1f603 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -115,6 +115,7 @@ from google.cloud.bigquery.table import RowIterator pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() +pandas = _versions_helpers.PANDAS_VERSIONS.try_import() TimeoutType = Union[float, None] ResumableTimeoutType = Union[ @@ -124,7 +125,6 @@ if typing.TYPE_CHECKING: # pragma: NO COVER # os.PathLike is only subscriptable in Python 3.9+, thus shielding with a condition. PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes]] - import pandas # type: ignore import requests # required by api-core _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py index e94a6c832..62e0d540c 100644 --- a/google/cloud/bigquery/exceptions.py +++ b/google/cloud/bigquery/exceptions.py @@ -29,3 +29,7 @@ class BigQueryStorageNotFoundError(BigQueryError): """Raised when BigQuery Storage extra is not installed when trying to import it. 
""" + + +class LegacyPandasError(BigQueryError): + """Raised when too old a version of pandas package is detected at runtime.""" diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py index 144f14b7c..52d84c1e8 100644 --- a/tests/unit/test__versions_helpers.py +++ b/tests/unit/test__versions_helpers.py @@ -26,6 +26,11 @@ except ImportError: # pragma: NO COVER bigquery_storage = None +try: + import pandas # type: ignore +except ImportError: # pragma: NO COVER + pandas = None + from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import exceptions @@ -173,3 +178,64 @@ def test_bqstorage_is_read_session_optional_false(): bqstorage_versions = _versions_helpers.BQStorageVersions() with mock.patch("google.cloud.bigquery_storage.__version__", new="2.5.0"): assert not bqstorage_versions.is_read_session_optional + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_try_import_raises_no_error_w_recent_pandas(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="1.5.0"): + try: + pandas = versions.try_import(raise_if_error=True) + assert pandas is not None + except exceptions.LegacyPandasError: # pragma: NO COVER + raise ("Legacy error raised with a non-legacy dependency version.") + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_try_import_returns_none_w_legacy_pandas(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="1.0.0"): + pandas = versions.try_import() + assert pandas is None + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_try_import_raises_error_w_legacy_pandas(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="1.0.0"): + with pytest.raises(exceptions.LegacyPandasError): + versions.try_import(raise_if_error=True) + + +@pytest.mark.skipif(pandas is None, reason="pandas is not 
installed") +def test_try_import_returns_none_w_newer_pandas(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="2.0.0"): + pandas = versions.try_import() + assert pandas is None + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_try_import_raises_error_w_newer_pandas(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="2.0.0"): + with pytest.raises(exceptions.LegacyPandasError): + versions.try_import(raise_if_error=True) + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_installed_pandas_version_returns_cached(): + versions = _versions_helpers.PandasVersions() + versions._installed_version = object() + assert versions.installed_version is versions._installed_version + + +@pytest.mark.skipif(pandas is None, reason="pandas is not installed") +def test_installed_pandas_version_returns_parsed_version(): + versions = _versions_helpers.PandasVersions() + with mock.patch("pandas.__version__", new="1.1.0"): + version = versions.installed_version + + assert version.major == 1 + assert version.minor == 1 + assert version.micro == 0 From c426491f910f4a5081691788aee4cebd736fa66c Mon Sep 17 00:00:00 2001 From: Kira Sidhu Date: Thu, 2 Nov 2023 15:38:07 -0700 Subject: [PATCH 2/7] readded removed importing of pandas --- google/cloud/bigquery/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 759e1f603..6d0c12530 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -126,6 +126,7 @@ # os.PathLike is only subscriptable in Python 3.9+, thus shielding with a condition. 
PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes]] import requests # required by api-core + import pandas _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB _MAX_MULTIPART_SIZE = 5 * 1024 * 1024 From 81d970b0ce6caf73243d3f2d5723bdc43818b2cf Mon Sep 17 00:00:00 2001 From: Kira Sidhu Date: Thu, 2 Nov 2023 17:06:30 -0700 Subject: [PATCH 3/7] revert bad commit --- google/cloud/bigquery/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 6d0c12530..759e1f603 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -126,7 +126,6 @@ # os.PathLike is only subscriptable in Python 3.9+, thus shielding with a condition. PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes]] import requests # required by api-core - import pandas _DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB _MAX_MULTIPART_SIZE = 5 * 1024 * 1024 From d6841e4fd6627904f5ad741abef66f5145747bb0 Mon Sep 17 00:00:00 2001 From: Kira Sidhu Date: Mon, 6 Nov 2023 10:31:26 -0800 Subject: [PATCH 4/7] merged from main, added type:ignore tag to get around mypy error --- google/cloud/bigquery/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 759e1f603..8c0c70d40 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -115,7 +115,7 @@ from google.cloud.bigquery.table import RowIterator pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() -pandas = _versions_helpers.PANDAS_VERSIONS.try_import() +pandas = _versions_helpers.PANDAS_VERSIONS.try_import() TimeoutType = Union[float, None] ResumableTimeoutType = Union[ @@ -2488,7 +2488,7 @@ def load_table_from_file( def load_table_from_dataframe( self, - dataframe: "pandas.DataFrame", + dataframe: "pandas.DataFrame", # type: ignore destination: Union[Table, TableReference, str], num_retries: int = 
_DEFAULT_NUM_RETRIES, job_id: Optional[str] = None, From cb3815018328a69f7b2ad15117950a33fdd49594 Mon Sep 17 00:00:00 2001 From: Kira Sidhu Date: Mon, 6 Nov 2023 15:45:11 -0800 Subject: [PATCH 5/7] Added ignore statement for mypy error, removed checking max version of Pandas --- google/cloud/bigquery/_versions_helpers.py | 12 +++--------- google/cloud/bigquery/client.py | 6 ++++-- tests/unit/test__versions_helpers.py | 16 ---------------- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index f86882d3d..137e73961 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -25,7 +25,6 @@ _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") _MIN_PANDAS_VERSION = packaging.version.Version("1.1.0") -_MAX_PANDAS_VERSION = packaging.version.Version("2.0.0") class PyarrowVersions: @@ -218,19 +217,14 @@ def try_import(self, raise_if_error: bool = False) -> Any: raise exceptions.LegacyPandasError( "pandas package not found. Install pandas version >=" f" {_MIN_PANDAS_VERSION}" - "and <" - f" {_MAX_PANDAS_VERSION}" ) from exc return None - if ( - self.installed_version < _MIN_PANDAS_VERSION - or self.installed_version >= _MAX_PANDAS_VERSION - ): + if self.installed_version < _MIN_PANDAS_VERSION: if raise_if_error: msg = ( - "Dependency pandas is outdated, please upgrade" - f" it to version >= {_MIN_PANDAS_VERSION} and < {_MAX_PANDAS_VERSION}" + "Dependency pyarrow is outdated, please upgrade" + f" it to version >= {_MIN_PANDAS_VERSION}" f" (version found: {self.installed_version})." 
) raise exceptions.LegacyPandasError(msg) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 8c0c70d40..d4a759ba4 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -115,7 +115,9 @@ from google.cloud.bigquery.table import RowIterator pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() -pandas = _versions_helpers.PANDAS_VERSIONS.try_import() +pandas = ( + _versions_helpers.PANDAS_VERSIONS.try_import() +) # mypy check fails because pandas import is outside module, there are type: ignore comments related to this TimeoutType = Union[float, None] ResumableTimeoutType = Union[ @@ -2488,7 +2490,7 @@ def load_table_from_file( def load_table_from_dataframe( self, - dataframe: "pandas.DataFrame", # type: ignore + dataframe: "pandas.DataFrame", # type: ignore destination: Union[Table, TableReference, str], num_retries: int = _DEFAULT_NUM_RETRIES, job_id: Optional[str] = None, diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py index 52d84c1e8..af063f266 100644 --- a/tests/unit/test__versions_helpers.py +++ b/tests/unit/test__versions_helpers.py @@ -207,22 +207,6 @@ def test_try_import_raises_error_w_legacy_pandas(): versions.try_import(raise_if_error=True) -@pytest.mark.skipif(pandas is None, reason="pandas is not installed") -def test_try_import_returns_none_w_newer_pandas(): - versions = _versions_helpers.PandasVersions() - with mock.patch("pandas.__version__", new="2.0.0"): - pandas = versions.try_import() - assert pandas is None - - -@pytest.mark.skipif(pandas is None, reason="pandas is not installed") -def test_try_import_raises_error_w_newer_pandas(): - versions = _versions_helpers.PandasVersions() - with mock.patch("pandas.__version__", new="2.0.0"): - with pytest.raises(exceptions.LegacyPandasError): - versions.try_import(raise_if_error=True) - - @pytest.mark.skipif(pandas is None, reason="pandas is not installed") def 
test_installed_pandas_version_returns_cached(): versions = _versions_helpers.PandasVersions() From 2e9a08b543fa23947433d0dfdcc71a7617672e4d Mon Sep 17 00:00:00 2001 From: Kira Date: Tue, 7 Nov 2023 09:38:22 -0800 Subject: [PATCH 6/7] updated docstring error --- google/cloud/bigquery/_versions_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py index 137e73961..4ff4b9700 100644 --- a/google/cloud/bigquery/_versions_helpers.py +++ b/google/cloud/bigquery/_versions_helpers.py @@ -223,7 +223,7 @@ def try_import(self, raise_if_error: bool = False) -> Any: if self.installed_version < _MIN_PANDAS_VERSION: if raise_if_error: msg = ( - "Dependency pyarrow is outdated, please upgrade" + "Dependency pandas is outdated, please upgrade" f" it to version >= {_MIN_PANDAS_VERSION}" f" (version found: {self.installed_version})." ) From ed1da86a32becf41b95caf45e022dd89bfa674e5 Mon Sep 17 00:00:00 2001 From: kiraksi Date: Tue, 7 Nov 2023 10:55:55 -0800 Subject: [PATCH 7/7] Added parameterize to test to test multiple supported versions --- tests/unit/test__versions_helpers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py index af063f266..afe170e7a 100644 --- a/tests/unit/test__versions_helpers.py +++ b/tests/unit/test__versions_helpers.py @@ -181,9 +181,10 @@ def test_bqstorage_is_read_session_optional_false(): @pytest.mark.skipif(pandas is None, reason="pandas is not installed") -def test_try_import_raises_no_error_w_recent_pandas(): +@pytest.mark.parametrize("version", ["1.5.0", "2.0.0", "2.1.0"]) +def test_try_import_raises_no_error_w_recent_pandas(version): versions = _versions_helpers.PandasVersions() - with mock.patch("pandas.__version__", new="1.5.0"): + with mock.patch("pandas.__version__", new=version): try: pandas = versions.try_import(raise_if_error=True) assert
pandas is not None