From b7e2cbbd2af4173577e76406beaf9392d9b0fa23 Mon Sep 17 00:00:00 2001
From: Steffany Brown <30247553+steffnay@users.noreply.github.com>
Date: Thu, 30 Jun 2022 10:42:51 -0700
Subject: [PATCH 01/47] update dependencies

---
 setup.py                    | 9 +++++++--
 testing/constraints-3.6.txt | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index a040e96e7..94b92a171 100644
--- a/setup.py
+++ b/setup.py
@@ -44,14 +44,19 @@
     "packaging >= 14.3, <22.0.0dev",
     "protobuf >= 3.12.0, <4.0.0dev",  # For the legacy proto-based types.
     "python-dateutil >= 2.7.2, <3.0dev",
-    "pyarrow >= 3.0.0, < 9.0dev",
+    # "pyarrow >= 3.0.0, < 9.0dev",
     "requests >= 2.18.0, < 3.0.0dev",
 ]
 extras = {
     # Keep the no-op bqstorage extra for backward compatibility.
     # See: https://github.com/googleapis/python-bigquery/issues/757
     "bqstorage": [],
-    "pandas": ["pandas>=1.0.0", "db-dtypes>=0.3.0,<2.0.0dev"],
+    "pandas": [
+        "pandas>=1.0.0",
+        "pyarrow == 5.0.0, < 9.0dev",
+        "db-dtypes>=0.3.0,<2.0.0dev",
+    ],
+    "bignumeric_type": ["pyarrow >= 5.0.0, < 9.0dev"],
     "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"],
     "ipython": ["ipython>=7.0.1,!=8.1.0"],
     "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt
index 47b842a6d..548e0dbec 100644
--- a/testing/constraints-3.6.txt
+++ b/testing/constraints-3.6.txt
@@ -19,9 +19,9 @@ opentelemetry-sdk==1.1.0
 pandas==1.0.0
 proto-plus==1.15.0
 protobuf==3.12.0
-pyarrow==3.0.0
+pyarrow==5.0.0
 python-dateutil==2.7.2
 requests==2.18.0
-Shapely==1.6.0
+shapely==1.6.0
 six==1.13.0
 tqdm==4.7.4

From e9c57d6fad2c1d1195d42def8b532178fb041a50 Mon Sep 17 00:00:00 2001
From: Steffany Brown <30247553+steffnay@users.noreply.github.com>
Date: Thu, 7 Jul 2022 20:04:47 -0700
Subject: [PATCH 02/47] deps: pyarrow extras

---
 google/cloud/bigquery/__init__.py        |    4 +
 google/cloud/bigquery/_helpers.py        |   70 +-
 google/cloud/bigquery/_pandas_helpers.py |  138 +-
 google/cloud/bigquery/client.py          |   59 +-
 google/cloud/bigquery/exceptions.py      |   25 +
 google/cloud/bigquery/job/query.py       |    4 +
 google/cloud/bigquery/magics/magics.py   |   11 +
 google/cloud/bigquery/table.py           |   32 +-
 setup.py                                 |    4 +-
 testing/constraints-3.6.txt              |    3 +-
 testing/constraints-3.7.txt              |    2 +-
 testing/constraints-3.9.txt              |    2 +-
 tests/system/test_arrow.py               |  338 +--
 tests/system/test_job_retry.py           |  144 +-
 tests/system/test_list_rows.py           |  240 +-
 tests/system/test_magics.py              |  166 +-
 tests/system/test_pandas.py              | 2602 +++++++++++-----------
 tests/system/test_query.py               | 1006 ++++-----
 tests/unit/test__helpers.py              |   37 +
 tests/unit/test__pandas_helpers.py       |  143 +-
 tests/unit/test_client.py                |   94 +
 21 files changed, 2792 insertions(+), 2332 deletions(-)
 create mode 100644 google/cloud/bigquery/exceptions.py

diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py
index 5a4520476..a0813f96b 100644
--- a/google/cloud/bigquery/__init__.py
+++ b/google/cloud/bigquery/__init__.py
@@ -42,6 +42,8 @@
 from google.cloud.bigquery.enums import KeyResultStatementKind
 from google.cloud.bigquery.enums import SqlTypeNames
 from google.cloud.bigquery.enums import StandardSqlTypeNames
+
+# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery.external_config import BigtableOptions
 from google.cloud.bigquery.external_config import BigtableColumnFamily
@@ -195,6 +197,8 @@
     "WriteDisposition",
     # EncryptionConfiguration
     "EncryptionConfiguration",
+    # Custom exceptions
+    # "LegacyBigQueryStorageError",
 ]
diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py
index b59bc86d3..ab2d4004c 100644
--- a/google/cloud/bigquery/_helpers.py
+++ b/google/cloud/bigquery/_helpers.py
@@ -20,7 +20,7 @@
 import math
 import re
 import os
-from typing import Optional, Union
+from typing import Any, Optional, Union

 from dateutil import relativedelta
 from google.cloud._helpers import UTC  # type: ignore
@@ -32,6 +32,13 @@
 import packaging.version

+# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+
+from google.cloud.bigquery.exceptions import (
+    LegacyBigQueryStorageError,
+    LegacyPyarrowError,
+)
+
 _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
 _TIMEONLY_WO_MICROS = "%H:%M:%S"
 _TIMEONLY_W_MICROS = "%H:%M:%S.%f"
@@ -50,6 +57,11 @@
     r"(?P-?)(?P\d+):(?P\d+):(?P\d+)\.?(?P\d*)?$"
 )

+_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
+
+_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
+# _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
+
 _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")

 BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST"
@@ -83,7 +95,7 @@ def installed_version(self) -> packaging.version.Version:
                 getattr(bigquery_storage, "__version__", "0.0.0")
             )

-        return self._installed_version
+        return self._installed_version  # type: ignore

     @property
     def is_read_session_optional(self) -> bool:
@@ -93,6 +105,26 @@ def is_read_session_optional(self) -> bool:
         """
         return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION

+    def verify_version(self):
+        """Verify that a recent enough version of BigQuery Storage extra is
+        installed.
+        The function assumes that google-cloud-bigquery-storage extra is
+        installed, and should thus be used in places where this assumption
+        holds.
+        Because `pip` can install an outdated version of this extra despite the
+        constraints in `setup.py`, the calling code can use this helper to
+        verify the version compatibility at runtime.
+        Raises:
+            LegacyBigQueryStorageError:
+                If the google-cloud-bigquery-storage package is outdated.
+        """
+        if self.installed_version < _MIN_BQ_STORAGE_VERSION:
+            msg = (
+                "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
+                f"it to version >= 2.0.0 (version found: {self.installed_version})."
+            )
+            raise LegacyBigQueryStorageError(msg)
+

 class PyarrowVersions:
     """Version comparisons for pyarrow package."""
@@ -120,6 +152,40 @@ def installed_version(self) -> packaging.version.Version:
     def use_compliant_nested_type(self) -> bool:
         return self.installed_version.major >= 4

+    def try_import(self, raise_if_error: bool = False) -> Any:
+        """Verify that a recent enough version of pyarrow extra is
+        installed.
+        The function assumes that pyarrow extra is installed, and should thus
+        be used in places where this assumption holds.
+        Because `pip` can install an outdated version of this extra despite the
+        constraints in `setup.py`, the calling code can use this helper to
+        verify the version compatibility at runtime.
+        Returns:
+            The ``pyarrow`` module or ``None``.
+        Raises:
+            LegacyPyarrowError:
+                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
+        """
+        try:
+            import pyarrow
+        except ImportError as exc:  # pragma: NO COVER
+            if raise_if_error:
+                raise LegacyPyarrowError(
+                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
+ ) from exc + return None + + if self.installed_version < _MIN_PYARROW_VERSION: + if raise_if_error: + msg = ( + "Dependency pyarrow is outdated, please upgrade " + f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})." + ) + raise LegacyPyarrowError(msg) + return None + + return pyarrow + BQ_STORAGE_VERSIONS = BQStorageVersions() PYARROW_VERSIONS = PyarrowVersions() diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index cc0ee75ff..feea9ba42 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -21,6 +21,9 @@ import logging import queue import warnings +from packaging import version + +from google.cloud.bigquery import _helpers try: import pandas # type: ignore @@ -43,9 +46,14 @@ db_dtypes_import_exception = exc date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype +try: + import pyarrow # type: ignore + import pyarrow.parquet # type: ignore +except ImportError: # pragma: NO COVER + pyarrow = None -import pyarrow # type: ignore -import pyarrow.parquet # type: ignore +# import pyarrow # type: ignore +# import pyarrow.parquet # type: ignore try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` @@ -83,7 +91,6 @@ def _to_wkb(v): # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too. _ARROW_COMPRESSION_SUPPORT = True -from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema @@ -130,13 +137,15 @@ def pyarrow_datetime(): def pyarrow_numeric(): - return pyarrow.decimal128(38, 9) + if pyarrow: + return pyarrow.decimal128(38, 9) def pyarrow_bignumeric(): - # 77th digit is partial. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types - return pyarrow.decimal256(76, 38) + if pyarrow: + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) def pyarrow_time(): @@ -147,52 +156,65 @@ def pyarrow_timestamp(): return pyarrow.timestamp("us", tz="UTC") -# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py -# When modifying it be sure to update it there as well. 
-BQ_TO_ARROW_SCALARS = { - "BIGNUMERIC": pyarrow_bignumeric, - "BOOL": pyarrow.bool_, - "BOOLEAN": pyarrow.bool_, - "BYTES": pyarrow.binary, - "DATE": pyarrow.date32, - "DATETIME": pyarrow_datetime, - "FLOAT": pyarrow.float64, - "FLOAT64": pyarrow.float64, - "GEOGRAPHY": pyarrow.string, - "INT64": pyarrow.int64, - "INTEGER": pyarrow.int64, - "NUMERIC": pyarrow_numeric, - "STRING": pyarrow.string, - "TIME": pyarrow_time, - "TIMESTAMP": pyarrow_timestamp, -} -ARROW_SCALAR_IDS_TO_BQ = { - # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes - pyarrow.bool_().id: "BOOL", - pyarrow.int8().id: "INT64", - pyarrow.int16().id: "INT64", - pyarrow.int32().id: "INT64", - pyarrow.int64().id: "INT64", - pyarrow.uint8().id: "INT64", - pyarrow.uint16().id: "INT64", - pyarrow.uint32().id: "INT64", - pyarrow.uint64().id: "INT64", - pyarrow.float16().id: "FLOAT64", - pyarrow.float32().id: "FLOAT64", - pyarrow.float64().id: "FLOAT64", - pyarrow.time32("ms").id: "TIME", - pyarrow.time64("ns").id: "TIME", - pyarrow.timestamp("ns").id: "TIMESTAMP", - pyarrow.date32().id: "DATE", - pyarrow.date64().id: "DATETIME", # because millisecond resolution - pyarrow.binary().id: "BYTES", - pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() - # The exact scale and precision don't matter, see below. - pyarrow.decimal128(38, scale=9).id: "NUMERIC", - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal256 instances. - pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC", -} +if pyarrow: + # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py + # When modifying it be sure to update it there as well. + BQ_TO_ARROW_SCALARS = { + "BOOL": pyarrow.bool_, + "BOOLEAN": pyarrow.bool_, + "BYTES": pyarrow.binary, + "DATE": pyarrow.date32, + "DATETIME": pyarrow_datetime, + "FLOAT": pyarrow.float64, + "FLOAT64": pyarrow.float64, + "GEOGRAPHY": pyarrow.string, + "INT64": pyarrow.int64, + "INTEGER": pyarrow.int64, + "NUMERIC": pyarrow_numeric, + "STRING": pyarrow.string, + "TIME": pyarrow_time, + "TIMESTAMP": pyarrow_timestamp, + } + ARROW_SCALAR_IDS_TO_BQ = { + # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes + pyarrow.bool_().id: "BOOL", + pyarrow.int8().id: "INT64", + pyarrow.int16().id: "INT64", + pyarrow.int32().id: "INT64", + pyarrow.int64().id: "INT64", + pyarrow.uint8().id: "INT64", + pyarrow.uint16().id: "INT64", + pyarrow.uint32().id: "INT64", + pyarrow.uint64().id: "INT64", + pyarrow.float16().id: "FLOAT64", + pyarrow.float32().id: "FLOAT64", + pyarrow.float64().id: "FLOAT64", + pyarrow.time32("ms").id: "TIME", + pyarrow.time64("ns").id: "TIME", + pyarrow.timestamp("ns").id: "TIMESTAMP", + pyarrow.date32().id: "DATE", + pyarrow.date64().id: "DATETIME", # because millisecond resolution + pyarrow.binary().id: "BYTES", + pyarrow.string().id: "STRING", # also alias for pyarrow.utf8() + # The exact scale and precision don't matter, see below. + pyarrow.decimal128(38, scale=9).id: "NUMERIC", + } + + if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): + BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. 
+ ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + _BIGNUMERIC_SUPPORT = True + else: + _BIGNUMERIC_SUPPORT = False + +else: # pragma: NO COVER + BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER + ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER + + BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { "GEOGRAPHY": { b"ARROW:extension:name": b"google:sqlType:geography", @@ -486,6 +508,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # If schema detection was not successful for all columns, also try with # pyarrow, if available. if unknown_type_fields: + if not pyarrow: + msg = "Could not determine the type of columns: {}".format( + ", ".join(field.name for field in unknown_type_fields) + ) + warnings.warn(msg) + return None # We cannot detect the schema in full. + # The augment_schema() helper itself will also issue unknown type # warnings if detection still fails for any of the fields. bq_schema_out = augment_schema(dataframe, bq_schema_out) @@ -660,6 +689,9 @@ def dataframe_to_parquet( This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. """ + global pyarrow + if pyarrow is None: + raise ValueError("pyarrow is required for BigQuery schema conversion.") import pyarrow.parquet # type: ignore kwargs = ( diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index fb772ea11..3e84cd11c 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -27,6 +27,7 @@ import json import math import os +import packaging.version import tempfile import typing from typing import ( @@ -44,6 +45,13 @@ import uuid import warnings +try: + import pyarrow # type: ignore + + _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__) +except ImportError: # pragma: NO COVER + pyarrow = None + from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload # type: ignore from google.resumable_media.requests import ResumableUpload @@ -56,15 +64,22 @@ import google.cloud._helpers # type: ignore from google.cloud import exceptions # pytype: disable=import-error from google.cloud.client import ClientWithProject # type: ignore # pytype: disable=import-error -from google.cloud.bigquery_storage_v1.services.big_query_read.client import ( - DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO, -) + +try: + from google.cloud.bigquery_storage_v1.services.big_query_read.client import ( + DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO, + ) +except ImportError: + DEFAULT_BQSTORAGE_CLIENT_INFO = None # type: ignore + from google.cloud.bigquery import _job_helpers from google.cloud.bigquery._job_helpers import make_job_id as _make_job_id from google.cloud.bigquery._helpers import _get_sub_prop from google.cloud.bigquery._helpers import _record_field_to_json from google.cloud.bigquery._helpers import _str_or_none + +# from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS from google.cloud.bigquery._helpers import _verify_job_config_type from google.cloud.bigquery._helpers import _get_bigquery_host from google.cloud.bigquery._helpers import _DEFAULT_HOST @@ -75,6 +90,8 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import AutoRowIDs + +# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job from google.cloud.bigquery.job 
import ( @@ -144,6 +161,9 @@ TIMEOUT_HEADER = "X-Server-Timeout" +# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414 +_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")]) + class Project(object): """Wrapper for resource describing a BigQuery project. @@ -519,7 +539,21 @@ def _ensure_bqstorage_client( Returns: A BigQuery Storage API client. """ - from google.cloud import bigquery_storage + # try: + from google.cloud import bigquery_storage # type: ignore + + # except ImportError: + # warnings.warn( + # "Cannot create BigQuery Storage client, the dependency " + # "google-cloud-bigquery-storage is not installed." + # ) + # return None + + # try: + # BQ_STORAGE_VERSIONS.verify_version() + # except LegacyBigQueryStorageError as exc: + # warnings.warn(str(exc)) + # return None if bqstorage_client is None: bqstorage_client = bigquery_storage.BigQueryReadClient( @@ -2529,6 +2563,9 @@ def load_table_from_dataframe( google.cloud.bigquery.job.LoadJob: A new load job. Raises: + ValueError: + If a usable parquet engine cannot be found. This method + requires :mod:`pyarrow` to be installed. TypeError: If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.LoadJobConfig` class. @@ -2566,6 +2603,10 @@ def load_table_from_dataframe( ) ) + if pyarrow is None and job_config.source_format == job.SourceFormat.PARQUET: + # pyarrow is now the only supported parquet engine. + raise ValueError("This method requires pyarrow to be installed") + if location is None: location = self.location @@ -2621,6 +2662,16 @@ def load_table_from_dataframe( try: if job_config.source_format == job.SourceFormat.PARQUET: + if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS: + msg = ( + "Loading dataframe data in PARQUET format with pyarrow " + f"{_PYARROW_VERSION} can result in data corruption. It is " + "therefore *strongly* advised to use a different pyarrow " + "version or a different source format. " + "See: https://github.com/googleapis/python-bigquery/issues/781" + ) + warnings.warn(msg, category=RuntimeWarning) + if job_config.schema: if parquet_compression == "snappy": # adjust the default value parquet_compression = parquet_compression.upper() diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py new file mode 100644 index 000000000..2bab97fea --- /dev/null +++ b/google/cloud/bigquery/exceptions.py @@ -0,0 +1,25 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class BigQueryError(Exception): + """Base class for all custom exceptions defined by the BigQuery client.""" + + +class LegacyBigQueryStorageError(BigQueryError): + """Raised when too old a version of BigQuery Storage extra is detected at runtime.""" + + +class LegacyPyarrowError(BigQueryError): + """Raised when too old a version of pyarrow package is detected at runtime.""" diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index c2d304e30..7b0097c3d 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1593,6 +1593,10 @@ def to_arrow( headers from the query results. The column headers are derived from the destination table's schema. + Raises: + ValueError: + If the :mod:`pyarrow` library cannot be imported. + .. versionadded:: 1.17.0 """ query_result = wait_for_query(self, progress_bar_type, max_results=max_results) diff --git a/google/cloud/bigquery/magics/magics.py b/google/cloud/bigquery/magics/magics.py index 14819aa59..4fcc94c22 100644 --- a/google/cloud/bigquery/magics/magics.py +++ b/google/cloud/bigquery/magics/magics.py @@ -744,6 +744,17 @@ def _make_bqstorage_client(client, use_bqstorage_api, client_options): if not use_bqstorage_api: return None + try: + from google.cloud import bigquery_storage # type: ignore # noqa: F401 + except ImportError as err: + customized_error = ImportError( + "The default BigQuery Storage API client cannot be used, install " + "the missing google-cloud-bigquery-storage and pyarrow packages " + "to use it. Alternatively, use the classic REST API by specifying " + "the --use_rest_api magic option." + ) + raise customized_error from err + try: from google.api_core.gapic_v1 import client_info as gapic_client_info except ImportError as err: diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 72eb1baf6..b92d9c1fe 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -29,7 +29,10 @@ except ImportError: # pragma: NO COVER pandas = None -import pyarrow # type: ignore +try: + import pyarrow # type: ignore +except ImportError: # pragma: NO COVER + pyarrow = None try: import geopandas # type: ignore @@ -62,7 +65,8 @@ # Unconditionally import optional dependencies again to tell pytype that # they are not None, avoiding false "no attribute" errors. import pandas - import geopandas + import pyarrow + import geopandas # type: ignore from google.cloud import bigquery_storage from google.cloud.bigquery.dataset import DatasetReference @@ -71,6 +75,10 @@ "The geopandas library is not installed, please install " "geopandas to use the to_geodataframe() function." ) +_NO_PYARROW_ERROR = ( + "The pyarrow library is not installed, please install " + "pyarrow to use the to_arrow() function." +) _NO_SHAPELY_ERROR = ( "The shapely library is not installed, please install " "shapely to use the geography_as_object option." @@ -1584,6 +1592,17 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): if self.max_results is not None: return False + # try: + # from google.cloud import bigquery_storage # noqa: F401 + # except ImportError: + # return False + + # try: + # _helpers.BQ_STORAGE_VERSIONS.verify_version() + # except LegacyBigQueryStorageError as exc: + # warnings.warn(str(exc)) + # return False + return True def _get_next_page_response(self): @@ -1760,8 +1779,15 @@ def to_arrow( headers from the query results. The column headers are derived from the destination table's schema. 
+ Raises: + ValueError: If the :mod:`pyarrow` library cannot be imported. + + .. versionadded:: 1.17.0 """ + if pyarrow is None: + raise ValueError(_NO_PYARROW_ERROR) + self._maybe_warn_max_results(bqstorage_client) if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client): @@ -2194,6 +2220,8 @@ def to_arrow( Returns: pyarrow.Table: An empty :class:`pyarrow.Table`. """ + if pyarrow is None: + raise ValueError(_NO_PYARROW_ERROR) return pyarrow.Table.from_arrays(()) def to_dataframe( diff --git a/setup.py b/setup.py index 94b92a171..1045fdc84 100644 --- a/setup.py +++ b/setup.py @@ -56,8 +56,8 @@ "pyarrow == 5.0.0, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": ["pyarrow >= 5.0.0, < 9.0dev"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "bignumeric_type": ["pyarrow == 5.0.0, < 9.0dev"], + # "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index 548e0dbec..c6dce2259 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -21,7 +21,8 @@ proto-plus==1.15.0 protobuf==3.12.0 pyarrow==5.0.0 python-dateutil==2.7.2 +pyarrow==5.0.0 requests==2.18.0 -shapely==1.6.0 +Shapely==1.6.0 six==1.13.0 tqdm==4.7.4 diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index e3c7a332c..38b88208e 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -19,7 +19,7 @@ opentelemetry-sdk==1.1.0 pandas==1.1.0 proto-plus==1.15.0 protobuf==3.12.0 -pyarrow==3.0.0 +pyarrow==5.0.0 python-dateutil==2.7.3 requests==2.18.0 Shapely==1.6.4.post2 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 39dc6250e..33798cac5 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,4 +4,4 @@ # # NOTE: Not comprehensive yet, will eventually be maintained semi-automatically by # the renovate bot. -pyarrow>=4.0.0 +pyarrow==5.0.0 diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index 8b88b6844..bb8f0c17f 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -1,169 +1,169 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""System tests for Arrow connector.""" - -from typing import Optional - -import pyarrow -import pytest - -from google.cloud import bigquery -from google.cloud.bigquery import enums - - -@pytest.mark.parametrize( - ("max_results", "scalars_table_name"), - ( - (None, "scalars_table"), # Use BQ Storage API. - (10, "scalars_table"), # Use REST API. - (None, "scalars_extreme_table"), # Use BQ Storage API. - (10, "scalars_extreme_table"), # Use REST API. 
- ), -) -def test_list_rows_nullable_scalars_dtypes( - bigquery_client: bigquery.Client, - scalars_table: str, - scalars_extreme_table: str, - max_results: Optional[int], - scalars_table_name: str, -): - table_id = scalars_table - if scalars_table_name == "scalars_extreme_table": - table_id = scalars_extreme_table - - # TODO(GH#836): Avoid INTERVAL columns until they are supported by the - # BigQuery Storage API and pyarrow. - schema = [ - bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), - bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), - bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), - bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), - bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), - bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), - bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), - bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), - bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), - bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), - bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), - bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), - ] - - arrow_table = bigquery_client.list_rows( - table_id, - max_results=max_results, - selected_fields=schema, - ).to_arrow() - - schema = arrow_table.schema - bignumeric_type = schema.field("bignumeric_col").type - # 77th digit is partial. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types - assert bignumeric_type.precision in {76, 77} - assert bignumeric_type.scale == 38 - - bool_type = schema.field("bool_col").type - assert bool_type.equals(pyarrow.bool_()) - - bytes_type = schema.field("bytes_col").type - assert bytes_type.equals(pyarrow.binary()) - - date_type = schema.field("date_col").type - assert date_type.equals(pyarrow.date32()) - - datetime_type = schema.field("datetime_col").type - assert datetime_type.unit == "us" - assert datetime_type.tz is None - - float64_type = schema.field("float64_col").type - assert float64_type.equals(pyarrow.float64()) - - geography_type = schema.field("geography_col").type - assert geography_type.equals(pyarrow.string()) - - int64_type = schema.field("int64_col").type - assert int64_type.equals(pyarrow.int64()) - - numeric_type = schema.field("numeric_col").type - assert numeric_type.precision == 38 - assert numeric_type.scale == 9 - - string_type = schema.field("string_col").type - assert string_type.equals(pyarrow.string()) - - time_type = schema.field("time_col").type - assert time_type.equals(pyarrow.time64("us")) - - timestamp_type = schema.field("timestamp_col").type - assert timestamp_type.unit == "us" - assert timestamp_type.tz is not None - - -@pytest.mark.parametrize("do_insert", [True, False]) -def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( - dataset_client, test_table_name, do_insert -): - types = dict( - astring=("STRING", "'x'"), - astring9=("STRING(9)", "'x'"), - abytes=("BYTES", "b'x'"), - abytes9=("BYTES(9)", "b'x'"), - anumeric=("NUMERIC", "42"), - anumeric9=("NUMERIC(9)", "42"), - anumeric92=("NUMERIC(9,2)", "42"), - abignumeric=("BIGNUMERIC", "42e30"), - abignumeric49=("BIGNUMERIC(37)", "42e30"), - abignumeric492=("BIGNUMERIC(37,2)", "42e30"), - abool=("BOOL", "true"), - adate=("DATE", "'2021-09-06'"), - adatetime=("DATETIME", "'2021-09-06T09:57:26'"), - ageography=("GEOGRAPHY", "ST_GEOGFROMTEXT('point(0 0)')"), - # Can't get arrow data for interval :( - 
# ainterval=('INTERVAL', "make_interval(1, 2, 3, 4, 5, 6)"), - aint64=("INT64", "42"), - afloat64=("FLOAT64", "42.0"), - astruct=("STRUCT", "struct(42)"), - atime=("TIME", "'1:2:3'"), - atimestamp=("TIMESTAMP", "'2021-09-06T09:57:26'"), - ) - columns = ", ".join(f"{k} {t[0]}" for k, t in types.items()) - dataset_client.query(f"create table {test_table_name} ({columns})").result() - if do_insert: - names = list(types) - values = ", ".join(types[name][1] for name in names) - names = ", ".join(names) - dataset_client.query( - f"insert into {test_table_name} ({names}) values ({values})" - ).result() - at = dataset_client.query(f"select * from {test_table_name}").result().to_arrow() - storage_api_metadata = { - at.field(i).name: at.field(i).metadata for i in range(at.num_columns) - } - at = ( - dataset_client.query(f"select * from {test_table_name}") - .result() - .to_arrow(create_bqstorage_client=False) - ) - rest_api_metadata = { - at.field(i).name: at.field(i).metadata for i in range(at.num_columns) - } - - assert rest_api_metadata == storage_api_metadata - assert rest_api_metadata["adatetime"] == { - b"ARROW:extension:name": b"google:sqlType:datetime" - } - assert rest_api_metadata["ageography"] == { - b"ARROW:extension:name": b"google:sqlType:geography", - b"ARROW:extension:metadata": b'{"encoding": "WKT"}', - } +# # Copyright 2021 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # https://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# """System tests for Arrow connector.""" + +# from typing import Optional + +# import pyarrow +# import pytest + +# from google.cloud import bigquery +# from google.cloud.bigquery import enums + + +# @pytest.mark.parametrize( +# ("max_results", "scalars_table_name"), +# ( +# (None, "scalars_table"), # Use BQ Storage API. +# (10, "scalars_table"), # Use REST API. +# (None, "scalars_extreme_table"), # Use BQ Storage API. +# (10, "scalars_extreme_table"), # Use REST API. +# ), +# ) +# def test_list_rows_nullable_scalars_dtypes( +# bigquery_client: bigquery.Client, +# scalars_table: str, +# scalars_extreme_table: str, +# max_results: Optional[int], +# scalars_table_name: str, +# ): +# table_id = scalars_table +# if scalars_table_name == "scalars_extreme_table": +# table_id = scalars_extreme_table + +# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the +# # BigQuery Storage API and pyarrow. 
+# schema = [ +# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), +# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), +# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), +# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), +# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), +# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), +# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), +# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), +# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), +# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), +# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), +# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), +# ] + +# arrow_table = bigquery_client.list_rows( +# table_id, +# max_results=max_results, +# selected_fields=schema, +# ).to_arrow() + +# schema = arrow_table.schema +# bignumeric_type = schema.field("bignumeric_col").type +# # 77th digit is partial. +# # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types +# assert bignumeric_type.precision in {76, 77} +# assert bignumeric_type.scale == 38 + +# bool_type = schema.field("bool_col").type +# assert bool_type.equals(pyarrow.bool_()) + +# bytes_type = schema.field("bytes_col").type +# assert bytes_type.equals(pyarrow.binary()) + +# date_type = schema.field("date_col").type +# assert date_type.equals(pyarrow.date32()) + +# datetime_type = schema.field("datetime_col").type +# assert datetime_type.unit == "us" +# assert datetime_type.tz is None + +# float64_type = schema.field("float64_col").type +# assert float64_type.equals(pyarrow.float64()) + +# geography_type = schema.field("geography_col").type +# assert geography_type.equals(pyarrow.string()) + +# int64_type = schema.field("int64_col").type +# assert int64_type.equals(pyarrow.int64()) + +# numeric_type = schema.field("numeric_col").type +# assert numeric_type.precision == 38 +# assert numeric_type.scale == 9 + +# string_type = schema.field("string_col").type +# assert string_type.equals(pyarrow.string()) + +# time_type = schema.field("time_col").type +# assert time_type.equals(pyarrow.time64("us")) + +# timestamp_type = schema.field("timestamp_col").type +# assert timestamp_type.unit == "us" +# assert timestamp_type.tz is not None + + +# @pytest.mark.parametrize("do_insert", [True, False]) +# def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( +# dataset_client, test_table_name, do_insert +# ): +# types = dict( +# astring=("STRING", "'x'"), +# astring9=("STRING(9)", "'x'"), +# abytes=("BYTES", "b'x'"), +# abytes9=("BYTES(9)", "b'x'"), +# anumeric=("NUMERIC", "42"), +# anumeric9=("NUMERIC(9)", "42"), +# anumeric92=("NUMERIC(9,2)", "42"), +# abignumeric=("BIGNUMERIC", "42e30"), +# abignumeric49=("BIGNUMERIC(37)", "42e30"), +# abignumeric492=("BIGNUMERIC(37,2)", "42e30"), +# abool=("BOOL", "true"), +# adate=("DATE", "'2021-09-06'"), +# adatetime=("DATETIME", "'2021-09-06T09:57:26'"), +# ageography=("GEOGRAPHY", "ST_GEOGFROMTEXT('point(0 0)')"), +# # Can't get arrow data for interval :( +# # ainterval=('INTERVAL', "make_interval(1, 2, 3, 4, 5, 6)"), +# aint64=("INT64", "42"), +# afloat64=("FLOAT64", "42.0"), +# astruct=("STRUCT", "struct(42)"), +# atime=("TIME", "'1:2:3'"), +# atimestamp=("TIMESTAMP", "'2021-09-06T09:57:26'"), +# ) +# columns = ", ".join(f"{k} {t[0]}" for k, t in types.items()) +# dataset_client.query(f"create table 
{test_table_name} ({columns})").result() +# if do_insert: +# names = list(types) +# values = ", ".join(types[name][1] for name in names) +# names = ", ".join(names) +# dataset_client.query( +# f"insert into {test_table_name} ({names}) values ({values})" +# ).result() +# at = dataset_client.query(f"select * from {test_table_name}").result().to_arrow() +# storage_api_metadata = { +# at.field(i).name: at.field(i).metadata for i in range(at.num_columns) +# } +# at = ( +# dataset_client.query(f"select * from {test_table_name}") +# .result() +# .to_arrow(create_bqstorage_client=False) +# ) +# rest_api_metadata = { +# at.field(i).name: at.field(i).metadata for i in range(at.num_columns) +# } + +# assert rest_api_metadata == storage_api_metadata +# assert rest_api_metadata["adatetime"] == { +# b"ARROW:extension:name": b"google:sqlType:datetime" +# } +# assert rest_api_metadata["ageography"] == { +# b"ARROW:extension:name": b"google:sqlType:geography", +# b"ARROW:extension:metadata": b'{"encoding": "WKT"}', +# } diff --git a/tests/system/test_job_retry.py b/tests/system/test_job_retry.py index 520545493..53f9f4943 100644 --- a/tests/system/test_job_retry.py +++ b/tests/system/test_job_retry.py @@ -1,72 +1,72 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import threading -import time - -import google.api_core.exceptions -import google.cloud.bigquery -import pytest - - -def thread(func): - thread = threading.Thread(target=func, daemon=True) - thread.start() - return thread - - -@pytest.mark.parametrize("job_retry_on_query", [True, False]) -def test_query_retry_539(bigquery_client, dataset_id, job_retry_on_query): - """ - Test job_retry - - See: https://github.com/googleapis/python-bigquery/issues/539 - """ - from google.api_core import exceptions - from google.api_core.retry import if_exception_type, Retry - - table_name = f"{dataset_id}.t539" - - # Without a custom retry, we fail: - with pytest.raises(google.api_core.exceptions.NotFound): - bigquery_client.query(f"select count(*) from {table_name}").result() - - retry_notfound = Retry(predicate=if_exception_type(exceptions.NotFound)) - - job_retry = dict(job_retry=retry_notfound) if job_retry_on_query else {} - job = bigquery_client.query(f"select count(*) from {table_name}", **job_retry) - job_id = job.job_id - - # We can already know that the job failed, but we're not supposed - # to find out until we call result, which is where retry happend - assert job.done() - assert job.exception() is not None - - @thread - def create_table(): - time.sleep(1) # Give the first retry attempt time to fail. 
- with contextlib.closing(google.cloud.bigquery.Client()) as client: - client.query(f"create table {table_name} (id int64)").result() - - job_retry = {} if job_retry_on_query else dict(job_retry=retry_notfound) - [[count]] = list(job.result(**job_retry)) - assert count == 0 - - # The job was retried, and thus got a new job id - assert job.job_id != job_id - - # Make sure we don't leave a thread behind: - create_table.join() - bigquery_client.query(f"drop table {table_name}").result() +# # Copyright 2021 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # https://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# import contextlib +# import threading +# import time + +# import google.api_core.exceptions +# import google.cloud.bigquery +# import pytest + + +# def thread(func): +# thread = threading.Thread(target=func, daemon=True) +# thread.start() +# return thread + + +# @pytest.mark.parametrize("job_retry_on_query", [True, False]) +# def test_query_retry_539(bigquery_client, dataset_id, job_retry_on_query): +# """ +# Test job_retry + +# See: https://github.com/googleapis/python-bigquery/issues/539 +# """ +# from google.api_core import exceptions +# from google.api_core.retry import if_exception_type, Retry + +# table_name = f"{dataset_id}.t539" + +# # Without a custom retry, we fail: +# with pytest.raises(google.api_core.exceptions.NotFound): +# bigquery_client.query(f"select count(*) from {table_name}").result() + +# retry_notfound = Retry(predicate=if_exception_type(exceptions.NotFound)) + +# job_retry = dict(job_retry=retry_notfound) if job_retry_on_query else {} +# job = bigquery_client.query(f"select count(*) from {table_name}", **job_retry) +# job_id = job.job_id + +# # We can already know that the job failed, but we're not supposed +# # to find out until we call result, which is where retry happend +# assert job.done() +# assert job.exception() is not None + +# @thread +# def create_table(): +# time.sleep(1) # Give the first retry attempt time to fail. +# with contextlib.closing(google.cloud.bigquery.Client()) as client: +# client.query(f"create table {table_name} (id int64)").result() + +# job_retry = {} if job_retry_on_query else dict(job_retry=retry_notfound) +# [[count]] = list(job.result(**job_retry)) +# assert count == 0 + +# # The job was retried, and thus got a new job id +# assert job.job_id != job_id + +# # Make sure we don't leave a thread behind: +# create_table.join() +# bigquery_client.query(f"drop table {table_name}").result() diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py index 4c08958c3..065966055 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -1,120 +1,120 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime -import decimal - -from dateutil import relativedelta - -from google.cloud import bigquery -from google.cloud.bigquery import enums - - -def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str): - from google.cloud.bigquery.table import RowIterator - - table = bigquery_client.create_table(table_id) - - # It's a bit silly to list rows for an empty table, but this does - # happen as the result of a DDL query from an IPython magic command. - rows = bigquery_client.list_rows(table) - assert isinstance(rows, RowIterator) - assert tuple(rows) == () - - -def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str): - num_items = 7 - page_size = 3 - num_pages, num_last_page = divmod(num_items, page_size) - - to_insert = [{"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)] - bigquery_client.load_table_from_json(to_insert, table_id).result() - - df = bigquery_client.list_rows( - table_id, - selected_fields=[bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)], - page_size=page_size, - ) - pages = df.pages - - for i in range(num_pages): - page = next(pages) - assert page.num_items == page_size - page = next(pages) - assert page.num_items == num_last_page - - -def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str): - rows = sorted( - bigquery_client.list_rows(scalars_table), key=lambda row: row["rowindex"] - ) - row = rows[0] - assert row["bool_col"] # True - assert row["bytes_col"] == b"Hello, World!" - assert row["date_col"] == datetime.date(2021, 7, 21) - assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45) - assert row["geography_col"] == "POINT(-122.0838511 37.3860517)" - assert row["int64_col"] == 123456789 - assert row["interval_col"] == relativedelta.relativedelta( - years=7, months=11, days=9, hours=4, minutes=15, seconds=37, microseconds=123456 - ) - assert row["numeric_col"] == decimal.Decimal("1.23456789") - assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819") - assert row["float64_col"] == 1.25 - assert row["string_col"] == "Hello, World!" 
- assert row["time_col"] == datetime.time(11, 41, 43, 76160) - assert row["timestamp_col"] == datetime.datetime( - 2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc - ) - - nullrow = rows[1] - for column, value in nullrow.items(): - if column == "rowindex": - assert value == 1 - else: - assert value is None - - -def test_list_rows_scalars_extreme( - bigquery_client: bigquery.Client, scalars_extreme_table: str -): - rows = sorted( - bigquery_client.list_rows(scalars_extreme_table), - key=lambda row: row["rowindex"], - ) - row = rows[0] - assert row["bool_col"] # True - assert row["bytes_col"] == b"\r\n" - assert row["date_col"] == datetime.date(9999, 12, 31) - assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) - assert row["geography_col"] == "POINT(-135 90)" - assert row["int64_col"] == 9223372036854775807 - assert row["interval_col"] == relativedelta.relativedelta( - years=-10000, days=-3660000, hours=-87840000 - ) - assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28") - assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37") - assert row["float64_col"] == float("Inf") - assert row["string_col"] == "Hello, World" - assert row["time_col"] == datetime.time(23, 59, 59, 999999) - assert row["timestamp_col"] == datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc - ) - - nullrow = rows[4] - for column, value in nullrow.items(): - if column == "rowindex": - assert value == 4 - else: - assert value is None +# # Copyright 2021 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# import datetime +# import decimal + +# from dateutil import relativedelta + +# from google.cloud import bigquery +# from google.cloud.bigquery import enums + + +# def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str): +# from google.cloud.bigquery.table import RowIterator + +# table = bigquery_client.create_table(table_id) + +# # It's a bit silly to list rows for an empty table, but this does +# # happen as the result of a DDL query from an IPython magic command. 
+# rows = bigquery_client.list_rows(table) +# assert isinstance(rows, RowIterator) +# assert tuple(rows) == () + + +# def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str): +# num_items = 7 +# page_size = 3 +# num_pages, num_last_page = divmod(num_items, page_size) + +# to_insert = [{"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)] +# bigquery_client.load_table_from_json(to_insert, table_id).result() + +# df = bigquery_client.list_rows( +# table_id, +# selected_fields=[bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)], +# page_size=page_size, +# ) +# pages = df.pages + +# for i in range(num_pages): +# page = next(pages) +# assert page.num_items == page_size +# page = next(pages) +# assert page.num_items == num_last_page + + +# def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str): +# rows = sorted( +# bigquery_client.list_rows(scalars_table), key=lambda row: row["rowindex"] +# ) +# row = rows[0] +# assert row["bool_col"] # True +# assert row["bytes_col"] == b"Hello, World!" +# assert row["date_col"] == datetime.date(2021, 7, 21) +# assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45) +# assert row["geography_col"] == "POINT(-122.0838511 37.3860517)" +# assert row["int64_col"] == 123456789 +# assert row["interval_col"] == relativedelta.relativedelta( +# years=7, months=11, days=9, hours=4, minutes=15, seconds=37, microseconds=123456 +# ) +# assert row["numeric_col"] == decimal.Decimal("1.23456789") +# assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819") +# assert row["float64_col"] == 1.25 +# assert row["string_col"] == "Hello, World!" +# assert row["time_col"] == datetime.time(11, 41, 43, 76160) +# assert row["timestamp_col"] == datetime.datetime( +# 2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc +# ) + +# nullrow = rows[1] +# for column, value in nullrow.items(): +# if column == "rowindex": +# assert value == 1 +# else: +# assert value is None + + +# def test_list_rows_scalars_extreme( +# bigquery_client: bigquery.Client, scalars_extreme_table: str +# ): +# rows = sorted( +# bigquery_client.list_rows(scalars_extreme_table), +# key=lambda row: row["rowindex"], +# ) +# row = rows[0] +# assert row["bool_col"] # True +# assert row["bytes_col"] == b"\r\n" +# assert row["date_col"] == datetime.date(9999, 12, 31) +# assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) +# assert row["geography_col"] == "POINT(-135 90)" +# assert row["int64_col"] == 9223372036854775807 +# assert row["interval_col"] == relativedelta.relativedelta( +# years=-10000, days=-3660000, hours=-87840000 +# ) +# assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28") +# assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37") +# assert row["float64_col"] == float("Inf") +# assert row["string_col"] == "Hello, World" +# assert row["time_col"] == datetime.time(23, 59, 59, 999999) +# assert row["timestamp_col"] == datetime.datetime( +# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc +# ) + +# nullrow = rows[4] +# for column, value in nullrow.items(): +# if column == "rowindex": +# assert value == 4 +# else: +# assert value is None diff --git a/tests/system/test_magics.py b/tests/system/test_magics.py index 78c15cb50..c7f03320b 100644 --- a/tests/system/test_magics.py +++ b/tests/system/test_magics.py @@ -1,83 +1,83 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may 
not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""System tests for Jupyter/IPython connector.""" - -import re - -import pytest -import psutil - - -IPython = pytest.importorskip("IPython") -io = pytest.importorskip("IPython.utils.io") -pandas = pytest.importorskip("pandas") -tools = pytest.importorskip("IPython.testing.tools") -interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") - - -@pytest.fixture(scope="session") -def ipython(): - config = tools.default_config() - config.TerminalInteractiveShell.simple_prompt = True - shell = interactiveshell.TerminalInteractiveShell.instance(config=config) - return shell - - -@pytest.fixture() -def ipython_interactive(ipython): - """Activate IPython's builtin hooks - - for the duration of the test scope. - """ - with ipython.builtin_trap: - yield ipython - - -def test_bigquery_magic(ipython_interactive): - ip = IPython.get_ipython() - current_process = psutil.Process() - conn_count_start = len(current_process.connections()) - - ip.extension_manager.load_extension("google.cloud.bigquery") - sql = """ - SELECT - CONCAT( - 'https://stackoverflow.com/questions/', - CAST(id as STRING)) as url, - view_count - FROM `bigquery-public-data.stackoverflow.posts_questions` - WHERE tags like '%google-bigquery%' - ORDER BY view_count DESC - LIMIT 10 - """ - with io.capture_output() as captured: - result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) - - conn_count_end = len(current_process.connections()) - - lines = re.split("\n|\r", captured.stdout) - # Removes blanks & terminal code (result of display clearing) - updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) - assert re.match("Executing query with job ID: .*", updates[0]) - assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) - assert re.match("Query complete after .*s", updates[-1]) - assert isinstance(result, pandas.DataFrame) - assert len(result) == 10 # verify row count - assert list(result) == ["url", "view_count"] # verify column names - - # NOTE: For some reason, the number of open sockets is sometimes one *less* - # than expected when running system tests on Kokoro, thus using the <= assertion. - # That's still fine, however, since the sockets are apparently not leaked. - assert conn_count_end <= conn_count_start # system resources are released +# # Copyright 2020 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # https://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+ +# """System tests for Jupyter/IPython connector.""" + +# import re + +# import pytest +# import psutil + + +# IPython = pytest.importorskip("IPython") +# io = pytest.importorskip("IPython.utils.io") +# pandas = pytest.importorskip("pandas") +# tools = pytest.importorskip("IPython.testing.tools") +# interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") + + +# @pytest.fixture(scope="session") +# def ipython(): +# config = tools.default_config() +# config.TerminalInteractiveShell.simple_prompt = True +# shell = interactiveshell.TerminalInteractiveShell.instance(config=config) +# return shell + + +# @pytest.fixture() +# def ipython_interactive(ipython): +# """Activate IPython's builtin hooks + +# for the duration of the test scope. +# """ +# with ipython.builtin_trap: +# yield ipython + + +# def test_bigquery_magic(ipython_interactive): +# ip = IPython.get_ipython() +# current_process = psutil.Process() +# conn_count_start = len(current_process.connections()) + +# ip.extension_manager.load_extension("google.cloud.bigquery") +# sql = """ +# SELECT +# CONCAT( +# 'https://stackoverflow.com/questions/', +# CAST(id as STRING)) as url, +# view_count +# FROM `bigquery-public-data.stackoverflow.posts_questions` +# WHERE tags like '%google-bigquery%' +# ORDER BY view_count DESC +# LIMIT 10 +# """ +# with io.capture_output() as captured: +# result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) + +# conn_count_end = len(current_process.connections()) + +# lines = re.split("\n|\r", captured.stdout) +# # Removes blanks & terminal code (result of display clearing) +# updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) +# assert re.match("Executing query with job ID: .*", updates[0]) +# assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) +# assert re.match("Query complete after .*s", updates[-1]) +# assert isinstance(result, pandas.DataFrame) +# assert len(result) == 10 # verify row count +# assert list(result) == ["url", "view_count"] # verify column names + +# # NOTE: For some reason, the number of open sockets is sometimes one *less* +# # than expected when running system tests on Kokoro, thus using the <= assertion. +# # That's still fine, however, since the sockets are apparently not leaked. +# assert conn_count_end <= conn_count_start # system resources are released diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 34e4243c4..7154c7b8d 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -1,1301 +1,1301 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""System tests for pandas connector.""" - -import collections -import datetime -import decimal -import json -import io -import operator -import warnings - -import google.api_core.retry -import pkg_resources -import pytest - -from google.cloud import bigquery -from google.cloud import bigquery_storage -from google.cloud.bigquery import enums - -from . 
import helpers - - -pandas = pytest.importorskip("pandas", minversion="0.23.0") -numpy = pytest.importorskip("numpy") - - -PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version -PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") - - -class MissingDataError(Exception): - pass - - -def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id): - """Test that a DataFrame with dtypes that map well to BigQuery types - can be uploaded without specifying a schema. - - https://github.com/googleapis/google-cloud-python/issues/9044 - """ - df_data = collections.OrderedDict( - [ - ("bool_col", pandas.Series([True, False, True], dtype="bool")), - ( - "ts_col", - pandas.Series( - [ - datetime.datetime(2010, 1, 2, 3, 44, 50), - datetime.datetime(2011, 2, 3, 14, 50, 59), - datetime.datetime(2012, 3, 14, 15, 16), - ], - dtype="datetime64[ns]", - ).dt.tz_localize(datetime.timezone.utc), - ), - ( - "dt_col_no_tz", - pandas.Series( - [ - datetime.datetime(2010, 1, 2, 3, 44, 50), - datetime.datetime(2011, 2, 3, 14, 50, 59), - datetime.datetime(2012, 3, 14, 15, 16), - ], - dtype="datetime64[ns]", - ), - ), - ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), - ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), - ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), - ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), - ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), - ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), - ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), - ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), - ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), - ( - "date_col", - pandas.Series( - [ - datetime.date(2010, 1, 2), - datetime.date(2011, 2, 3), - datetime.date(2012, 3, 14), - ], - dtype="dbdate", - ), - ), - ( - "time_col", - pandas.Series( - [ - datetime.time(3, 44, 50), - datetime.time(14, 50, 59), - datetime.time(15, 16), - ], - dtype="dbtime", - ), - ), - ("array_bool_col", pandas.Series([[True], [False], [True]])), - ( - "array_ts_col", - pandas.Series( - [ - [ - datetime.datetime( - 2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc - ), - ], - [ - datetime.datetime( - 2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc - ), - ], - [ - datetime.datetime( - 2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc - ), - ], - ], - ), - ), - ( - "array_dt_col_no_tz", - pandas.Series( - [ - [datetime.datetime(2010, 1, 2, 3, 44, 50)], - [datetime.datetime(2011, 2, 3, 14, 50, 59)], - [datetime.datetime(2012, 3, 14, 15, 16)], - ], - ), - ), - ( - "array_float32_col", - pandas.Series( - [numpy.array([_], dtype="float32") for _ in [1.0, 2.0, 3.0]] - ), - ), - ( - "array_float64_col", - pandas.Series( - [numpy.array([_], dtype="float64") for _ in [4.0, 5.0, 6.0]] - ), - ), - ( - "array_int8_col", - pandas.Series( - [numpy.array([_], dtype="int8") for _ in [-12, -11, -10]] - ), - ), - ( - "array_int16_col", - pandas.Series([numpy.array([_], dtype="int16") for _ in [-9, -8, -7]]), - ), - ( - "array_int32_col", - pandas.Series([numpy.array([_], dtype="int32") for _ in [-6, -5, -4]]), - ), - ( - "array_int64_col", - pandas.Series([numpy.array([_], dtype="int64") for _ in [-3, -2, -1]]), - ), - ( - "array_uint8_col", - pandas.Series([numpy.array([_], dtype="uint8") for _ in [0, 1, 2]]), - ), - ( - "array_uint16_col", - pandas.Series([numpy.array([_], dtype="uint16") for _ in [3, 4, 5]]), - ), - ( - "array_uint32_col", - pandas.Series([numpy.array([_], 
dtype="uint32") for _ in [6, 7, 8]]), - ), - ] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - - table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( - bigquery_client.project, dataset_id - ) - - load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) - load_job.result() - - table = bigquery_client.get_table(table_id) - assert tuple(table.schema) == ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - bigquery.SchemaField("dt_col_no_tz", "DATETIME"), - bigquery.SchemaField("float32_col", "FLOAT"), - bigquery.SchemaField("float64_col", "FLOAT"), - bigquery.SchemaField("int8_col", "INTEGER"), - bigquery.SchemaField("int16_col", "INTEGER"), - bigquery.SchemaField("int32_col", "INTEGER"), - bigquery.SchemaField("int64_col", "INTEGER"), - bigquery.SchemaField("uint8_col", "INTEGER"), - bigquery.SchemaField("uint16_col", "INTEGER"), - bigquery.SchemaField("uint32_col", "INTEGER"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("array_bool_col", "BOOLEAN", mode="REPEATED"), - bigquery.SchemaField("array_ts_col", "TIMESTAMP", mode="REPEATED"), - bigquery.SchemaField("array_dt_col_no_tz", "DATETIME", mode="REPEATED"), - bigquery.SchemaField("array_float32_col", "FLOAT", mode="REPEATED"), - bigquery.SchemaField("array_float64_col", "FLOAT", mode="REPEATED"), - bigquery.SchemaField("array_int8_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_int16_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_int32_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_int64_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_uint8_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_uint16_col", "INTEGER", mode="REPEATED"), - bigquery.SchemaField("array_uint32_col", "INTEGER", mode="REPEATED"), - ) - - assert numpy.array( - sorted(map(list, bigquery_client.list_rows(table)), key=lambda r: r[5]), - dtype="object", - ).transpose().tolist() == [ - # bool_col - [True, False, True], - # ts_col - [ - datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc), - datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc), - datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc), - ], - # dt_col_no_tz - [ - datetime.datetime(2010, 1, 2, 3, 44, 50), - datetime.datetime(2011, 2, 3, 14, 50, 59), - datetime.datetime(2012, 3, 14, 15, 16), - ], - # float32_col - [1.0, 2.0, 3.0], - # float64_col - [4.0, 5.0, 6.0], - # int8_col - [-12, -11, -10], - # int16_col - [-9, -8, -7], - # int32_col - [-6, -5, -4], - # int64_col - [-3, -2, -1], - # uint8_col - [0, 1, 2], - # uint16_col - [3, 4, 5], - # uint32_col - [6, 7, 8], - # date_col - [ - datetime.date(2010, 1, 2), - datetime.date(2011, 2, 3), - datetime.date(2012, 3, 14), - ], - # time_col - [datetime.time(3, 44, 50), datetime.time(14, 50, 59), datetime.time(15, 16)], - # array_bool_col - [[True], [False], [True]], - # array_ts_col - [ - [datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc)], - [datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc)], - [datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc)], - ], - # array_dt_col - [ - [datetime.datetime(2010, 1, 2, 3, 44, 50)], - [datetime.datetime(2011, 2, 3, 14, 50, 59)], - [datetime.datetime(2012, 3, 14, 15, 16)], - ], - # array_float32_col - [[1.0], [2.0], [3.0]], - # array_float64_col - [[4.0], [5.0], 
[6.0]], - # array_int8_col - [[-12], [-11], [-10]], - # array_int16_col - [[-9], [-8], [-7]], - # array_int32_col - [[-6], [-5], [-4]], - # array_int64_col - [[-3], [-2], [-1]], - # array_uint8_col - [[0], [1], [2]], - # array_uint16_col - [[3], [4], [5]], - # array_uint32_col - [[6], [7], [8]], - ] - - -@pytest.mark.skipif( - PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, - reason="Only `pandas version >=1.0.0` is supported", -) -def test_load_table_from_dataframe_w_nullable_int64_datatype( - bigquery_client, dataset_id -): - """Test that a DataFrame containing column with None-type values and int64 datatype - can be uploaded if a BigQuery schema is specified. - - https://github.com/googleapis/python-bigquery/issues/22 - """ - table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( - bigquery_client.project, dataset_id - ) - table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) - table = helpers.retry_403(bigquery_client.create_table)( - bigquery.Table(table_id, schema=table_schema) - ) - - df_data = collections.OrderedDict( - [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) - load_job.result() - table = bigquery_client.get_table(table_id) - assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) - assert table.num_rows == 4 - - -@pytest.mark.skipif( - PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, - reason="Only `pandas version >=1.0.0` is supported", -) -def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema( - bigquery_client, dataset_id, table_id -): - """Test that a DataFrame containing column with None-type values and int64 datatype - can be uploaded without specifying a schema. - - https://github.com/googleapis/python-bigquery/issues/22 - """ - - df_data = collections.OrderedDict( - [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] - ) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) - load_job.result() - table = bigquery_client.get_table(table_id) - assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) - assert table.num_rows == 4 - - -def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): - """Test that a DataFrame with null columns can be uploaded if a - BigQuery schema is specified. - - See: https://github.com/googleapis/google-cloud-python/issues/7370 - """ - # Schema with all scalar types. 
- table_schema = ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("dt_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("geo_col", "GEOGRAPHY"), - bigquery.SchemaField("int_col", "INTEGER"), - bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("bignum_col", "BIGNUMERIC"), - bigquery.SchemaField("str_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - ) - - num_rows = 100 - nulls = [None] * num_rows - df_data = [ - ("bool_col", nulls), - ("bytes_col", nulls), - ("date_col", nulls), - ("dt_col", nulls), - ("float_col", nulls), - ("geo_col", nulls), - ("int_col", nulls), - ("num_col", nulls), - ("bignum_col", nulls), - ("str_col", nulls), - ("time_col", nulls), - ("ts_col", nulls), - ] - df_data = collections.OrderedDict(df_data) - dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - - table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( - bigquery_client.project, dataset_id - ) - - # Create the table before loading so that schema mismatch errors are - # identified. - table = helpers.retry_403(bigquery_client.create_table)( - bigquery.Table(table_id, schema=table_schema) - ) - - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = bigquery_client.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = bigquery_client.get_table(table) - assert tuple(table.schema) == table_schema - assert table.num_rows == num_rows - - -def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id): - """Test that a DataFrame with required columns can be uploaded if a - BigQuery schema is specified. - - See: https://github.com/googleapis/google-cloud-python/issues/8093 - """ - table_schema = ( - bigquery.SchemaField("name", "STRING", mode="REQUIRED"), - bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), - ) - - records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] - dataframe = pandas.DataFrame(records, columns=["name", "age"]) - job_config = bigquery.LoadJobConfig(schema=table_schema) - table_id = "{}.{}.load_table_from_dataframe_w_required".format( - bigquery_client.project, dataset_id - ) - - # Create the table before loading so that schema mismatch errors are - # identified. - table = helpers.retry_403(bigquery_client.create_table)( - bigquery.Table(table_id, schema=table_schema) - ) - - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = bigquery_client.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = bigquery_client.get_table(table) - assert tuple(table.schema) == table_schema - assert table.num_rows == 2 - - -def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id): - # Schema with all scalar types. 
- # See: - # https://github.com/googleapis/python-bigquery/issues/61 - # https://issuetracker.google.com/issues/151765076 - table_schema = ( - bigquery.SchemaField("row_num", "INTEGER"), - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("dt_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("geo_col", "GEOGRAPHY"), - bigquery.SchemaField("int_col", "INTEGER"), - bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("bignum_col", "BIGNUMERIC"), - bigquery.SchemaField("str_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - ) - - df_data = [ - ("row_num", [1, 2, 3]), - ("bool_col", [True, None, False]), - ("bytes_col", [b"abc", None, b"def"]), - ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), - ( - "dt_col", - [ - datetime.datetime(1, 1, 1, 0, 0, 0), - None, - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - ], - ), - ("float_col", [float("-inf"), float("nan"), float("inf")]), - ( - "geo_col", - ["POINT(30 10)", None, "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], - ), - ("int_col", [-9223372036854775808, None, 9223372036854775807]), - ( - "num_col", - [ - decimal.Decimal("-99999999999999999999999999999.999999999"), - None, - decimal.Decimal("99999999999999999999999999999.999999999"), - ], - ), - ( - "bignum_col", - [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - None, - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ], - ), - ("str_col", ["abc", None, "def"]), - ( - "time_col", - [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], - ), - ( - "ts_col", - [ - datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), - None, - datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc - ), - ], - ), - ] - df_data = collections.OrderedDict(df_data) - dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - - table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( - bigquery_client.project, dataset_id - ) - - job_config = bigquery.LoadJobConfig(schema=table_schema) - load_job = bigquery_client.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = bigquery_client.get_table(table_id) - assert tuple(table.schema) == table_schema - assert table.num_rows == 3 - - result = bigquery_client.list_rows(table).to_dataframe() - result.sort_values("row_num", inplace=True) - - # Check that extreme DATE/DATETIME values are loaded correctly. - # https://github.com/googleapis/python-bigquery/issues/1076 - assert result["date_col"][0] == datetime.date(1, 1, 1) - assert result["date_col"][2] == datetime.date(9999, 12, 31) - assert result["dt_col"][0] == datetime.datetime(1, 1, 1, 0, 0, 0) - assert result["dt_col"][2] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) - assert result["ts_col"][0] == datetime.datetime( - 1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc - ) - assert result["ts_col"][2] == datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc - ) - - -def test_load_table_from_dataframe_w_struct_datatype(bigquery_client, dataset_id): - """Test that a DataFrame with struct datatype can be uploaded if a - BigQuery schema is specified. 
- - https://github.com/googleapis/python-bigquery/issues/21 - """ - table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( - bigquery_client.project, dataset_id - ) - table_schema = [ - bigquery.SchemaField( - "bar", - "RECORD", - fields=[ - bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), - bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), - ], - mode="REQUIRED", - ), - ] - table = helpers.retry_403(bigquery_client.create_table)( - bigquery.Table(table_id, schema=table_schema) - ) - - df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] - dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) - - load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) - load_job.result() - - table = bigquery_client.get_table(table_id) - assert table.schema == table_schema - assert table.num_rows == 3 - - -def test_load_table_from_dataframe_w_explicit_schema_source_format_csv( - bigquery_client, dataset_id -): - from google.cloud.bigquery.job import SourceFormat - - table_schema = ( - bigquery.SchemaField("bool_col", "BOOLEAN"), - bigquery.SchemaField("bytes_col", "BYTES"), - bigquery.SchemaField("date_col", "DATE"), - bigquery.SchemaField("dt_col", "DATETIME"), - bigquery.SchemaField("float_col", "FLOAT"), - bigquery.SchemaField("geo_col", "GEOGRAPHY"), - bigquery.SchemaField("int_col", "INTEGER"), - bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("bignum_col", "BIGNUMERIC"), - bigquery.SchemaField("str_col", "STRING"), - bigquery.SchemaField("time_col", "TIME"), - bigquery.SchemaField("ts_col", "TIMESTAMP"), - ) - df_data = collections.OrderedDict( - [ - ("bool_col", [True, None, False]), - ("bytes_col", ["abc", None, "def"]), - ( - "date_col", - [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], - ), - ( - "dt_col", - [ - datetime.datetime(1, 1, 1, 0, 0, 0), - None, - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - ], - ), - ("float_col", [float("-inf"), float("nan"), float("inf")]), - ( - "geo_col", - [ - "POINT(30 10)", - None, - "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", - ], - ), - ("int_col", [-9223372036854775808, None, 9223372036854775807]), - ( - "num_col", - [ - decimal.Decimal("-99999999999999999999999999999.999999999"), - None, - decimal.Decimal("99999999999999999999999999999.999999999"), - ], - ), - ( - "bignum_col", - [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - None, - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ], - ), - ("str_col", ["abc", None, "def"]), - ( - "time_col", - [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], - ), - ( - "ts_col", - [ - datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), - None, - datetime.datetime( - 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc - ), - ], - ), - ] - ) - dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - - table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format( - bigquery_client.project, dataset_id - ) - - job_config = bigquery.LoadJobConfig( - schema=table_schema, source_format=SourceFormat.CSV - ) - load_job = bigquery_client.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = bigquery_client.get_table(table_id) - assert tuple(table.schema) == table_schema - assert table.num_rows == 3 - - -def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats( - bigquery_client, dataset_id, table_id -): - from 
google.cloud.bigquery.job import SourceFormat - - table_schema = (bigquery.SchemaField("float_col", "FLOAT"),) - df_data = collections.OrderedDict( - [ - ( - "float_col", - [ - 0.14285714285714285, - 0.51428571485748, - 0.87128748, - 1.807960649, - 2.0679610649, - 2.4406779661016949, - 3.7148514257, - 3.8571428571428572, - 1.51251252e40, - ], - ), - ] - ) - dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - - job_config = bigquery.LoadJobConfig( - schema=table_schema, source_format=SourceFormat.CSV - ) - load_job = bigquery_client.load_table_from_dataframe( - dataframe, table_id, job_config=job_config - ) - load_job.result() - - table = bigquery_client.get_table(table_id) - rows = bigquery_client.list_rows(table_id) - floats = [r.values()[0] for r in rows] - assert tuple(table.schema) == table_schema - assert table.num_rows == 9 - assert floats == df_data["float_col"] - - -def test_query_results_to_dataframe(bigquery_client): - QUERY = """ - SELECT id, author, time_ts, dead - FROM `bigquery-public-data.hacker_news.comments` - LIMIT 10 - """ - - df = bigquery_client.query(QUERY).result().to_dataframe() - - assert isinstance(df, pandas.DataFrame) - assert len(df) == 10 # verify the number of rows - column_names = ["id", "author", "time_ts", "dead"] - assert list(df) == column_names # verify the column names - exp_datatypes = { - "id": int, - "author": str, - "time_ts": pandas.Timestamp, - "dead": bool, - } - for _, row in df.iterrows(): - for col in column_names: - # all the schema fields are nullable, so None is acceptable - if not pandas.isna(row[col]): - assert isinstance(row[col], exp_datatypes[col]) - - -def test_query_results_to_dataframe_w_bqstorage(bigquery_client): - query = """ - SELECT id, author, time_ts, dead - FROM `bigquery-public-data.hacker_news.comments` - LIMIT 10 - """ - - bqstorage_client = bigquery_storage.BigQueryReadClient( - credentials=bigquery_client._credentials - ) - - df = bigquery_client.query(query).result().to_dataframe(bqstorage_client) - - assert isinstance(df, pandas.DataFrame) - assert len(df) == 10 # verify the number of rows - column_names = ["id", "author", "time_ts", "dead"] - assert list(df) == column_names - exp_datatypes = { - "id": int, - "author": str, - "time_ts": pandas.Timestamp, - "dead": bool, - } - for index, row in df.iterrows(): - for col in column_names: - # all the schema fields are nullable, so None is acceptable - if not pandas.isna(row[col]): - assert isinstance(row[col], exp_datatypes[col]) - - -def test_insert_rows_from_dataframe(bigquery_client, dataset_id): - SF = bigquery.SchemaField - schema = [ - SF("float_col", "FLOAT", mode="REQUIRED"), - SF("int_col", "INTEGER", mode="REQUIRED"), - SF("bool_col", "BOOLEAN", mode="REQUIRED"), - SF("string_col", "STRING", mode="NULLABLE"), - SF("date_col", "DATE", mode="NULLABLE"), - SF("time_col", "TIME", mode="NULLABLE"), - ] - - dataframe = pandas.DataFrame( - [ - { - "float_col": 1.11, - "bool_col": True, - "string_col": "my string", - "int_col": 10, - "date_col": datetime.date(2021, 1, 1), - "time_col": datetime.time(21, 1, 1), - }, - { - "float_col": 2.22, - "bool_col": False, - "string_col": "another string", - "int_col": 20, - "date_col": datetime.date(2021, 1, 2), - "time_col": datetime.time(21, 1, 2), - }, - { - "float_col": 3.33, - "bool_col": False, - "string_col": "another string", - "int_col": 30, - "date_col": datetime.date(2021, 1, 3), - "time_col": datetime.time(21, 1, 3), - }, - { - "float_col": 4.44, - "bool_col": True, - "string_col": "another 
string", - "int_col": 40, - "date_col": datetime.date(2021, 1, 4), - "time_col": datetime.time(21, 1, 4), - }, - { - "float_col": 5.55, - "bool_col": False, - "string_col": "another string", - "int_col": 50, - "date_col": datetime.date(2021, 1, 5), - "time_col": datetime.time(21, 1, 5), - }, - { - "float_col": 6.66, - "bool_col": True, - # Include a NaN value, because pandas often uses NaN as a - # NULL value indicator. - "string_col": float("NaN"), - "int_col": 60, - "date_col": datetime.date(2021, 1, 6), - "time_col": datetime.time(21, 1, 6), - }, - ] - ) - dataframe["date_col"] = dataframe["date_col"].astype("dbdate") - dataframe["time_col"] = dataframe["time_col"].astype("dbtime") - - table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe" - table_arg = bigquery.Table(table_id, schema=schema) - table = helpers.retry_403(bigquery_client.create_table)(table_arg) - - chunk_errors = bigquery_client.insert_rows_from_dataframe( - table, dataframe, chunk_size=3 - ) - for errors in chunk_errors: - assert not errors - expected = [ - # Pandas often represents NULL values as NaN. Convert to None for - # easier comparison. - tuple(None if col != col else col for col in data_row) - for data_row in dataframe.itertuples(index=False) - ] - - # Use query to fetch rows instead of listing directly from the table so - # that we get values from the streaming buffer "within a few seconds". - # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability - @google.api_core.retry.Retry( - predicate=google.api_core.retry.if_exception_type(MissingDataError) - ) - def get_rows(): - rows = list( - bigquery_client.query( - "SELECT * FROM `{}.{}.{}`".format( - table.project, table.dataset_id, table.table_id - ) - ) - ) - if len(rows) != len(expected): - raise MissingDataError() - return rows - - rows = get_rows() - sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) - row_tuples = [r.values() for r in sorted_rows] - - for row, expected_row in zip(row_tuples, expected): - assert ( - # Use Counter to verify the same number of values in each, because - # column order does not matter. 
- collections.Counter(row) - == collections.Counter(expected_row) - ) - - -def test_nested_table_to_dataframe(bigquery_client, dataset_id): - from google.cloud.bigquery.job import SourceFormat - from google.cloud.bigquery.job import WriteDisposition - - SF = bigquery.SchemaField - schema = [ - SF("string_col", "STRING", mode="NULLABLE"), - SF( - "record_col", - "RECORD", - mode="NULLABLE", - fields=[ - SF("nested_string", "STRING", mode="NULLABLE"), - SF("nested_repeated", "INTEGER", mode="REPEATED"), - SF( - "nested_record", - "RECORD", - mode="NULLABLE", - fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], - ), - ], - ), - SF("bigfloat_col", "FLOAT", mode="NULLABLE"), - SF("smallfloat_col", "FLOAT", mode="NULLABLE"), - ] - record = { - "nested_string": "another string value", - "nested_repeated": [0, 1, 2], - "nested_record": {"nested_nested_string": "some deep insight"}, - } - to_insert = [ - { - "string_col": "Some value", - "record_col": record, - "bigfloat_col": 3.14, - "smallfloat_col": 2.72, - } - ] - rows = [json.dumps(row) for row in to_insert] - body = io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) - table_id = f"{bigquery_client.project}.{dataset_id}.test_nested_table_to_dataframe" - job_config = bigquery.LoadJobConfig() - job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE - job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON - job_config.schema = schema - # Load a table using a local JSON file from memory. - bigquery_client.load_table_from_file(body, table_id, job_config=job_config).result() - - df = bigquery_client.list_rows(table_id, selected_fields=schema).to_dataframe( - dtypes={"smallfloat_col": "float16"} - ) - - assert isinstance(df, pandas.DataFrame) - assert len(df) == 1 # verify the number of rows - exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] - assert list(df) == exp_columns # verify the column names - row = df.iloc[0] - # verify the row content - assert row["string_col"] == "Some value" - expected_keys = tuple(sorted(record.keys())) - row_keys = tuple(sorted(row["record_col"].keys())) - assert row_keys == expected_keys - # Can't compare numpy arrays, which pyarrow encodes the embedded - # repeated column to, so convert to list. - assert list(row["record_col"]["nested_repeated"]) == [0, 1, 2] - # verify that nested data can be accessed with indices/keys - assert row["record_col"]["nested_repeated"][0] == 0 - assert ( - row["record_col"]["nested_record"]["nested_nested_string"] - == "some deep insight" - ) - # verify dtypes - assert df.dtypes["bigfloat_col"].name == "float64" - assert df.dtypes["smallfloat_col"].name == "float16" - - -def test_list_rows_max_results_w_bqstorage(bigquery_client): - table_ref = bigquery.DatasetReference("bigquery-public-data", "utility_us").table( - "country_code_iso" - ) - bqstorage_client = bigquery_storage.BigQueryReadClient( - credentials=bigquery_client._credentials - ) - - row_iterator = bigquery_client.list_rows( - table_ref, - selected_fields=[bigquery.SchemaField("country_name", "STRING")], - max_results=100, - ) - with pytest.warns( - UserWarning, match="Cannot use bqstorage_client if max_results is set" - ): - dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) - - assert len(dataframe.index) == 100 - - -@pytest.mark.parametrize( - ("max_results",), - ( - (None,), - (10,), - ), # Use BQ Storage API. # Use REST API. 
-) -def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): - # TODO(GH#836): Avoid INTERVAL columns until they are supported by the - # BigQuery Storage API and pyarrow. - schema = [ - bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), - bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), - bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), - bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), - bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), - bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), - bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), - bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), - bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), - bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), - bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), - bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), - ] - - df = bigquery_client.list_rows( - scalars_table, - max_results=max_results, - selected_fields=schema, - ).to_dataframe() - - assert df.dtypes["bool_col"].name == "boolean" - assert df.dtypes["datetime_col"].name == "datetime64[ns]" - assert df.dtypes["float64_col"].name == "float64" - assert df.dtypes["int64_col"].name == "Int64" - assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" - assert df.dtypes["date_col"].name == "dbdate" - assert df.dtypes["time_col"].name == "dbtime" - - # decimal.Decimal is used to avoid loss of precision. - assert df.dtypes["bignumeric_col"].name == "object" - assert df.dtypes["numeric_col"].name == "object" - - # pandas uses Python string and bytes objects. - assert df.dtypes["bytes_col"].name == "object" - assert df.dtypes["string_col"].name == "object" - - -@pytest.mark.parametrize( - ("max_results",), - ( - (None,), - (10,), - ), # Use BQ Storage API. # Use REST API. -) -def test_list_rows_nullable_scalars_extreme_dtypes( - bigquery_client, scalars_extreme_table, max_results -): - # TODO(GH#836): Avoid INTERVAL columns until they are supported by the - # BigQuery Storage API and pyarrow. - schema = [ - bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), - bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), - bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), - bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), - bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), - bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), - bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), - bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), - bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), - bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), - bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), - bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), - ] - - df = bigquery_client.list_rows( - scalars_extreme_table, - max_results=max_results, - selected_fields=schema, - ).to_dataframe() - - # Extreme values are out-of-bounds for pandas datetime64 values, which use - # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must - # be represented with object. 
- # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations - assert df.dtypes["date_col"].name == "object" - assert df.dtypes["datetime_col"].name == "object" - assert df.dtypes["timestamp_col"].name == "object" - - # These pandas dtypes can handle the same ranges as BigQuery. - assert df.dtypes["bool_col"].name == "boolean" - assert df.dtypes["float64_col"].name == "float64" - assert df.dtypes["int64_col"].name == "Int64" - assert df.dtypes["time_col"].name == "dbtime" - - # decimal.Decimal is used to avoid loss of precision. - assert df.dtypes["numeric_col"].name == "object" - assert df.dtypes["bignumeric_col"].name == "object" - - # pandas uses Python string and bytes objects. - assert df.dtypes["bytes_col"].name == "object" - assert df.dtypes["string_col"].name == "object" - - -def test_upload_time_and_datetime_56(bigquery_client, dataset_id): - df = pandas.DataFrame( - dict( - dt=[ - datetime.datetime(2020, 1, 8, 8, 0, 0), - datetime.datetime( - 2020, - 1, - 8, - 8, - 0, - 0, - tzinfo=datetime.timezone(datetime.timedelta(hours=-7)), - ), - ], - t=[datetime.time(0, 0, 10, 100001), None], - ) - ) - table = f"{dataset_id}.test_upload_time_and_datetime" - bigquery_client.load_table_from_dataframe(df, table).result() - data = list(map(list, bigquery_client.list_rows(table))) - assert data == [ - [ - datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc), - datetime.time(0, 0, 10, 100001), - ], - [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None], - ] - - from google.cloud.bigquery import job, schema - - table = f"{dataset_id}.test_upload_time_and_datetime_dt" - config = job.LoadJobConfig( - schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")] - ) - - bigquery_client.load_table_from_dataframe(df, table, job_config=config).result() - data = list(map(list, bigquery_client.list_rows(table))) - assert data == [ - [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)], - [datetime.datetime(2020, 1, 8, 15, 0), None], - ] - - -def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): - wkt = pytest.importorskip("shapely.wkt") - bigquery_client.query( - f"create table {dataset_id}.lake (name string, geog geography)" - ).result() - bigquery_client.query( - f""" - insert into {dataset_id}.lake (name, geog) values - ('foo', st_geogfromtext('point(0 0)')), - ('bar', st_geogfromtext('point(0 1)')), - ('baz', null) - """ - ).result() - df = bigquery_client.query( - f"select * from {dataset_id}.lake order by name" - ).to_dataframe(geography_as_object=True) - assert list(df["name"]) == ["bar", "baz", "foo"] - assert df["geog"][0] == wkt.loads("point(0 1)") - assert pandas.isna(df["geog"][1]) - assert df["geog"][2] == wkt.loads("point(0 0)") - - -def test_to_geodataframe(bigquery_client, dataset_id): - geopandas = pytest.importorskip("geopandas") - from shapely import wkt - - bigquery_client.query( - f"create table {dataset_id}.geolake (name string, geog geography)" - ).result() - bigquery_client.query( - f""" - insert into {dataset_id}.geolake (name, geog) values - ('foo', st_geogfromtext('point(0 0)')), - ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), - ('baz', null) - """ - ).result() - df = bigquery_client.query( - f"select * from {dataset_id}.geolake order by name" - ).to_geodataframe() - assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") - assert pandas.isna(df["geog"][1]) - assert df["geog"][2] == wkt.loads("point(0 0)") - assert isinstance(df, 
geopandas.GeoDataFrame) - assert isinstance(df["geog"], geopandas.GeoSeries) - - with warnings.catch_warnings(): - # Computing the area on a GeoDataFrame that uses a geographic Coordinate - # Reference System (CRS) produces a warning that we are not interested in. - # We do not mind if the computed area is incorrect with respect to the - # GeoDataFrame data, as long as it matches the expected "incorrect" value. - warnings.filterwarnings("ignore", category=UserWarning) - assert df.area[0] == 0.5 - assert pandas.isna(df.area[1]) - assert df.area[2] == 0.0 - - assert df.crs.srs == "EPSG:4326" - assert df.crs.name == "WGS 84" - assert df.geog.crs.srs == "EPSG:4326" - assert df.geog.crs.name == "WGS 84" - - -def test_load_geodataframe(bigquery_client, dataset_id): - geopandas = pytest.importorskip("geopandas") - import pandas - from shapely import wkt - from google.cloud.bigquery.schema import SchemaField - - df = geopandas.GeoDataFrame( - pandas.DataFrame( - dict( - name=["foo", "bar"], - geo1=[None, None], - geo2=[None, wkt.loads("Point(1 1)")], - ) - ), - geometry="geo1", - ) - - table_id = f"{dataset_id}.lake_from_gp" - bigquery_client.load_table_from_dataframe(df, table_id).result() - - table = bigquery_client.get_table(table_id) - assert table.schema == [ - SchemaField("name", "STRING", "NULLABLE"), - SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), - SchemaField("geo2", "GEOGRAPHY", "NULLABLE"), - ] - assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ["bar", None, "POINT(1 1)"], - ["foo", None, None], - ] - - -def test_load_dataframe_w_shapely(bigquery_client, dataset_id): - wkt = pytest.importorskip("shapely.wkt") - from google.cloud.bigquery.schema import SchemaField - - df = pandas.DataFrame( - dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) - ) - - table_id = f"{dataset_id}.lake_from_shapes" - bigquery_client.load_table_from_dataframe(df, table_id).result() - - table = bigquery_client.get_table(table_id) - assert table.schema == [ - SchemaField("name", "STRING", "NULLABLE"), - SchemaField("geo", "GEOGRAPHY", "NULLABLE"), - ] - assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ["bar", "POINT(1 1)"], - ["foo", None], - ] - - bigquery_client.load_table_from_dataframe(df, table_id).result() - assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ["bar", "POINT(1 1)"], - ["bar", "POINT(1 1)"], - ["foo", None], - ["foo", None], - ] - - -def test_load_dataframe_w_wkb(bigquery_client, dataset_id): - wkt = pytest.importorskip("shapely.wkt") - from shapely import wkb - from google.cloud.bigquery.schema import SchemaField - - df = pandas.DataFrame( - dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) - ) - - table_id = f"{dataset_id}.lake_from_wkb" - # We create the table first, to inform the interpretation of the wkb data - bigquery_client.query( - f"create table {table_id} (name string, geo GEOGRAPHY)" - ).result() - bigquery_client.load_table_from_dataframe(df, table_id).result() - - table = bigquery_client.get_table(table_id) - assert table.schema == [ - SchemaField("name", "STRING", "NULLABLE"), - SchemaField("geo", "GEOGRAPHY", "NULLABLE"), - ] - assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ - ["bar", "POINT(1 1)"], - ["foo", None], - ] +# # Copyright 2021 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. 
+# # You may obtain a copy of the License at +# # +# # https://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# """System tests for pandas connector.""" + +# import collections +# import datetime +# import decimal +# import json +# import io +# import operator +# import warnings + +# import google.api_core.retry +# import pkg_resources +# import pytest + +# from google.cloud import bigquery +# from google.cloud import bigquery_storage +# from google.cloud.bigquery import enums + +# from . import helpers + + +# pandas = pytest.importorskip("pandas", minversion="0.23.0") +# numpy = pytest.importorskip("numpy") + + +# PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version +# PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") + + +# class MissingDataError(Exception): +# pass + + +# def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id): +# """Test that a DataFrame with dtypes that map well to BigQuery types +# can be uploaded without specifying a schema. + +# https://github.com/googleapis/google-cloud-python/issues/9044 +# """ +# df_data = collections.OrderedDict( +# [ +# ("bool_col", pandas.Series([True, False, True], dtype="bool")), +# ( +# "ts_col", +# pandas.Series( +# [ +# datetime.datetime(2010, 1, 2, 3, 44, 50), +# datetime.datetime(2011, 2, 3, 14, 50, 59), +# datetime.datetime(2012, 3, 14, 15, 16), +# ], +# dtype="datetime64[ns]", +# ).dt.tz_localize(datetime.timezone.utc), +# ), +# ( +# "dt_col_no_tz", +# pandas.Series( +# [ +# datetime.datetime(2010, 1, 2, 3, 44, 50), +# datetime.datetime(2011, 2, 3, 14, 50, 59), +# datetime.datetime(2012, 3, 14, 15, 16), +# ], +# dtype="datetime64[ns]", +# ), +# ), +# ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), +# ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), +# ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), +# ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), +# ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), +# ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), +# ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), +# ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), +# ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), +# ( +# "date_col", +# pandas.Series( +# [ +# datetime.date(2010, 1, 2), +# datetime.date(2011, 2, 3), +# datetime.date(2012, 3, 14), +# ], +# dtype="dbdate", +# ), +# ), +# ( +# "time_col", +# pandas.Series( +# [ +# datetime.time(3, 44, 50), +# datetime.time(14, 50, 59), +# datetime.time(15, 16), +# ], +# dtype="dbtime", +# ), +# ), +# ("array_bool_col", pandas.Series([[True], [False], [True]])), +# ( +# "array_ts_col", +# pandas.Series( +# [ +# [ +# datetime.datetime( +# 2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc +# ), +# ], +# [ +# datetime.datetime( +# 2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc +# ), +# ], +# [ +# datetime.datetime( +# 2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc +# ), +# ], +# ], +# ), +# ), +# ( +# "array_dt_col_no_tz", +# pandas.Series( +# [ +# [datetime.datetime(2010, 1, 2, 3, 44, 50)], +# [datetime.datetime(2011, 2, 3, 14, 50, 59)], +# [datetime.datetime(2012, 3, 14, 15, 16)], +# ], +# ), +# ), +# 
( +# "array_float32_col", +# pandas.Series( +# [numpy.array([_], dtype="float32") for _ in [1.0, 2.0, 3.0]] +# ), +# ), +# ( +# "array_float64_col", +# pandas.Series( +# [numpy.array([_], dtype="float64") for _ in [4.0, 5.0, 6.0]] +# ), +# ), +# ( +# "array_int8_col", +# pandas.Series( +# [numpy.array([_], dtype="int8") for _ in [-12, -11, -10]] +# ), +# ), +# ( +# "array_int16_col", +# pandas.Series([numpy.array([_], dtype="int16") for _ in [-9, -8, -7]]), +# ), +# ( +# "array_int32_col", +# pandas.Series([numpy.array([_], dtype="int32") for _ in [-6, -5, -4]]), +# ), +# ( +# "array_int64_col", +# pandas.Series([numpy.array([_], dtype="int64") for _ in [-3, -2, -1]]), +# ), +# ( +# "array_uint8_col", +# pandas.Series([numpy.array([_], dtype="uint8") for _ in [0, 1, 2]]), +# ), +# ( +# "array_uint16_col", +# pandas.Series([numpy.array([_], dtype="uint16") for _ in [3, 4, 5]]), +# ), +# ( +# "array_uint32_col", +# pandas.Series([numpy.array([_], dtype="uint32") for _ in [6, 7, 8]]), +# ), +# ] +# ) +# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + +# table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( +# bigquery_client.project, dataset_id +# ) + +# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) +# load_job.result() + +# table = bigquery_client.get_table(table_id) +# assert tuple(table.schema) == ( +# bigquery.SchemaField("bool_col", "BOOLEAN"), +# bigquery.SchemaField("ts_col", "TIMESTAMP"), +# bigquery.SchemaField("dt_col_no_tz", "DATETIME"), +# bigquery.SchemaField("float32_col", "FLOAT"), +# bigquery.SchemaField("float64_col", "FLOAT"), +# bigquery.SchemaField("int8_col", "INTEGER"), +# bigquery.SchemaField("int16_col", "INTEGER"), +# bigquery.SchemaField("int32_col", "INTEGER"), +# bigquery.SchemaField("int64_col", "INTEGER"), +# bigquery.SchemaField("uint8_col", "INTEGER"), +# bigquery.SchemaField("uint16_col", "INTEGER"), +# bigquery.SchemaField("uint32_col", "INTEGER"), +# bigquery.SchemaField("date_col", "DATE"), +# bigquery.SchemaField("time_col", "TIME"), +# bigquery.SchemaField("array_bool_col", "BOOLEAN", mode="REPEATED"), +# bigquery.SchemaField("array_ts_col", "TIMESTAMP", mode="REPEATED"), +# bigquery.SchemaField("array_dt_col_no_tz", "DATETIME", mode="REPEATED"), +# bigquery.SchemaField("array_float32_col", "FLOAT", mode="REPEATED"), +# bigquery.SchemaField("array_float64_col", "FLOAT", mode="REPEATED"), +# bigquery.SchemaField("array_int8_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_int16_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_int32_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_int64_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_uint8_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_uint16_col", "INTEGER", mode="REPEATED"), +# bigquery.SchemaField("array_uint32_col", "INTEGER", mode="REPEATED"), +# ) + +# assert numpy.array( +# sorted(map(list, bigquery_client.list_rows(table)), key=lambda r: r[5]), +# dtype="object", +# ).transpose().tolist() == [ +# # bool_col +# [True, False, True], +# # ts_col +# [ +# datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc), +# datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc), +# datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc), +# ], +# # dt_col_no_tz +# [ +# datetime.datetime(2010, 1, 2, 3, 44, 50), +# datetime.datetime(2011, 2, 3, 14, 50, 59), +# datetime.datetime(2012, 3, 14, 15, 16), +# ], +# # float32_col +# 
[1.0, 2.0, 3.0], +# # float64_col +# [4.0, 5.0, 6.0], +# # int8_col +# [-12, -11, -10], +# # int16_col +# [-9, -8, -7], +# # int32_col +# [-6, -5, -4], +# # int64_col +# [-3, -2, -1], +# # uint8_col +# [0, 1, 2], +# # uint16_col +# [3, 4, 5], +# # uint32_col +# [6, 7, 8], +# # date_col +# [ +# datetime.date(2010, 1, 2), +# datetime.date(2011, 2, 3), +# datetime.date(2012, 3, 14), +# ], +# # time_col +# [datetime.time(3, 44, 50), datetime.time(14, 50, 59), datetime.time(15, 16)], +# # array_bool_col +# [[True], [False], [True]], +# # array_ts_col +# [ +# [datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc)], +# [datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc)], +# [datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc)], +# ], +# # array_dt_col +# [ +# [datetime.datetime(2010, 1, 2, 3, 44, 50)], +# [datetime.datetime(2011, 2, 3, 14, 50, 59)], +# [datetime.datetime(2012, 3, 14, 15, 16)], +# ], +# # array_float32_col +# [[1.0], [2.0], [3.0]], +# # array_float64_col +# [[4.0], [5.0], [6.0]], +# # array_int8_col +# [[-12], [-11], [-10]], +# # array_int16_col +# [[-9], [-8], [-7]], +# # array_int32_col +# [[-6], [-5], [-4]], +# # array_int64_col +# [[-3], [-2], [-1]], +# # array_uint8_col +# [[0], [1], [2]], +# # array_uint16_col +# [[3], [4], [5]], +# # array_uint32_col +# [[6], [7], [8]], +# ] + + +# @pytest.mark.skipif( +# PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, +# reason="Only `pandas version >=1.0.0` is supported", +# ) +# def test_load_table_from_dataframe_w_nullable_int64_datatype( +# bigquery_client, dataset_id +# ): +# """Test that a DataFrame containing column with None-type values and int64 datatype +# can be uploaded if a BigQuery schema is specified. + +# https://github.com/googleapis/python-bigquery/issues/22 +# """ +# table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( +# bigquery_client.project, dataset_id +# ) +# table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) +# table = helpers.retry_403(bigquery_client.create_table)( +# bigquery.Table(table_id, schema=table_schema) +# ) + +# df_data = collections.OrderedDict( +# [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] +# ) +# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) +# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) +# load_job.result() +# table = bigquery_client.get_table(table_id) +# assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) +# assert table.num_rows == 4 + + +# @pytest.mark.skipif( +# PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, +# reason="Only `pandas version >=1.0.0` is supported", +# ) +# def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema( +# bigquery_client, dataset_id, table_id +# ): +# """Test that a DataFrame containing column with None-type values and int64 datatype +# can be uploaded without specifying a schema. 
+ +# https://github.com/googleapis/python-bigquery/issues/22 +# """ + +# df_data = collections.OrderedDict( +# [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] +# ) +# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) +# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) +# load_job.result() +# table = bigquery_client.get_table(table_id) +# assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) +# assert table.num_rows == 4 + + +# def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): +# """Test that a DataFrame with null columns can be uploaded if a +# BigQuery schema is specified. + +# See: https://github.com/googleapis/google-cloud-python/issues/7370 +# """ +# # Schema with all scalar types. +# table_schema = ( +# bigquery.SchemaField("bool_col", "BOOLEAN"), +# bigquery.SchemaField("bytes_col", "BYTES"), +# bigquery.SchemaField("date_col", "DATE"), +# bigquery.SchemaField("dt_col", "DATETIME"), +# bigquery.SchemaField("float_col", "FLOAT"), +# bigquery.SchemaField("geo_col", "GEOGRAPHY"), +# bigquery.SchemaField("int_col", "INTEGER"), +# bigquery.SchemaField("num_col", "NUMERIC"), +# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), +# bigquery.SchemaField("str_col", "STRING"), +# bigquery.SchemaField("time_col", "TIME"), +# bigquery.SchemaField("ts_col", "TIMESTAMP"), +# ) + +# num_rows = 100 +# nulls = [None] * num_rows +# df_data = [ +# ("bool_col", nulls), +# ("bytes_col", nulls), +# ("date_col", nulls), +# ("dt_col", nulls), +# ("float_col", nulls), +# ("geo_col", nulls), +# ("int_col", nulls), +# ("num_col", nulls), +# ("bignum_col", nulls), +# ("str_col", nulls), +# ("time_col", nulls), +# ("ts_col", nulls), +# ] +# df_data = collections.OrderedDict(df_data) +# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + +# table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( +# bigquery_client.project, dataset_id +# ) + +# # Create the table before loading so that schema mismatch errors are +# # identified. +# table = helpers.retry_403(bigquery_client.create_table)( +# bigquery.Table(table_id, schema=table_schema) +# ) + +# job_config = bigquery.LoadJobConfig(schema=table_schema) +# load_job = bigquery_client.load_table_from_dataframe( +# dataframe, table_id, job_config=job_config +# ) +# load_job.result() + +# table = bigquery_client.get_table(table) +# assert tuple(table.schema) == table_schema +# assert table.num_rows == num_rows + + +# def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id): +# """Test that a DataFrame with required columns can be uploaded if a +# BigQuery schema is specified. + +# See: https://github.com/googleapis/google-cloud-python/issues/8093 +# """ +# table_schema = ( +# bigquery.SchemaField("name", "STRING", mode="REQUIRED"), +# bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), +# ) + +# records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] +# dataframe = pandas.DataFrame(records, columns=["name", "age"]) +# job_config = bigquery.LoadJobConfig(schema=table_schema) +# table_id = "{}.{}.load_table_from_dataframe_w_required".format( +# bigquery_client.project, dataset_id +# ) + +# # Create the table before loading so that schema mismatch errors are +# # identified. 
+# table = helpers.retry_403(bigquery_client.create_table)( +# bigquery.Table(table_id, schema=table_schema) +# ) + +# job_config = bigquery.LoadJobConfig(schema=table_schema) +# load_job = bigquery_client.load_table_from_dataframe( +# dataframe, table_id, job_config=job_config +# ) +# load_job.result() + +# table = bigquery_client.get_table(table) +# assert tuple(table.schema) == table_schema +# assert table.num_rows == 2 + + +# def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id): +# # Schema with all scalar types. +# # See: +# # https://github.com/googleapis/python-bigquery/issues/61 +# # https://issuetracker.google.com/issues/151765076 +# table_schema = ( +# bigquery.SchemaField("row_num", "INTEGER"), +# bigquery.SchemaField("bool_col", "BOOLEAN"), +# bigquery.SchemaField("bytes_col", "BYTES"), +# bigquery.SchemaField("date_col", "DATE"), +# bigquery.SchemaField("dt_col", "DATETIME"), +# bigquery.SchemaField("float_col", "FLOAT"), +# bigquery.SchemaField("geo_col", "GEOGRAPHY"), +# bigquery.SchemaField("int_col", "INTEGER"), +# bigquery.SchemaField("num_col", "NUMERIC"), +# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), +# bigquery.SchemaField("str_col", "STRING"), +# bigquery.SchemaField("time_col", "TIME"), +# bigquery.SchemaField("ts_col", "TIMESTAMP"), +# ) + +# df_data = [ +# ("row_num", [1, 2, 3]), +# ("bool_col", [True, None, False]), +# ("bytes_col", [b"abc", None, b"def"]), +# ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), +# ( +# "dt_col", +# [ +# datetime.datetime(1, 1, 1, 0, 0, 0), +# None, +# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), +# ], +# ), +# ("float_col", [float("-inf"), float("nan"), float("inf")]), +# ( +# "geo_col", +# ["POINT(30 10)", None, "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], +# ), +# ("int_col", [-9223372036854775808, None, 9223372036854775807]), +# ( +# "num_col", +# [ +# decimal.Decimal("-99999999999999999999999999999.999999999"), +# None, +# decimal.Decimal("99999999999999999999999999999.999999999"), +# ], +# ), +# ( +# "bignum_col", +# [ +# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), +# None, +# decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), +# ], +# ), +# ("str_col", ["abc", None, "def"]), +# ( +# "time_col", +# [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], +# ), +# ( +# "ts_col", +# [ +# datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), +# None, +# datetime.datetime( +# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc +# ), +# ], +# ), +# ] +# df_data = collections.OrderedDict(df_data) +# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + +# table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( +# bigquery_client.project, dataset_id +# ) + +# job_config = bigquery.LoadJobConfig(schema=table_schema) +# load_job = bigquery_client.load_table_from_dataframe( +# dataframe, table_id, job_config=job_config +# ) +# load_job.result() + +# table = bigquery_client.get_table(table_id) +# assert tuple(table.schema) == table_schema +# assert table.num_rows == 3 + +# result = bigquery_client.list_rows(table).to_dataframe() +# result.sort_values("row_num", inplace=True) + +# # Check that extreme DATE/DATETIME values are loaded correctly. 
+# # https://github.com/googleapis/python-bigquery/issues/1076 +# assert result["date_col"][0] == datetime.date(1, 1, 1) +# assert result["date_col"][2] == datetime.date(9999, 12, 31) +# assert result["dt_col"][0] == datetime.datetime(1, 1, 1, 0, 0, 0) +# assert result["dt_col"][2] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) +# assert result["ts_col"][0] == datetime.datetime( +# 1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc +# ) +# assert result["ts_col"][2] == datetime.datetime( +# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc +# ) + + +# def test_load_table_from_dataframe_w_struct_datatype(bigquery_client, dataset_id): +# """Test that a DataFrame with struct datatype can be uploaded if a +# BigQuery schema is specified. + +# https://github.com/googleapis/python-bigquery/issues/21 +# """ +# table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( +# bigquery_client.project, dataset_id +# ) +# table_schema = [ +# bigquery.SchemaField( +# "bar", +# "RECORD", +# fields=[ +# bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), +# bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), +# ], +# mode="REQUIRED", +# ), +# ] +# table = helpers.retry_403(bigquery_client.create_table)( +# bigquery.Table(table_id, schema=table_schema) +# ) + +# df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] +# dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) + +# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) +# load_job.result() + +# table = bigquery_client.get_table(table_id) +# assert table.schema == table_schema +# assert table.num_rows == 3 + + +# def test_load_table_from_dataframe_w_explicit_schema_source_format_csv( +# bigquery_client, dataset_id +# ): +# from google.cloud.bigquery.job import SourceFormat + +# table_schema = ( +# bigquery.SchemaField("bool_col", "BOOLEAN"), +# bigquery.SchemaField("bytes_col", "BYTES"), +# bigquery.SchemaField("date_col", "DATE"), +# bigquery.SchemaField("dt_col", "DATETIME"), +# bigquery.SchemaField("float_col", "FLOAT"), +# bigquery.SchemaField("geo_col", "GEOGRAPHY"), +# bigquery.SchemaField("int_col", "INTEGER"), +# bigquery.SchemaField("num_col", "NUMERIC"), +# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), +# bigquery.SchemaField("str_col", "STRING"), +# bigquery.SchemaField("time_col", "TIME"), +# bigquery.SchemaField("ts_col", "TIMESTAMP"), +# ) +# df_data = collections.OrderedDict( +# [ +# ("bool_col", [True, None, False]), +# ("bytes_col", ["abc", None, "def"]), +# ( +# "date_col", +# [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], +# ), +# ( +# "dt_col", +# [ +# datetime.datetime(1, 1, 1, 0, 0, 0), +# None, +# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), +# ], +# ), +# ("float_col", [float("-inf"), float("nan"), float("inf")]), +# ( +# "geo_col", +# [ +# "POINT(30 10)", +# None, +# "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", +# ], +# ), +# ("int_col", [-9223372036854775808, None, 9223372036854775807]), +# ( +# "num_col", +# [ +# decimal.Decimal("-99999999999999999999999999999.999999999"), +# None, +# decimal.Decimal("99999999999999999999999999999.999999999"), +# ], +# ), +# ( +# "bignum_col", +# [ +# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), +# None, +# decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), +# ], +# ), +# ("str_col", ["abc", None, "def"]), +# ( +# "time_col", +# [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], +# ), +# ( +# "ts_col", +# [ +# datetime.datetime(1, 1, 1, 0, 
0, 0, tzinfo=datetime.timezone.utc), +# None, +# datetime.datetime( +# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc +# ), +# ], +# ), +# ] +# ) +# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + +# table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format( +# bigquery_client.project, dataset_id +# ) + +# job_config = bigquery.LoadJobConfig( +# schema=table_schema, source_format=SourceFormat.CSV +# ) +# load_job = bigquery_client.load_table_from_dataframe( +# dataframe, table_id, job_config=job_config +# ) +# load_job.result() + +# table = bigquery_client.get_table(table_id) +# assert tuple(table.schema) == table_schema +# assert table.num_rows == 3 + + +# def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats( +# bigquery_client, dataset_id, table_id +# ): +# from google.cloud.bigquery.job import SourceFormat + +# table_schema = (bigquery.SchemaField("float_col", "FLOAT"),) +# df_data = collections.OrderedDict( +# [ +# ( +# "float_col", +# [ +# 0.14285714285714285, +# 0.51428571485748, +# 0.87128748, +# 1.807960649, +# 2.0679610649, +# 2.4406779661016949, +# 3.7148514257, +# 3.8571428571428572, +# 1.51251252e40, +# ], +# ), +# ] +# ) +# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + +# job_config = bigquery.LoadJobConfig( +# schema=table_schema, source_format=SourceFormat.CSV +# ) +# load_job = bigquery_client.load_table_from_dataframe( +# dataframe, table_id, job_config=job_config +# ) +# load_job.result() + +# table = bigquery_client.get_table(table_id) +# rows = bigquery_client.list_rows(table_id) +# floats = [r.values()[0] for r in rows] +# assert tuple(table.schema) == table_schema +# assert table.num_rows == 9 +# assert floats == df_data["float_col"] + + +# def test_query_results_to_dataframe(bigquery_client): +# QUERY = """ +# SELECT id, author, time_ts, dead +# FROM `bigquery-public-data.hacker_news.comments` +# LIMIT 10 +# """ + +# df = bigquery_client.query(QUERY).result().to_dataframe() + +# assert isinstance(df, pandas.DataFrame) +# assert len(df) == 10 # verify the number of rows +# column_names = ["id", "author", "time_ts", "dead"] +# assert list(df) == column_names # verify the column names +# exp_datatypes = { +# "id": int, +# "author": str, +# "time_ts": pandas.Timestamp, +# "dead": bool, +# } +# for _, row in df.iterrows(): +# for col in column_names: +# # all the schema fields are nullable, so None is acceptable +# if not pandas.isna(row[col]): +# assert isinstance(row[col], exp_datatypes[col]) + + +# def test_query_results_to_dataframe_w_bqstorage(bigquery_client): +# query = """ +# SELECT id, author, time_ts, dead +# FROM `bigquery-public-data.hacker_news.comments` +# LIMIT 10 +# """ + +# bqstorage_client = bigquery_storage.BigQueryReadClient( +# credentials=bigquery_client._credentials +# ) + +# df = bigquery_client.query(query).result().to_dataframe(bqstorage_client) + +# assert isinstance(df, pandas.DataFrame) +# assert len(df) == 10 # verify the number of rows +# column_names = ["id", "author", "time_ts", "dead"] +# assert list(df) == column_names +# exp_datatypes = { +# "id": int, +# "author": str, +# "time_ts": pandas.Timestamp, +# "dead": bool, +# } +# for index, row in df.iterrows(): +# for col in column_names: +# # all the schema fields are nullable, so None is acceptable +# if not pandas.isna(row[col]): +# assert isinstance(row[col], exp_datatypes[col]) + + +# def test_insert_rows_from_dataframe(bigquery_client, dataset_id): +# SF = 
bigquery.SchemaField +# schema = [ +# SF("float_col", "FLOAT", mode="REQUIRED"), +# SF("int_col", "INTEGER", mode="REQUIRED"), +# SF("bool_col", "BOOLEAN", mode="REQUIRED"), +# SF("string_col", "STRING", mode="NULLABLE"), +# SF("date_col", "DATE", mode="NULLABLE"), +# SF("time_col", "TIME", mode="NULLABLE"), +# ] + +# dataframe = pandas.DataFrame( +# [ +# { +# "float_col": 1.11, +# "bool_col": True, +# "string_col": "my string", +# "int_col": 10, +# "date_col": datetime.date(2021, 1, 1), +# "time_col": datetime.time(21, 1, 1), +# }, +# { +# "float_col": 2.22, +# "bool_col": False, +# "string_col": "another string", +# "int_col": 20, +# "date_col": datetime.date(2021, 1, 2), +# "time_col": datetime.time(21, 1, 2), +# }, +# { +# "float_col": 3.33, +# "bool_col": False, +# "string_col": "another string", +# "int_col": 30, +# "date_col": datetime.date(2021, 1, 3), +# "time_col": datetime.time(21, 1, 3), +# }, +# { +# "float_col": 4.44, +# "bool_col": True, +# "string_col": "another string", +# "int_col": 40, +# "date_col": datetime.date(2021, 1, 4), +# "time_col": datetime.time(21, 1, 4), +# }, +# { +# "float_col": 5.55, +# "bool_col": False, +# "string_col": "another string", +# "int_col": 50, +# "date_col": datetime.date(2021, 1, 5), +# "time_col": datetime.time(21, 1, 5), +# }, +# { +# "float_col": 6.66, +# "bool_col": True, +# # Include a NaN value, because pandas often uses NaN as a +# # NULL value indicator. +# "string_col": float("NaN"), +# "int_col": 60, +# "date_col": datetime.date(2021, 1, 6), +# "time_col": datetime.time(21, 1, 6), +# }, +# ] +# ) +# dataframe["date_col"] = dataframe["date_col"].astype("dbdate") +# dataframe["time_col"] = dataframe["time_col"].astype("dbtime") + +# table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe" +# table_arg = bigquery.Table(table_id, schema=schema) +# table = helpers.retry_403(bigquery_client.create_table)(table_arg) + +# chunk_errors = bigquery_client.insert_rows_from_dataframe( +# table, dataframe, chunk_size=3 +# ) +# for errors in chunk_errors: +# assert not errors +# expected = [ +# # Pandas often represents NULL values as NaN. Convert to None for +# # easier comparison. +# tuple(None if col != col else col for col in data_row) +# for data_row in dataframe.itertuples(index=False) +# ] + +# # Use query to fetch rows instead of listing directly from the table so +# # that we get values from the streaming buffer "within a few seconds". +# # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability +# @google.api_core.retry.Retry( +# predicate=google.api_core.retry.if_exception_type(MissingDataError) +# ) +# def get_rows(): +# rows = list( +# bigquery_client.query( +# "SELECT * FROM `{}.{}.{}`".format( +# table.project, table.dataset_id, table.table_id +# ) +# ) +# ) +# if len(rows) != len(expected): +# raise MissingDataError() +# return rows + +# rows = get_rows() +# sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) +# row_tuples = [r.values() for r in sorted_rows] + +# for row, expected_row in zip(row_tuples, expected): +# assert ( +# # Use Counter to verify the same number of values in each, because +# # column order does not matter. 
+# collections.Counter(row) +# == collections.Counter(expected_row) +# ) + + +# def test_nested_table_to_dataframe(bigquery_client, dataset_id): +# from google.cloud.bigquery.job import SourceFormat +# from google.cloud.bigquery.job import WriteDisposition + +# SF = bigquery.SchemaField +# schema = [ +# SF("string_col", "STRING", mode="NULLABLE"), +# SF( +# "record_col", +# "RECORD", +# mode="NULLABLE", +# fields=[ +# SF("nested_string", "STRING", mode="NULLABLE"), +# SF("nested_repeated", "INTEGER", mode="REPEATED"), +# SF( +# "nested_record", +# "RECORD", +# mode="NULLABLE", +# fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], +# ), +# ], +# ), +# SF("bigfloat_col", "FLOAT", mode="NULLABLE"), +# SF("smallfloat_col", "FLOAT", mode="NULLABLE"), +# ] +# record = { +# "nested_string": "another string value", +# "nested_repeated": [0, 1, 2], +# "nested_record": {"nested_nested_string": "some deep insight"}, +# } +# to_insert = [ +# { +# "string_col": "Some value", +# "record_col": record, +# "bigfloat_col": 3.14, +# "smallfloat_col": 2.72, +# } +# ] +# rows = [json.dumps(row) for row in to_insert] +# body = io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) +# table_id = f"{bigquery_client.project}.{dataset_id}.test_nested_table_to_dataframe" +# job_config = bigquery.LoadJobConfig() +# job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE +# job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON +# job_config.schema = schema +# # Load a table using a local JSON file from memory. +# bigquery_client.load_table_from_file(body, table_id, job_config=job_config).result() + +# df = bigquery_client.list_rows(table_id, selected_fields=schema).to_dataframe( +# dtypes={"smallfloat_col": "float16"} +# ) + +# assert isinstance(df, pandas.DataFrame) +# assert len(df) == 1 # verify the number of rows +# exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] +# assert list(df) == exp_columns # verify the column names +# row = df.iloc[0] +# # verify the row content +# assert row["string_col"] == "Some value" +# expected_keys = tuple(sorted(record.keys())) +# row_keys = tuple(sorted(row["record_col"].keys())) +# assert row_keys == expected_keys +# # Can't compare numpy arrays, which pyarrow encodes the embedded +# # repeated column to, so convert to list. +# assert list(row["record_col"]["nested_repeated"]) == [0, 1, 2] +# # verify that nested data can be accessed with indices/keys +# assert row["record_col"]["nested_repeated"][0] == 0 +# assert ( +# row["record_col"]["nested_record"]["nested_nested_string"] +# == "some deep insight" +# ) +# # verify dtypes +# assert df.dtypes["bigfloat_col"].name == "float64" +# assert df.dtypes["smallfloat_col"].name == "float16" + + +# def test_list_rows_max_results_w_bqstorage(bigquery_client): +# table_ref = bigquery.DatasetReference("bigquery-public-data", "utility_us").table( +# "country_code_iso" +# ) +# bqstorage_client = bigquery_storage.BigQueryReadClient( +# credentials=bigquery_client._credentials +# ) + +# row_iterator = bigquery_client.list_rows( +# table_ref, +# selected_fields=[bigquery.SchemaField("country_name", "STRING")], +# max_results=100, +# ) +# with pytest.warns( +# UserWarning, match="Cannot use bqstorage_client if max_results is set" +# ): +# dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) + +# assert len(dataframe.index) == 100 + + +# @pytest.mark.parametrize( +# ("max_results",), +# ( +# (None,), +# (10,), +# ), # Use BQ Storage API. # Use REST API. 
+# ) +# def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): +# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the +# # BigQuery Storage API and pyarrow. +# schema = [ +# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), +# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), +# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), +# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), +# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), +# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), +# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), +# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), +# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), +# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), +# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), +# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), +# ] + +# df = bigquery_client.list_rows( +# scalars_table, +# max_results=max_results, +# selected_fields=schema, +# ).to_dataframe() + +# assert df.dtypes["bool_col"].name == "boolean" +# assert df.dtypes["datetime_col"].name == "datetime64[ns]" +# assert df.dtypes["float64_col"].name == "float64" +# assert df.dtypes["int64_col"].name == "Int64" +# assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" +# assert df.dtypes["date_col"].name == "dbdate" +# assert df.dtypes["time_col"].name == "dbtime" + +# # decimal.Decimal is used to avoid loss of precision. +# assert df.dtypes["bignumeric_col"].name == "object" +# assert df.dtypes["numeric_col"].name == "object" + +# # pandas uses Python string and bytes objects. +# assert df.dtypes["bytes_col"].name == "object" +# assert df.dtypes["string_col"].name == "object" + + +# @pytest.mark.parametrize( +# ("max_results",), +# ( +# (None,), +# (10,), +# ), # Use BQ Storage API. # Use REST API. +# ) +# def test_list_rows_nullable_scalars_extreme_dtypes( +# bigquery_client, scalars_extreme_table, max_results +# ): +# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the +# # BigQuery Storage API and pyarrow. +# schema = [ +# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), +# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), +# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), +# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), +# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), +# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), +# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), +# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), +# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), +# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), +# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), +# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), +# ] + +# df = bigquery_client.list_rows( +# scalars_extreme_table, +# max_results=max_results, +# selected_fields=schema, +# ).to_dataframe() + +# # Extreme values are out-of-bounds for pandas datetime64 values, which use +# # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must +# # be represented with object. 
+# # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations +# assert df.dtypes["date_col"].name == "object" +# assert df.dtypes["datetime_col"].name == "object" +# assert df.dtypes["timestamp_col"].name == "object" + +# # These pandas dtypes can handle the same ranges as BigQuery. +# assert df.dtypes["bool_col"].name == "boolean" +# assert df.dtypes["float64_col"].name == "float64" +# assert df.dtypes["int64_col"].name == "Int64" +# assert df.dtypes["time_col"].name == "dbtime" + +# # decimal.Decimal is used to avoid loss of precision. +# assert df.dtypes["numeric_col"].name == "object" +# assert df.dtypes["bignumeric_col"].name == "object" + +# # pandas uses Python string and bytes objects. +# assert df.dtypes["bytes_col"].name == "object" +# assert df.dtypes["string_col"].name == "object" + + +# def test_upload_time_and_datetime_56(bigquery_client, dataset_id): +# df = pandas.DataFrame( +# dict( +# dt=[ +# datetime.datetime(2020, 1, 8, 8, 0, 0), +# datetime.datetime( +# 2020, +# 1, +# 8, +# 8, +# 0, +# 0, +# tzinfo=datetime.timezone(datetime.timedelta(hours=-7)), +# ), +# ], +# t=[datetime.time(0, 0, 10, 100001), None], +# ) +# ) +# table = f"{dataset_id}.test_upload_time_and_datetime" +# bigquery_client.load_table_from_dataframe(df, table).result() +# data = list(map(list, bigquery_client.list_rows(table))) +# assert data == [ +# [ +# datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc), +# datetime.time(0, 0, 10, 100001), +# ], +# [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None], +# ] + +# from google.cloud.bigquery import job, schema + +# table = f"{dataset_id}.test_upload_time_and_datetime_dt" +# config = job.LoadJobConfig( +# schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")] +# ) + +# bigquery_client.load_table_from_dataframe(df, table, job_config=config).result() +# data = list(map(list, bigquery_client.list_rows(table))) +# assert data == [ +# [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)], +# [datetime.datetime(2020, 1, 8, 15, 0), None], +# ] + + +# def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): +# wkt = pytest.importorskip("shapely.wkt") +# bigquery_client.query( +# f"create table {dataset_id}.lake (name string, geog geography)" +# ).result() +# bigquery_client.query( +# f""" +# insert into {dataset_id}.lake (name, geog) values +# ('foo', st_geogfromtext('point(0 0)')), +# ('bar', st_geogfromtext('point(0 1)')), +# ('baz', null) +# """ +# ).result() +# df = bigquery_client.query( +# f"select * from {dataset_id}.lake order by name" +# ).to_dataframe(geography_as_object=True) +# assert list(df["name"]) == ["bar", "baz", "foo"] +# assert df["geog"][0] == wkt.loads("point(0 1)") +# assert pandas.isna(df["geog"][1]) +# assert df["geog"][2] == wkt.loads("point(0 0)") + + +# def test_to_geodataframe(bigquery_client, dataset_id): +# geopandas = pytest.importorskip("geopandas") +# from shapely import wkt + +# bigquery_client.query( +# f"create table {dataset_id}.geolake (name string, geog geography)" +# ).result() +# bigquery_client.query( +# f""" +# insert into {dataset_id}.geolake (name, geog) values +# ('foo', st_geogfromtext('point(0 0)')), +# ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), +# ('baz', null) +# """ +# ).result() +# df = bigquery_client.query( +# f"select * from {dataset_id}.geolake order by name" +# ).to_geodataframe() +# assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") +# assert 
pandas.isna(df["geog"][1]) +# assert df["geog"][2] == wkt.loads("point(0 0)") +# assert isinstance(df, geopandas.GeoDataFrame) +# assert isinstance(df["geog"], geopandas.GeoSeries) + +# with warnings.catch_warnings(): +# # Computing the area on a GeoDataFrame that uses a geographic Coordinate +# # Reference System (CRS) produces a warning that we are not interested in. +# # We do not mind if the computed area is incorrect with respect to the +# # GeoDataFrame data, as long as it matches the expected "incorrect" value. +# warnings.filterwarnings("ignore", category=UserWarning) +# assert df.area[0] == 0.5 +# assert pandas.isna(df.area[1]) +# assert df.area[2] == 0.0 + +# assert df.crs.srs == "EPSG:4326" +# assert df.crs.name == "WGS 84" +# assert df.geog.crs.srs == "EPSG:4326" +# assert df.geog.crs.name == "WGS 84" + + +# def test_load_geodataframe(bigquery_client, dataset_id): +# geopandas = pytest.importorskip("geopandas") +# import pandas +# from shapely import wkt +# from google.cloud.bigquery.schema import SchemaField + +# df = geopandas.GeoDataFrame( +# pandas.DataFrame( +# dict( +# name=["foo", "bar"], +# geo1=[None, None], +# geo2=[None, wkt.loads("Point(1 1)")], +# ) +# ), +# geometry="geo1", +# ) + +# table_id = f"{dataset_id}.lake_from_gp" +# bigquery_client.load_table_from_dataframe(df, table_id).result() + +# table = bigquery_client.get_table(table_id) +# assert table.schema == [ +# SchemaField("name", "STRING", "NULLABLE"), +# SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), +# SchemaField("geo2", "GEOGRAPHY", "NULLABLE"), +# ] +# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ +# ["bar", None, "POINT(1 1)"], +# ["foo", None, None], +# ] + + +# def test_load_dataframe_w_shapely(bigquery_client, dataset_id): +# wkt = pytest.importorskip("shapely.wkt") +# from google.cloud.bigquery.schema import SchemaField + +# df = pandas.DataFrame( +# dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) +# ) + +# table_id = f"{dataset_id}.lake_from_shapes" +# bigquery_client.load_table_from_dataframe(df, table_id).result() + +# table = bigquery_client.get_table(table_id) +# assert table.schema == [ +# SchemaField("name", "STRING", "NULLABLE"), +# SchemaField("geo", "GEOGRAPHY", "NULLABLE"), +# ] +# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ +# ["bar", "POINT(1 1)"], +# ["foo", None], +# ] + +# bigquery_client.load_table_from_dataframe(df, table_id).result() +# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ +# ["bar", "POINT(1 1)"], +# ["bar", "POINT(1 1)"], +# ["foo", None], +# ["foo", None], +# ] + + +# def test_load_dataframe_w_wkb(bigquery_client, dataset_id): +# wkt = pytest.importorskip("shapely.wkt") +# from shapely import wkb +# from google.cloud.bigquery.schema import SchemaField + +# df = pandas.DataFrame( +# dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) +# ) + +# table_id = f"{dataset_id}.lake_from_wkb" +# # We create the table first, to inform the interpretation of the wkb data +# bigquery_client.query( +# f"create table {table_id} (name string, geo GEOGRAPHY)" +# ).result() +# bigquery_client.load_table_from_dataframe(df, table_id).result() + +# table = bigquery_client.get_table(table_id) +# assert table.schema == [ +# SchemaField("name", "STRING", "NULLABLE"), +# SchemaField("geo", "GEOGRAPHY", "NULLABLE"), +# ] +# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ +# ["bar", "POINT(1 1)"], +# ["foo", None], +# ] diff --git a/tests/system/test_query.py 
b/tests/system/test_query.py index 723f927d7..5dc7c7875 100644 --- a/tests/system/test_query.py +++ b/tests/system/test_query.py @@ -1,503 +1,503 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import concurrent.futures -import datetime -import decimal -from typing import Tuple - -from google.api_core import exceptions -import pytest - -from google.cloud import bigquery -from google.cloud.bigquery.query import ArrayQueryParameter -from google.cloud.bigquery.query import ScalarQueryParameter -from google.cloud.bigquery.query import ScalarQueryParameterType -from google.cloud.bigquery.query import StructQueryParameter -from google.cloud.bigquery.query import StructQueryParameterType - - -@pytest.fixture(params=["INSERT", "QUERY"]) -def query_api_method(request): - return request.param - - -@pytest.fixture(scope="session") -def table_with_9999_columns_10_rows(bigquery_client, project_id, dataset_id): - """Generate a table of maximum width via CREATE TABLE AS SELECT. - - The first column is named 'rowval', and has a value from 1..rowcount - Subsequent columns are named col_ and contain the value N*rowval, where - N is between 1 and 9999 inclusive. - """ - table_id = "many_columns" - row_count = 10 - col_projections = ",".join(f"r * {n} as col_{n}" for n in range(1, 10000)) - sql = f""" - CREATE TABLE `{project_id}.{dataset_id}.{table_id}` - AS - SELECT - r as rowval, - {col_projections} - FROM - UNNEST(GENERATE_ARRAY(1,{row_count},1)) as r - """ - query_job = bigquery_client.query(sql) - query_job.result() - - return f"{project_id}.{dataset_id}.{table_id}" - - -def test_query_many_columns( - bigquery_client, table_with_9999_columns_10_rows, query_api_method -): - # Test working with the widest schema BigQuery supports, 10k columns. - query_job = bigquery_client.query( - f"SELECT * FROM `{table_with_9999_columns_10_rows}`", - api_method=query_api_method, - ) - rows = list(query_job) - assert len(rows) == 10 - - # check field representations adhere to expected values. - for row in rows: - rowval = row["rowval"] - for column in range(1, 10000): - assert row[f"col_{column}"] == rowval * column - - -def test_query_w_timeout(bigquery_client, query_api_method): - job_config = bigquery.QueryJobConfig() - job_config.use_query_cache = False - - query_job = bigquery_client.query( - "SELECT * FROM `bigquery-public-data.github_repos.commits`;", - location="US", - job_config=job_config, - api_method=query_api_method, - ) - - with pytest.raises(concurrent.futures.TimeoutError): - query_job.result(timeout=1) - - # Even though the query takes >1 second, the call to getQueryResults - # should succeed. - assert not query_job.done(timeout=1) - assert bigquery_client.cancel_job(query_job) is not None - - -def test_query_statistics(bigquery_client, query_api_method): - """ - A system test to exercise some of the extended query statistics. - - Note: We construct a query that should need at least three stages by - specifying a JOIN query. 
Exact plan and stats are effectively - non-deterministic, so we're largely interested in confirming values - are present. - """ - - job_config = bigquery.QueryJobConfig() - job_config.use_query_cache = False - - query_job = bigquery_client.query( - """ - SELECT - COUNT(1) - FROM - ( - SELECT - year, - wban_number - FROM `bigquery-public-data.samples.gsod` - LIMIT 1000 - ) lside - INNER JOIN - ( - SELECT - year, - state - FROM `bigquery-public-data.samples.natality` - LIMIT 1000 - ) rside - ON - lside.year = rside.year - """, - location="US", - job_config=job_config, - api_method=query_api_method, - ) - - # run the job to completion - query_job.result() - - # Must reload job to get stats if jobs.query was used. - if query_api_method == "QUERY": - query_job.reload() - - # Assert top-level stats - assert not query_job.cache_hit - assert query_job.destination is not None - assert query_job.done - assert not query_job.dry_run - assert query_job.num_dml_affected_rows is None - assert query_job.priority == "INTERACTIVE" - assert query_job.total_bytes_billed > 1 - assert query_job.total_bytes_processed > 1 - assert query_job.statement_type == "SELECT" - assert query_job.slot_millis > 1 - - # Make assertions on the shape of the query plan. - plan = query_job.query_plan - assert len(plan) >= 3 - first_stage = plan[0] - assert first_stage.start is not None - assert first_stage.end is not None - assert first_stage.entry_id is not None - assert first_stage.name is not None - assert first_stage.parallel_inputs > 0 - assert first_stage.completed_parallel_inputs > 0 - assert first_stage.shuffle_output_bytes > 0 - assert first_stage.status == "COMPLETE" - - # Query plan is a digraph. Ensure it has inter-stage links, - # but not every stage has inputs. - stages_with_inputs = 0 - for entry in plan: - if len(entry.input_stages) > 0: - stages_with_inputs = stages_with_inputs + 1 - assert stages_with_inputs > 0 - assert len(plan) > stages_with_inputs - - -@pytest.mark.parametrize( - ("sql", "expected", "query_parameters"), - ( - ( - "SELECT @question", - "What is the answer to life, the universe, and everything?", - [ - ScalarQueryParameter( - name="question", - type_="STRING", - value="What is the answer to life, the universe, and everything?", - ) - ], - ), - ( - "SELECT @answer", - 42, - [ScalarQueryParameter(name="answer", type_="INT64", value=42)], - ), - ( - "SELECT @pi", - 3.1415926, - [ScalarQueryParameter(name="pi", type_="FLOAT64", value=3.1415926)], - ), - ( - "SELECT @pi_numeric_param", - decimal.Decimal("3.141592654"), - [ - ScalarQueryParameter( - name="pi_numeric_param", - type_="NUMERIC", - value=decimal.Decimal("3.141592654"), - ) - ], - ), - ( - "SELECT @bignum_param", - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - [ - ScalarQueryParameter( - name="bignum_param", - type_="BIGNUMERIC", - value=decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - ) - ], - ), - ( - "SELECT @truthy", - True, - [ScalarQueryParameter(name="truthy", type_="BOOL", value=True)], - ), - ( - "SELECT @beef", - b"DEADBEEF", - [ScalarQueryParameter(name="beef", type_="BYTES", value=b"DEADBEEF")], - ), - ( - "SELECT @naive", - datetime.datetime(2016, 12, 5, 12, 41, 9), - [ - ScalarQueryParameter( - name="naive", - type_="DATETIME", - value=datetime.datetime(2016, 12, 5, 12, 41, 9), - ) - ], - ), - ( - "SELECT @naive_date", - datetime.date(2016, 12, 5), - [ - ScalarQueryParameter( - name="naive_date", type_="DATE", value=datetime.date(2016, 12, 5) - ) - ], - ), - ( - "SELECT @naive_time", - datetime.time(12, 41, 
9, 62500), - [ - ScalarQueryParameter( - name="naive_time", - type_="TIME", - value=datetime.time(12, 41, 9, 62500), - ) - ], - ), - ( - "SELECT @zoned", - datetime.datetime(2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc), - [ - ScalarQueryParameter( - name="zoned", - type_="TIMESTAMP", - value=datetime.datetime( - 2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc - ), - ) - ], - ), - ( - "SELECT @array_param", - [1, 2], - [ - ArrayQueryParameter( - name="array_param", array_type="INT64", values=[1, 2] - ) - ], - ), - ( - "SELECT (@hitchhiker.question, @hitchhiker.answer)", - ({"_field_1": "What is the answer?", "_field_2": 42}), - [ - StructQueryParameter( - "hitchhiker", - ScalarQueryParameter( - name="question", - type_="STRING", - value="What is the answer?", - ), - ScalarQueryParameter( - name="answer", - type_="INT64", - value=42, - ), - ), - ], - ), - ( - "SELECT " - "((@rectangle.bottom_right.x - @rectangle.top_left.x) " - "* (@rectangle.top_left.y - @rectangle.bottom_right.y))", - 100, - [ - StructQueryParameter( - "rectangle", - StructQueryParameter( - "top_left", - ScalarQueryParameter("x", "INT64", 12), - ScalarQueryParameter("y", "INT64", 102), - ), - StructQueryParameter( - "bottom_right", - ScalarQueryParameter("x", "INT64", 22), - ScalarQueryParameter("y", "INT64", 92), - ), - ) - ], - ), - ( - "SELECT ?", - [ - {"name": "Phred Phlyntstone", "age": 32}, - {"name": "Bharney Rhubbyl", "age": 31}, - ], - [ - ArrayQueryParameter( - name=None, - array_type="RECORD", - values=[ - StructQueryParameter( - None, - ScalarQueryParameter( - name="name", type_="STRING", value="Phred Phlyntstone" - ), - ScalarQueryParameter(name="age", type_="INT64", value=32), - ), - StructQueryParameter( - None, - ScalarQueryParameter( - name="name", type_="STRING", value="Bharney Rhubbyl" - ), - ScalarQueryParameter(name="age", type_="INT64", value=31), - ), - ], - ) - ], - ), - ( - "SELECT @empty_array_param", - [], - [ - ArrayQueryParameter( - name="empty_array_param", - values=[], - array_type=StructQueryParameterType( - ScalarQueryParameterType(name="foo", type_="INT64"), - ScalarQueryParameterType(name="bar", type_="STRING"), - ), - ) - ], - ), - ( - "SELECT @roles", - { - "hero": {"name": "Phred Phlyntstone", "age": 32}, - "sidekick": {"name": "Bharney Rhubbyl", "age": 31}, - }, - [ - StructQueryParameter( - "roles", - StructQueryParameter( - "hero", - ScalarQueryParameter( - name="name", type_="STRING", value="Phred Phlyntstone" - ), - ScalarQueryParameter(name="age", type_="INT64", value=32), - ), - StructQueryParameter( - "sidekick", - ScalarQueryParameter( - name="name", type_="STRING", value="Bharney Rhubbyl" - ), - ScalarQueryParameter(name="age", type_="INT64", value=31), - ), - ), - ], - ), - ( - "SELECT ?", - {"friends": ["Jack", "Jill"]}, - [ - StructQueryParameter( - None, - ArrayQueryParameter( - name="friends", array_type="STRING", values=["Jack", "Jill"] - ), - ) - ], - ), - ), -) -def test_query_parameters( - bigquery_client, query_api_method, sql, expected, query_parameters -): - jconfig = bigquery.QueryJobConfig() - jconfig.query_parameters = query_parameters - query_job = bigquery_client.query( - sql, - job_config=jconfig, - api_method=query_api_method, - ) - rows = list(query_job.result()) - assert len(rows) == 1 - assert len(rows[0]) == 1 - assert rows[0][0] == expected - - -def test_dry_run( - bigquery_client: bigquery.Client, - query_api_method: str, - scalars_table_multi_location: Tuple[str, str], -): - location, full_table_id = scalars_table_multi_location - 
query_config = bigquery.QueryJobConfig() - query_config.dry_run = True - - query_string = f"SELECT * FROM {full_table_id}" - query_job = bigquery_client.query( - query_string, - location=location, - job_config=query_config, - api_method=query_api_method, - ) - - # Note: `query_job.result()` is not necessary on a dry run query. All - # necessary information is returned in the initial response. - assert query_job.dry_run is True - assert query_job.total_bytes_processed > 0 - assert len(query_job.schema) > 0 - - -def test_query_error_w_api_method_query(bigquery_client: bigquery.Client): - """No job is returned from jobs.query if the query fails.""" - - with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): - bigquery_client.query( - "SELECT * FROM not_a_real_dataset.doesnt_exist", api_method="QUERY" - ) - - -def test_query_error_w_api_method_default(bigquery_client: bigquery.Client): - """Test that an exception is not thrown until fetching the results. - - For backwards compatibility, jobs.insert is the default API method. With - jobs.insert, a failed query job is "sucessfully" created. An exception is - thrown when fetching the results. - """ - - query_job = bigquery_client.query("SELECT * FROM not_a_real_dataset.doesnt_exist") - - with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): - query_job.result() - - -def test_session(bigquery_client: bigquery.Client, query_api_method: str): - initial_config = bigquery.QueryJobConfig() - initial_config.create_session = True - initial_query = """ - CREATE TEMPORARY TABLE numbers(id INT64) - AS - SELECT * FROM UNNEST([1, 2, 3, 4, 5]) AS id; - """ - initial_job = bigquery_client.query( - initial_query, job_config=initial_config, api_method=query_api_method - ) - initial_job.result() - session_id = initial_job.session_info.session_id - assert session_id is not None - - second_config = bigquery.QueryJobConfig() - second_config.connection_properties = [ - bigquery.ConnectionProperty("session_id", session_id), - ] - second_job = bigquery_client.query( - "SELECT COUNT(*) FROM numbers;", job_config=second_config - ) - rows = list(second_job.result()) - - assert len(rows) == 1 - assert rows[0][0] == 5 +# # Copyright 2021 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+ +# import concurrent.futures +# import datetime +# import decimal +# from typing import Tuple + +# from google.api_core import exceptions +# import pytest + +# from google.cloud import bigquery +# from google.cloud.bigquery.query import ArrayQueryParameter +# from google.cloud.bigquery.query import ScalarQueryParameter +# from google.cloud.bigquery.query import ScalarQueryParameterType +# from google.cloud.bigquery.query import StructQueryParameter +# from google.cloud.bigquery.query import StructQueryParameterType + + +# @pytest.fixture(params=["INSERT", "QUERY"]) +# def query_api_method(request): +# return request.param + + +# @pytest.fixture(scope="session") +# def table_with_9999_columns_10_rows(bigquery_client, project_id, dataset_id): +# """Generate a table of maximum width via CREATE TABLE AS SELECT. + +# The first column is named 'rowval', and has a value from 1..rowcount +# Subsequent columns are named col_ and contain the value N*rowval, where +# N is between 1 and 9999 inclusive. +# """ +# table_id = "many_columns" +# row_count = 10 +# col_projections = ",".join(f"r * {n} as col_{n}" for n in range(1, 10000)) +# sql = f""" +# CREATE TABLE `{project_id}.{dataset_id}.{table_id}` +# AS +# SELECT +# r as rowval, +# {col_projections} +# FROM +# UNNEST(GENERATE_ARRAY(1,{row_count},1)) as r +# """ +# query_job = bigquery_client.query(sql) +# query_job.result() + +# return f"{project_id}.{dataset_id}.{table_id}" + + +# def test_query_many_columns( +# bigquery_client, table_with_9999_columns_10_rows, query_api_method +# ): +# # Test working with the widest schema BigQuery supports, 10k columns. +# query_job = bigquery_client.query( +# f"SELECT * FROM `{table_with_9999_columns_10_rows}`", +# api_method=query_api_method, +# ) +# rows = list(query_job) +# assert len(rows) == 10 + +# # check field representations adhere to expected values. +# for row in rows: +# rowval = row["rowval"] +# for column in range(1, 10000): +# assert row[f"col_{column}"] == rowval * column + + +# def test_query_w_timeout(bigquery_client, query_api_method): +# job_config = bigquery.QueryJobConfig() +# job_config.use_query_cache = False + +# query_job = bigquery_client.query( +# "SELECT * FROM `bigquery-public-data.github_repos.commits`;", +# location="US", +# job_config=job_config, +# api_method=query_api_method, +# ) + +# with pytest.raises(concurrent.futures.TimeoutError): +# query_job.result(timeout=1) + +# # Even though the query takes >1 second, the call to getQueryResults +# # should succeed. +# assert not query_job.done(timeout=1) +# assert bigquery_client.cancel_job(query_job) is not None + + +# def test_query_statistics(bigquery_client, query_api_method): +# """ +# A system test to exercise some of the extended query statistics. + +# Note: We construct a query that should need at least three stages by +# specifying a JOIN query. Exact plan and stats are effectively +# non-deterministic, so we're largely interested in confirming values +# are present. 
+# """ + +# job_config = bigquery.QueryJobConfig() +# job_config.use_query_cache = False + +# query_job = bigquery_client.query( +# """ +# SELECT +# COUNT(1) +# FROM +# ( +# SELECT +# year, +# wban_number +# FROM `bigquery-public-data.samples.gsod` +# LIMIT 1000 +# ) lside +# INNER JOIN +# ( +# SELECT +# year, +# state +# FROM `bigquery-public-data.samples.natality` +# LIMIT 1000 +# ) rside +# ON +# lside.year = rside.year +# """, +# location="US", +# job_config=job_config, +# api_method=query_api_method, +# ) + +# # run the job to completion +# query_job.result() + +# # Must reload job to get stats if jobs.query was used. +# if query_api_method == "QUERY": +# query_job.reload() + +# # Assert top-level stats +# assert not query_job.cache_hit +# assert query_job.destination is not None +# assert query_job.done +# assert not query_job.dry_run +# assert query_job.num_dml_affected_rows is None +# assert query_job.priority == "INTERACTIVE" +# assert query_job.total_bytes_billed > 1 +# assert query_job.total_bytes_processed > 1 +# assert query_job.statement_type == "SELECT" +# assert query_job.slot_millis > 1 + +# # Make assertions on the shape of the query plan. +# plan = query_job.query_plan +# assert len(plan) >= 3 +# first_stage = plan[0] +# assert first_stage.start is not None +# assert first_stage.end is not None +# assert first_stage.entry_id is not None +# assert first_stage.name is not None +# assert first_stage.parallel_inputs > 0 +# assert first_stage.completed_parallel_inputs > 0 +# assert first_stage.shuffle_output_bytes > 0 +# assert first_stage.status == "COMPLETE" + +# # Query plan is a digraph. Ensure it has inter-stage links, +# # but not every stage has inputs. +# stages_with_inputs = 0 +# for entry in plan: +# if len(entry.input_stages) > 0: +# stages_with_inputs = stages_with_inputs + 1 +# assert stages_with_inputs > 0 +# assert len(plan) > stages_with_inputs + + +# @pytest.mark.parametrize( +# ("sql", "expected", "query_parameters"), +# ( +# ( +# "SELECT @question", +# "What is the answer to life, the universe, and everything?", +# [ +# ScalarQueryParameter( +# name="question", +# type_="STRING", +# value="What is the answer to life, the universe, and everything?", +# ) +# ], +# ), +# ( +# "SELECT @answer", +# 42, +# [ScalarQueryParameter(name="answer", type_="INT64", value=42)], +# ), +# ( +# "SELECT @pi", +# 3.1415926, +# [ScalarQueryParameter(name="pi", type_="FLOAT64", value=3.1415926)], +# ), +# ( +# "SELECT @pi_numeric_param", +# decimal.Decimal("3.141592654"), +# [ +# ScalarQueryParameter( +# name="pi_numeric_param", +# type_="NUMERIC", +# value=decimal.Decimal("3.141592654"), +# ) +# ], +# ), +# ( +# "SELECT @bignum_param", +# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), +# [ +# ScalarQueryParameter( +# name="bignum_param", +# type_="BIGNUMERIC", +# value=decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), +# ) +# ], +# ), +# ( +# "SELECT @truthy", +# True, +# [ScalarQueryParameter(name="truthy", type_="BOOL", value=True)], +# ), +# ( +# "SELECT @beef", +# b"DEADBEEF", +# [ScalarQueryParameter(name="beef", type_="BYTES", value=b"DEADBEEF")], +# ), +# ( +# "SELECT @naive", +# datetime.datetime(2016, 12, 5, 12, 41, 9), +# [ +# ScalarQueryParameter( +# name="naive", +# type_="DATETIME", +# value=datetime.datetime(2016, 12, 5, 12, 41, 9), +# ) +# ], +# ), +# ( +# "SELECT @naive_date", +# datetime.date(2016, 12, 5), +# [ +# ScalarQueryParameter( +# name="naive_date", type_="DATE", value=datetime.date(2016, 12, 5) +# ) +# ], +# ), +# ( +# "SELECT @naive_time", 
+# datetime.time(12, 41, 9, 62500), +# [ +# ScalarQueryParameter( +# name="naive_time", +# type_="TIME", +# value=datetime.time(12, 41, 9, 62500), +# ) +# ], +# ), +# ( +# "SELECT @zoned", +# datetime.datetime(2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc), +# [ +# ScalarQueryParameter( +# name="zoned", +# type_="TIMESTAMP", +# value=datetime.datetime( +# 2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc +# ), +# ) +# ], +# ), +# ( +# "SELECT @array_param", +# [1, 2], +# [ +# ArrayQueryParameter( +# name="array_param", array_type="INT64", values=[1, 2] +# ) +# ], +# ), +# ( +# "SELECT (@hitchhiker.question, @hitchhiker.answer)", +# ({"_field_1": "What is the answer?", "_field_2": 42}), +# [ +# StructQueryParameter( +# "hitchhiker", +# ScalarQueryParameter( +# name="question", +# type_="STRING", +# value="What is the answer?", +# ), +# ScalarQueryParameter( +# name="answer", +# type_="INT64", +# value=42, +# ), +# ), +# ], +# ), +# ( +# "SELECT " +# "((@rectangle.bottom_right.x - @rectangle.top_left.x) " +# "* (@rectangle.top_left.y - @rectangle.bottom_right.y))", +# 100, +# [ +# StructQueryParameter( +# "rectangle", +# StructQueryParameter( +# "top_left", +# ScalarQueryParameter("x", "INT64", 12), +# ScalarQueryParameter("y", "INT64", 102), +# ), +# StructQueryParameter( +# "bottom_right", +# ScalarQueryParameter("x", "INT64", 22), +# ScalarQueryParameter("y", "INT64", 92), +# ), +# ) +# ], +# ), +# ( +# "SELECT ?", +# [ +# {"name": "Phred Phlyntstone", "age": 32}, +# {"name": "Bharney Rhubbyl", "age": 31}, +# ], +# [ +# ArrayQueryParameter( +# name=None, +# array_type="RECORD", +# values=[ +# StructQueryParameter( +# None, +# ScalarQueryParameter( +# name="name", type_="STRING", value="Phred Phlyntstone" +# ), +# ScalarQueryParameter(name="age", type_="INT64", value=32), +# ), +# StructQueryParameter( +# None, +# ScalarQueryParameter( +# name="name", type_="STRING", value="Bharney Rhubbyl" +# ), +# ScalarQueryParameter(name="age", type_="INT64", value=31), +# ), +# ], +# ) +# ], +# ), +# ( +# "SELECT @empty_array_param", +# [], +# [ +# ArrayQueryParameter( +# name="empty_array_param", +# values=[], +# array_type=StructQueryParameterType( +# ScalarQueryParameterType(name="foo", type_="INT64"), +# ScalarQueryParameterType(name="bar", type_="STRING"), +# ), +# ) +# ], +# ), +# ( +# "SELECT @roles", +# { +# "hero": {"name": "Phred Phlyntstone", "age": 32}, +# "sidekick": {"name": "Bharney Rhubbyl", "age": 31}, +# }, +# [ +# StructQueryParameter( +# "roles", +# StructQueryParameter( +# "hero", +# ScalarQueryParameter( +# name="name", type_="STRING", value="Phred Phlyntstone" +# ), +# ScalarQueryParameter(name="age", type_="INT64", value=32), +# ), +# StructQueryParameter( +# "sidekick", +# ScalarQueryParameter( +# name="name", type_="STRING", value="Bharney Rhubbyl" +# ), +# ScalarQueryParameter(name="age", type_="INT64", value=31), +# ), +# ), +# ], +# ), +# ( +# "SELECT ?", +# {"friends": ["Jack", "Jill"]}, +# [ +# StructQueryParameter( +# None, +# ArrayQueryParameter( +# name="friends", array_type="STRING", values=["Jack", "Jill"] +# ), +# ) +# ], +# ), +# ), +# ) +# def test_query_parameters( +# bigquery_client, query_api_method, sql, expected, query_parameters +# ): +# jconfig = bigquery.QueryJobConfig() +# jconfig.query_parameters = query_parameters +# query_job = bigquery_client.query( +# sql, +# job_config=jconfig, +# api_method=query_api_method, +# ) +# rows = list(query_job.result()) +# assert len(rows) == 1 +# assert len(rows[0]) == 1 +# assert rows[0][0] == expected + + 
+# def test_dry_run( +# bigquery_client: bigquery.Client, +# query_api_method: str, +# scalars_table_multi_location: Tuple[str, str], +# ): +# location, full_table_id = scalars_table_multi_location +# query_config = bigquery.QueryJobConfig() +# query_config.dry_run = True + +# query_string = f"SELECT * FROM {full_table_id}" +# query_job = bigquery_client.query( +# query_string, +# location=location, +# job_config=query_config, +# api_method=query_api_method, +# ) + +# # Note: `query_job.result()` is not necessary on a dry run query. All +# # necessary information is returned in the initial response. +# assert query_job.dry_run is True +# assert query_job.total_bytes_processed > 0 +# assert len(query_job.schema) > 0 + + +# def test_query_error_w_api_method_query(bigquery_client: bigquery.Client): +# """No job is returned from jobs.query if the query fails.""" + +# with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): +# bigquery_client.query( +# "SELECT * FROM not_a_real_dataset.doesnt_exist", api_method="QUERY" +# ) + + +# def test_query_error_w_api_method_default(bigquery_client: bigquery.Client): +# """Test that an exception is not thrown until fetching the results. + +# For backwards compatibility, jobs.insert is the default API method. With +# jobs.insert, a failed query job is "sucessfully" created. An exception is +# thrown when fetching the results. +# """ + +# query_job = bigquery_client.query("SELECT * FROM not_a_real_dataset.doesnt_exist") + +# with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): +# query_job.result() + + +# def test_session(bigquery_client: bigquery.Client, query_api_method: str): +# initial_config = bigquery.QueryJobConfig() +# initial_config.create_session = True +# initial_query = """ +# CREATE TEMPORARY TABLE numbers(id INT64) +# AS +# SELECT * FROM UNNEST([1, 2, 3, 4, 5]) AS id; +# """ +# initial_job = bigquery_client.query( +# initial_query, job_config=initial_config, api_method=query_api_method +# ) +# initial_job.result() +# session_id = initial_job.session_info.session_id +# assert session_id is not None + +# second_config = bigquery.QueryJobConfig() +# second_config.connection_properties = [ +# bigquery.ConnectionProperty("session_id", session_id), +# ] +# second_job = bigquery_client.query( +# "SELECT COUNT(*) FROM numbers;", job_config=second_config +# ) +# rows = list(second_job.result()) + +# assert len(rows) == 1 +# assert rows[0][0] == 5 diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 2e714c707..842af6d55 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -19,7 +19,13 @@ import mock +try: + from google.cloud import bigquery_storage # type: ignore +except ImportError: # pragma: NO COVER + bigquery_storage = None + +@unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): def tearDown(self): from google.cloud.bigquery import _helpers @@ -32,6 +38,37 @@ def _object_under_test(self): return _helpers.BQStorageVersions() + def _call_fut(self): + from google.cloud.bigquery import _helpers + + _helpers.BQ_STORAGE_VERSIONS._installed_version = None + return _helpers.BQ_STORAGE_VERSIONS.verify_version() + + def test_raises_no_error_w_recent_bqstorage(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage.__version__", new="2.0.0"): + try: + self._call_fut() + except LegacyBigQueryStorageError: # pragma: NO COVER + 
self.fail("Legacy error raised with a non-legacy dependency version.") + + def test_raises_error_w_legacy_bqstorage(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage.__version__", new="1.9.9"): + with self.assertRaises(LegacyBigQueryStorageError): + self._call_fut() + + def test_raises_error_w_unknown_bqstorage_version(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + with mock.patch("google.cloud.bigquery_storage", autospec=True) as fake_module: + del fake_module.__version__ + error_pattern = r"version found: 0.0.0" + with self.assertRaisesRegex(LegacyBigQueryStorageError, error_pattern): + self._call_fut() + def test_installed_version_returns_cached(self): versions = self._object_under_test() versions._installed_version = object() diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 1a3f918eb..5780fb9b6 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -30,8 +30,13 @@ except ImportError: # pragma: NO COVER pandas = None -import pyarrow -import pyarrow.types +try: + import pyarrow + import pyarrow.types +except ImportError: # pragma: NO COVER + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. + pyarrow = mock.Mock() try: import geopandas @@ -44,6 +49,7 @@ from google.cloud import bigquery_storage from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT PANDAS_MINIUM_VERSION = pkg_resources.parse_version("1.0.0") @@ -54,6 +60,11 @@ # Set to less than MIN version. PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") +skip_if_no_bignumeric = pytest.mark.skipif( + not _BIGNUMERIC_SUPPORT, + reason="BIGNUMERIC support requires pyarrow>=3.0.0", +) + @pytest.fixture def module_under_test(): @@ -75,6 +86,7 @@ def is_datetime(type_): )(type_) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def is_numeric(type_): # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type return all_( @@ -142,7 +154,12 @@ def test_all_(): ("FLOAT", "NULLABLE", pyarrow.types.is_float64), ("FLOAT64", "NULLABLE", pyarrow.types.is_float64), ("NUMERIC", "NULLABLE", is_numeric), - ("BIGNUMERIC", "NULLABLE", is_bignumeric), + pytest.param( + "BIGNUMERIC", + "NULLABLE", + is_bignumeric, + marks=skip_if_no_bignumeric, + ), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), ("TIMESTAMP", "NULLABLE", is_timestamp), @@ -221,10 +238,11 @@ def test_all_(): "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_numeric(type_.value_type)), ), - ( + pytest.param( "BIGNUMERIC", "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + marks=skip_if_no_bignumeric, ), ( "BOOLEAN", @@ -280,6 +298,7 @@ def test_all_(): ("UNKNOWN_TYPE", "REPEATED", is_none), ], ) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_data_type(module_under_test, bq_type, bq_mode, is_correct_type): field = schema.SchemaField("ignored_name", bq_type, mode=bq_mode) actual = module_under_test.bq_to_arrow_data_type(field) @@ -287,6 +306,7 @@ def test_bq_to_arrow_data_type(module_under_test, bq_type, bq_mode, is_correct_t @pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) 
+@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): fields = ( schema.SchemaField("field01", "STRING"), @@ -334,6 +354,7 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): @pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): fields = ( schema.SchemaField("field01", "STRING"), @@ -343,7 +364,7 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), - schema.SchemaField("field08", "BIGNUMERIC"), + # schema.SchemaField("field08", "BIGNUMERIC"), schema.SchemaField("field09", "BOOLEAN"), schema.SchemaField("field10", "BOOL"), schema.SchemaField("field11", "TIMESTAMP"), @@ -352,6 +373,8 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field14", "DATETIME"), schema.SchemaField("field15", "GEOGRAPHY"), ) + if _BIGNUMERIC_SUPPORT: + fields += (schema.SchemaField("field08", "BIGNUMERIC"),) field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) @@ -364,7 +387,7 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + # pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), pyarrow.field("field11", module_under_test.pyarrow_timestamp()), @@ -373,6 +396,8 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field14", module_under_test.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) + if _BIGNUMERIC_SUPPORT: + expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) expected_value_type = pyarrow.struct(expected) assert pyarrow.types.is_list(actual) @@ -381,6 +406,7 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): assert actual.value_type.equals(expected_value_type) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): fields = ( schema.SchemaField("field1", "STRING"), @@ -417,7 +443,7 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("999.123456789"), ], ), - ( + pytest.param( "BIGNUMERIC", [ decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), @@ -425,6 +451,7 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), decimal.Decimal("3.141592653589793238462643383279"), ], + marks=skip_if_no_bignumeric, ), ("BOOLEAN", [True, None, False, None]), ("BOOL", [False, None, True, None]), @@ -479,6 +506,7 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): ], ) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def 
test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows): series = pandas.Series(rows, dtype="object") bq_field = schema.SchemaField("field_name", bq_type) @@ -513,6 +541,7 @@ def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows): ], ) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows): rows = [pandas.Timestamp(row) for row in rows] series = pandas.Series(rows) @@ -523,6 +552,7 @@ def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_array_w_arrays(module_under_test): rows = [[1, 2, 3], [], [4, 5, 6]] series = pandas.Series(rows, dtype="object") @@ -534,6 +564,7 @@ def test_bq_to_arrow_array_w_arrays(module_under_test): @pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_array_w_structs(module_under_test, bq_type): rows = [ {"int_col": 123, "string_col": "abc"}, @@ -555,6 +586,7 @@ def test_bq_to_arrow_array_w_structs(module_under_test, bq_type): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_array_w_special_floats(module_under_test): bq_field = schema.SchemaField("field_name", "FLOAT64") rows = [float("-inf"), float("nan"), float("inf"), None] @@ -622,6 +654,7 @@ def test_bq_to_arrow_array_w_geography_type_wkb_data(module_under_test): assert array.to_pylist() == list(series) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_schema_w_unknown_type(module_under_test): fields = ( schema.SchemaField("field1", "STRING"), @@ -647,6 +680,7 @@ def test_get_column_or_index_not_found(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_get_column_or_index_with_multiindex_not_found(module_under_test): dataframe = pandas.DataFrame( {"column_name": [1, 2, 3, 4, 5, 6]}, @@ -984,6 +1018,7 @@ def test_dataframe_to_arrow_with_multiindex(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_arrow_with_required_fields(module_under_test): bq_schema = ( schema.SchemaField("field01", "STRING", mode="REQUIRED"), @@ -993,7 +1028,7 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field05", "FLOAT", mode="REQUIRED"), schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"), schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"), - schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"), + # schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"), schema.SchemaField("field09", "BOOLEAN", mode="REQUIRED"), schema.SchemaField("field10", "BOOL", mode="REQUIRED"), schema.SchemaField("field11", "TIMESTAMP", mode="REQUIRED"), @@ -1002,6 +1037,8 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field14", 
"DATETIME", mode="REQUIRED"), schema.SchemaField("field15", "GEOGRAPHY", mode="REQUIRED"), ) + if _BIGNUMERIC_SUPPORT: + bq_schema += (schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"),) data = { "field01": ["hello", "world"], @@ -1011,10 +1048,10 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): "field05": [1.25, 9.75], "field06": [-1.75, -3.5], "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], - "field08": [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ], + # "field08": [ + # decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + # decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + # ], "field09": [True, False], "field10": [False, True], "field11": [ @@ -1029,6 +1066,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): ], "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], } + if _BIGNUMERIC_SUPPORT: + data["field08"] = [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ] dataframe = pandas.DataFrame(data) arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema) @@ -1040,6 +1082,7 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_arrow_with_unknown_type(module_under_test): bq_schema = ( schema.SchemaField("field00", "UNKNOWN_TYPE"), @@ -1072,6 +1115,7 @@ def test_dataframe_to_arrow_with_unknown_type(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_arrow_dict_sequence_schema(module_under_test): dict_schema = [ {"name": "field01", "type": "STRING", "mode": "REQUIRED"}, @@ -1093,6 +1137,15 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pyarrow", None) + with pytest.raises(ValueError) as exc_context: + module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None) + assert "pyarrow is required" in str(exc_context.value) + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_parquet_w_extra_fields(module_under_test): with pytest.raises(ValueError) as exc_context: module_under_test.dataframe_to_parquet( @@ -1104,6 +1157,7 @@ def test_dataframe_to_parquet_w_extra_fields(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_parquet_w_missing_fields(module_under_test): with pytest.raises(ValueError) as exc_context: module_under_test.dataframe_to_parquet( @@ -1115,6 +1169,7 @@ def test_dataframe_to_parquet_w_missing_fields(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_parquet_compression_method(module_under_test): bq_schema = (schema.SchemaField("field00", "STRING"),) dataframe = pandas.DataFrame({"field00": ["foo", "bar"]}) @@ 
-1134,6 +1189,34 @@ def test_dataframe_to_parquet_compression_method(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test): + dataframe = pandas.DataFrame( + data=[ + {"id": 10, "status": "FOO", "execution_date": datetime.date(2019, 5, 10)}, + {"id": 20, "status": "BAR", "created_at": datetime.date(2018, 9, 12)}, + ] + ) + + no_pyarrow_patch = mock.patch(module_under_test.__name__ + ".pyarrow", None) + + with no_pyarrow_patch, warnings.catch_warnings(record=True) as warned: + detected_schema = module_under_test.dataframe_to_bq_schema( + dataframe, bq_schema=[] + ) + + assert detected_schema is None + + # a warning should also be issued + expected_warnings = [ + warning for warning in warned if "could not determine" in str(warning).lower() + ] + assert len(expected_warnings) == 1 + msg = str(expected_warnings[0]) + assert "execution_date" in msg and "created_at" in msg + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test): dataframe = pandas.DataFrame( data=[ @@ -1163,6 +1246,7 @@ def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test): dataframe = pandas.DataFrame( data=[ @@ -1249,6 +1333,7 @@ def test__first_array_valid_no_arrays_with_valid_items(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_augment_schema_type_detection_succeeds(module_under_test): dataframe = pandas.DataFrame( data=[ @@ -1282,8 +1367,12 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"), schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), - schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), + # schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), ) + if _BIGNUMERIC_SUPPORT: + current_schema += ( # type: ignore + schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), # type: ignore + ) # type: ignore with warnings.catch_warnings(record=True) as warned: augmented_schema = module_under_test.augment_schema(dataframe, current_schema) @@ -1305,16 +1394,23 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type="BYTES", mode="NULLABLE"), schema.SchemaField("string_field", field_type="STRING", mode="NULLABLE"), schema.SchemaField("numeric_field", field_type="NUMERIC", mode="NULLABLE"), - schema.SchemaField( - "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" - ), - ) + # schema.SchemaField( + # "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" + # ), + ) + if _BIGNUMERIC_SUPPORT: + expected_schema += ( + schema.SchemaField( + "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" + ), + ) by_name = operator.attrgetter("name") assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) @pytest.mark.skipif(pandas is None, reason="Requires 
`pandas`") +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_augment_schema_repeated_fields(module_under_test): dataframe = pandas.DataFrame( data=[ @@ -1427,6 +1523,7 @@ def test_augment_schema_type_detection_fails_array_data(module_under_test): assert "all_none_array" in warning_msg and "empty_array" in warning_msg +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_dataframe_to_parquet_dict_sequence_schema(module_under_test): pandas = pytest.importorskip("pandas") @@ -1457,6 +1554,9 @@ def test_dataframe_to_parquet_dict_sequence_schema(module_under_test): assert schema_arg == expected_schema_arg +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test__download_table_bqstorage_stream_includes_read_session( monkeypatch, module_under_test ): @@ -1487,7 +1587,8 @@ def test__download_table_bqstorage_stream_includes_read_session( @pytest.mark.skipif( - not _helpers.BQ_STORAGE_VERSIONS.is_read_session_optional, + bigquery_storage is None + or not _helpers.BQ_STORAGE_VERSIONS.is_read_session_optional, reason="Requires `google-cloud-bigquery-storage` >= 2.6.0", ) def test__download_table_bqstorage_stream_omits_read_session( @@ -1527,6 +1628,9 @@ def test__download_table_bqstorage_stream_omits_read_session( (7, {"max_queue_size": None}, 7, 0), # infinite queue size ], ) +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test__download_table_bqstorage( module_under_test, stream_count, @@ -1577,6 +1681,7 @@ def fake_download_stream( assert queue_used.maxsize == expected_maxsize +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_download_arrow_row_iterator_unknown_field_type(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), @@ -1612,6 +1717,7 @@ def test_download_arrow_row_iterator_unknown_field_type(module_under_test): assert col.to_pylist() == [2.2, 22.22, 222.222] +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_download_arrow_row_iterator_known_field_type(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), @@ -1646,6 +1752,7 @@ def test_download_arrow_row_iterator_known_field_type(module_under_test): assert col.to_pylist() == ["2.2", "22.22", "222.222"] +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_download_arrow_row_iterator_dict_sequence_schema(module_under_test): fake_page = api_core.page_iterator.Page( parent=mock.Mock(), diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 30bab8fa9..7f3bb6032 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -27,6 +27,7 @@ import warnings import mock +import packaging import requests import pytest import pkg_resources @@ -53,6 +54,11 @@ msg = "Error importing from opentelemetry, is the installed version compatible?" 
raise ImportError(msg) from exc +try: + import pyarrow +except (ImportError, AttributeError): # pragma: NO COVER + pyarrow = None + import google.api_core.exceptions from google.api_core import client_info import google.cloud._helpers @@ -6833,6 +6839,7 @@ def test_load_table_from_file_w_invalid_job_config(self): assert "Expected an instance of LoadJobConfig" in err_msg @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -6928,6 +6935,7 @@ def test_load_table_from_dataframe(self): assert "description" not in field @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_client_location(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -6972,6 +6980,7 @@ def test_load_table_from_dataframe_w_client_location(self): assert sent_config.source_format == job.SourceFormat.PARQUET @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_custom_job_config_wihtout_source_format(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7026,6 +7035,7 @@ def test_load_table_from_dataframe_w_custom_job_config_wihtout_source_format(sel assert job_config.to_api_repr() == original_config_copy.to_api_repr() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7081,6 +7091,7 @@ def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): assert job_config.to_api_repr() == original_config_copy.to_api_repr() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_parquet_options_none(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7132,6 +7143,7 @@ def test_load_table_from_dataframe_w_parquet_options_none(self): assert sent_config.parquet_options.enable_list_inference is True @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_list_inference_none(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7191,6 +7203,7 @@ def test_load_table_from_dataframe_w_list_inference_none(self): assert job_config.to_api_repr() == original_config_copy.to_api_repr() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_list_inference_false(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7251,6 +7264,7 @@ def test_load_table_from_dataframe_w_list_inference_false(self): assert job_config.to_api_repr() == original_config_copy.to_api_repr() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def 
test_load_table_from_dataframe_w_custom_job_config_w_wrong_source_format(self): from google.cloud.bigquery import job @@ -7270,6 +7284,7 @@ def test_load_table_from_dataframe_w_custom_job_config_w_wrong_source_format(sel assert "Got unexpected source_format:" in str(exc.value) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_automatic_schema(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7370,6 +7385,7 @@ def test_load_table_from_dataframe_w_automatic_schema(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_automatic_schema_detection_fails(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7429,6 +7445,7 @@ def test_load_table_from_dataframe_w_automatic_schema_detection_fails(self): assert sent_config.schema is None @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_index_and_auto_schema(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7490,6 +7507,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): assert sent_schema == expected_sent_schema @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_unknown_table(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES @@ -7528,6 +7546,7 @@ def test_load_table_from_dataframe_unknown_table(self): pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIUM_VERSION, "Only `pandas version >=1.0.0` supported", ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nullable_int64_datatype(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7575,6 +7594,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype(self): pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIUM_VERSION, "Only `pandas version >=1.0.0` supported", ) + # @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7619,6 +7639,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_struct_fields(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7678,6 +7699,7 @@ def test_load_table_from_dataframe_struct_fields(self): assert sent_config.schema == schema @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_array_fields(self): """Test that a DataFrame with array columns can be uploaded correctly. 
@@ -7742,6 +7764,7 @@ def test_load_table_from_dataframe_array_fields(self): assert sent_config.schema == schema @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_array_fields_w_auto_schema(self): """Test that a DataFrame with array columns can be uploaded correctly. @@ -7804,6 +7827,7 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self): assert sent_config.schema == expected_schema @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_partial_schema(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -7887,6 +7911,7 @@ def test_load_table_from_dataframe_w_partial_schema(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_partial_schema_extra_types(self): from google.cloud.bigquery import job from google.cloud.bigquery.schema import SchemaField @@ -7923,6 +7948,7 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self): assert "unknown_col" in message @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): from google.cloud.bigquery import job from google.cloud.bigquery.schema import SchemaField @@ -7955,6 +7981,74 @@ def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): assert call_args.kwargs.get("parquet_compression") == "LZ4" @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + def test_load_table_from_dataframe_wo_pyarrow_raises_error(self): + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) + to_parquet_patch = mock.patch.object( + dataframe, "to_parquet", wraps=dataframe.to_parquet + ) + + with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch: + with pytest.raises(ValueError): + client.load_table_from_dataframe( + dataframe, + self.TABLE_REF, + location=self.LOCATION, + parquet_compression="gzip", + ) + + def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): + pytest.importorskip("pandas", reason="Requires `pandas`") + pytest.importorskip("pyarrow", reason="Requires `pyarrow`") + + client = self._make_client() + records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] + dataframe = pandas.DataFrame(records) + + pyarrow_version_patch = mock.patch( + "google.cloud.bigquery.client._PYARROW_VERSION", + packaging.version.parse("2.0.0"), # A known bad version of pyarrow. 
+ ) + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=google.api_core.exceptions.NotFound("Table not found"), + ) + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with load_patch, get_table_patch, pyarrow_version_patch: + with warnings.catch_warnings(record=True) as warned: + client.load_table_from_dataframe( + dataframe, + self.TABLE_REF, + location=self.LOCATION, + ) + + expected_warnings = [ + warning for warning in warned if "pyarrow" in str(warning).lower() + ] + assert len(expected_warnings) == 1 + assert issubclass(expected_warnings[0].category, RuntimeWarning) + msg = str(expected_warnings[0].message) + assert "pyarrow 2.0.0" in msg + assert "data corruption" in msg + + @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nulls(self): """Test that a DataFrame with null columns can be uploaded if a BigQuery schema is specified. From 47a489b8e965968ad3b8450996c2890dd7949f0d Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Thu, 7 Jul 2022 20:17:45 -0700 Subject: [PATCH 03/47] clean up comments --- google/cloud/bigquery/__init__.py | 4 - google/cloud/bigquery/_helpers.py | 3 - google/cloud/bigquery/_pandas_helpers.py | 3 - google/cloud/bigquery/client.py | 18 - google/cloud/bigquery/exceptions.py | 5 - google/cloud/bigquery/table.py | 11 - setup.py | 1 - tests/system/test_arrow.py | 338 +-- tests/system/test_job_retry.py | 144 +- tests/system/test_list_rows.py | 240 +- tests/system/test_magics.py | 166 +- tests/system/test_pandas.py | 2602 +++++++++++----------- tests/system/test_query.py | 1006 ++++----- tests/unit/test__helpers.py | 1 - 14 files changed, 2248 insertions(+), 2294 deletions(-) diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index a0813f96b..5a4520476 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -42,8 +42,6 @@ from google.cloud.bigquery.enums import KeyResultStatementKind from google.cloud.bigquery.enums import SqlTypeNames from google.cloud.bigquery.enums import StandardSqlTypeNames - -# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -197,8 +195,6 @@ "WriteDisposition", # EncryptionConfiguration "EncryptionConfiguration", - # Custom exceptions - # "LegacyBigQueryStorageError", ] diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index ab2d4004c..4149c1ffc 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -32,8 +32,6 @@ import packaging.version -# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - from google.cloud.bigquery.exceptions import ( LegacyBigQueryStorageError, LegacyPyarrowError, @@ -60,7 +58,6 @@ _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") _MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") -# _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index feea9ba42..3dd3d5a80 100644 --- 
a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -52,9 +52,6 @@ except ImportError: # pragma: NO COVER pyarrow = None -# import pyarrow # type: ignore -# import pyarrow.parquet # type: ignore - try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` from shapely.geometry.base import BaseGeometry as _BaseGeometry # type: ignore diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 3e84cd11c..142f1e305 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -78,8 +78,6 @@ from google.cloud.bigquery._helpers import _get_sub_prop from google.cloud.bigquery._helpers import _record_field_to_json from google.cloud.bigquery._helpers import _str_or_none - -# from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS from google.cloud.bigquery._helpers import _verify_job_config_type from google.cloud.bigquery._helpers import _get_bigquery_host from google.cloud.bigquery._helpers import _DEFAULT_HOST @@ -90,8 +88,6 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import AutoRowIDs - -# from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job from google.cloud.bigquery.job import ( @@ -539,22 +535,8 @@ def _ensure_bqstorage_client( Returns: A BigQuery Storage API client. """ - # try: from google.cloud import bigquery_storage # type: ignore - # except ImportError: - # warnings.warn( - # "Cannot create BigQuery Storage client, the dependency " - # "google-cloud-bigquery-storage is not installed." - # ) - # return None - - # try: - # BQ_STORAGE_VERSIONS.verify_version() - # except LegacyBigQueryStorageError as exc: - # warnings.warn(str(exc)) - # return None - if bqstorage_client is None: bqstorage_client = bigquery_storage.BigQueryReadClient( credentials=self._credentials, diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py index 2bab97fea..ed2e59bb8 100644 --- a/google/cloud/bigquery/exceptions.py +++ b/google/cloud/bigquery/exceptions.py @@ -16,10 +16,5 @@ class BigQueryError(Exception): """Base class for all custom exceptions defined by the BigQuery client.""" - -class LegacyBigQueryStorageError(BigQueryError): - """Raised when too old a version of BigQuery Storage extra is detected at runtime.""" - - class LegacyPyarrowError(BigQueryError): """Raised when too old a version of pyarrow package is detected at runtime.""" diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index b92d9c1fe..9d0cd6544 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1592,17 +1592,6 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): if self.max_results is not None: return False - # try: - # from google.cloud import bigquery_storage # noqa: F401 - # except ImportError: - # return False - - # try: - # _helpers.BQ_STORAGE_VERSIONS.verify_version() - # except LegacyBigQueryStorageError as exc: - # warnings.warn(str(exc)) - # return False - return True def _get_next_page_response(self): diff --git a/setup.py b/setup.py index 1045fdc84..32479d8f7 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,6 @@ "packaging >= 14.3, <22.0.0dev", "protobuf >= 3.12.0, <4.0.0dev", # For the legacy proto-based types. 
"python-dateutil >= 2.7.2, <3.0dev", - # "pyarrow >= 3.0.0, < 9.0dev", "requests >= 2.18.0, < 3.0.0dev", ] extras = { diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py index bb8f0c17f..8b88b6844 100644 --- a/tests/system/test_arrow.py +++ b/tests/system/test_arrow.py @@ -1,169 +1,169 @@ -# # Copyright 2021 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # https://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# """System tests for Arrow connector.""" - -# from typing import Optional - -# import pyarrow -# import pytest - -# from google.cloud import bigquery -# from google.cloud.bigquery import enums - - -# @pytest.mark.parametrize( -# ("max_results", "scalars_table_name"), -# ( -# (None, "scalars_table"), # Use BQ Storage API. -# (10, "scalars_table"), # Use REST API. -# (None, "scalars_extreme_table"), # Use BQ Storage API. -# (10, "scalars_extreme_table"), # Use REST API. -# ), -# ) -# def test_list_rows_nullable_scalars_dtypes( -# bigquery_client: bigquery.Client, -# scalars_table: str, -# scalars_extreme_table: str, -# max_results: Optional[int], -# scalars_table_name: str, -# ): -# table_id = scalars_table -# if scalars_table_name == "scalars_extreme_table": -# table_id = scalars_extreme_table - -# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the -# # BigQuery Storage API and pyarrow. -# schema = [ -# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), -# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), -# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), -# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), -# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), -# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), -# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), -# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), -# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), -# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), -# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), -# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), -# ] - -# arrow_table = bigquery_client.list_rows( -# table_id, -# max_results=max_results, -# selected_fields=schema, -# ).to_arrow() - -# schema = arrow_table.schema -# bignumeric_type = schema.field("bignumeric_col").type -# # 77th digit is partial. 
-# # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types -# assert bignumeric_type.precision in {76, 77} -# assert bignumeric_type.scale == 38 - -# bool_type = schema.field("bool_col").type -# assert bool_type.equals(pyarrow.bool_()) - -# bytes_type = schema.field("bytes_col").type -# assert bytes_type.equals(pyarrow.binary()) - -# date_type = schema.field("date_col").type -# assert date_type.equals(pyarrow.date32()) - -# datetime_type = schema.field("datetime_col").type -# assert datetime_type.unit == "us" -# assert datetime_type.tz is None - -# float64_type = schema.field("float64_col").type -# assert float64_type.equals(pyarrow.float64()) - -# geography_type = schema.field("geography_col").type -# assert geography_type.equals(pyarrow.string()) - -# int64_type = schema.field("int64_col").type -# assert int64_type.equals(pyarrow.int64()) - -# numeric_type = schema.field("numeric_col").type -# assert numeric_type.precision == 38 -# assert numeric_type.scale == 9 - -# string_type = schema.field("string_col").type -# assert string_type.equals(pyarrow.string()) - -# time_type = schema.field("time_col").type -# assert time_type.equals(pyarrow.time64("us")) - -# timestamp_type = schema.field("timestamp_col").type -# assert timestamp_type.unit == "us" -# assert timestamp_type.tz is not None - - -# @pytest.mark.parametrize("do_insert", [True, False]) -# def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( -# dataset_client, test_table_name, do_insert -# ): -# types = dict( -# astring=("STRING", "'x'"), -# astring9=("STRING(9)", "'x'"), -# abytes=("BYTES", "b'x'"), -# abytes9=("BYTES(9)", "b'x'"), -# anumeric=("NUMERIC", "42"), -# anumeric9=("NUMERIC(9)", "42"), -# anumeric92=("NUMERIC(9,2)", "42"), -# abignumeric=("BIGNUMERIC", "42e30"), -# abignumeric49=("BIGNUMERIC(37)", "42e30"), -# abignumeric492=("BIGNUMERIC(37,2)", "42e30"), -# abool=("BOOL", "true"), -# adate=("DATE", "'2021-09-06'"), -# adatetime=("DATETIME", "'2021-09-06T09:57:26'"), -# ageography=("GEOGRAPHY", "ST_GEOGFROMTEXT('point(0 0)')"), -# # Can't get arrow data for interval :( -# # ainterval=('INTERVAL', "make_interval(1, 2, 3, 4, 5, 6)"), -# aint64=("INT64", "42"), -# afloat64=("FLOAT64", "42.0"), -# astruct=("STRUCT", "struct(42)"), -# atime=("TIME", "'1:2:3'"), -# atimestamp=("TIMESTAMP", "'2021-09-06T09:57:26'"), -# ) -# columns = ", ".join(f"{k} {t[0]}" for k, t in types.items()) -# dataset_client.query(f"create table {test_table_name} ({columns})").result() -# if do_insert: -# names = list(types) -# values = ", ".join(types[name][1] for name in names) -# names = ", ".join(names) -# dataset_client.query( -# f"insert into {test_table_name} ({names}) values ({values})" -# ).result() -# at = dataset_client.query(f"select * from {test_table_name}").result().to_arrow() -# storage_api_metadata = { -# at.field(i).name: at.field(i).metadata for i in range(at.num_columns) -# } -# at = ( -# dataset_client.query(f"select * from {test_table_name}") -# .result() -# .to_arrow(create_bqstorage_client=False) -# ) -# rest_api_metadata = { -# at.field(i).name: at.field(i).metadata for i in range(at.num_columns) -# } - -# assert rest_api_metadata == storage_api_metadata -# assert rest_api_metadata["adatetime"] == { -# b"ARROW:extension:name": b"google:sqlType:datetime" -# } -# assert rest_api_metadata["ageography"] == { -# b"ARROW:extension:name": b"google:sqlType:geography", -# b"ARROW:extension:metadata": b'{"encoding": "WKT"}', -# } +# Copyright 2021 Google LLC +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Arrow connector.""" + +from typing import Optional + +import pyarrow +import pytest + +from google.cloud import bigquery +from google.cloud.bigquery import enums + + +@pytest.mark.parametrize( + ("max_results", "scalars_table_name"), + ( + (None, "scalars_table"), # Use BQ Storage API. + (10, "scalars_table"), # Use REST API. + (None, "scalars_extreme_table"), # Use BQ Storage API. + (10, "scalars_extreme_table"), # Use REST API. + ), +) +def test_list_rows_nullable_scalars_dtypes( + bigquery_client: bigquery.Client, + scalars_table: str, + scalars_extreme_table: str, + max_results: Optional[int], + scalars_table_name: str, +): + table_id = scalars_table + if scalars_table_name == "scalars_extreme_table": + table_id = scalars_extreme_table + + # TODO(GH#836): Avoid INTERVAL columns until they are supported by the + # BigQuery Storage API and pyarrow. + schema = [ + bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), + bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), + bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), + bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), + bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), + bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), + bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), + bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), + bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), + bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), + bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), + bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), + ] + + arrow_table = bigquery_client.list_rows( + table_id, + max_results=max_results, + selected_fields=schema, + ).to_arrow() + + schema = arrow_table.schema + bignumeric_type = schema.field("bignumeric_col").type + # 77th digit is partial. 
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + assert bignumeric_type.precision in {76, 77} + assert bignumeric_type.scale == 38 + + bool_type = schema.field("bool_col").type + assert bool_type.equals(pyarrow.bool_()) + + bytes_type = schema.field("bytes_col").type + assert bytes_type.equals(pyarrow.binary()) + + date_type = schema.field("date_col").type + assert date_type.equals(pyarrow.date32()) + + datetime_type = schema.field("datetime_col").type + assert datetime_type.unit == "us" + assert datetime_type.tz is None + + float64_type = schema.field("float64_col").type + assert float64_type.equals(pyarrow.float64()) + + geography_type = schema.field("geography_col").type + assert geography_type.equals(pyarrow.string()) + + int64_type = schema.field("int64_col").type + assert int64_type.equals(pyarrow.int64()) + + numeric_type = schema.field("numeric_col").type + assert numeric_type.precision == 38 + assert numeric_type.scale == 9 + + string_type = schema.field("string_col").type + assert string_type.equals(pyarrow.string()) + + time_type = schema.field("time_col").type + assert time_type.equals(pyarrow.time64("us")) + + timestamp_type = schema.field("timestamp_col").type + assert timestamp_type.unit == "us" + assert timestamp_type.tz is not None + + +@pytest.mark.parametrize("do_insert", [True, False]) +def test_arrow_extension_types_same_for_storage_and_REST_APIs_894( + dataset_client, test_table_name, do_insert +): + types = dict( + astring=("STRING", "'x'"), + astring9=("STRING(9)", "'x'"), + abytes=("BYTES", "b'x'"), + abytes9=("BYTES(9)", "b'x'"), + anumeric=("NUMERIC", "42"), + anumeric9=("NUMERIC(9)", "42"), + anumeric92=("NUMERIC(9,2)", "42"), + abignumeric=("BIGNUMERIC", "42e30"), + abignumeric49=("BIGNUMERIC(37)", "42e30"), + abignumeric492=("BIGNUMERIC(37,2)", "42e30"), + abool=("BOOL", "true"), + adate=("DATE", "'2021-09-06'"), + adatetime=("DATETIME", "'2021-09-06T09:57:26'"), + ageography=("GEOGRAPHY", "ST_GEOGFROMTEXT('point(0 0)')"), + # Can't get arrow data for interval :( + # ainterval=('INTERVAL', "make_interval(1, 2, 3, 4, 5, 6)"), + aint64=("INT64", "42"), + afloat64=("FLOAT64", "42.0"), + astruct=("STRUCT", "struct(42)"), + atime=("TIME", "'1:2:3'"), + atimestamp=("TIMESTAMP", "'2021-09-06T09:57:26'"), + ) + columns = ", ".join(f"{k} {t[0]}" for k, t in types.items()) + dataset_client.query(f"create table {test_table_name} ({columns})").result() + if do_insert: + names = list(types) + values = ", ".join(types[name][1] for name in names) + names = ", ".join(names) + dataset_client.query( + f"insert into {test_table_name} ({names}) values ({values})" + ).result() + at = dataset_client.query(f"select * from {test_table_name}").result().to_arrow() + storage_api_metadata = { + at.field(i).name: at.field(i).metadata for i in range(at.num_columns) + } + at = ( + dataset_client.query(f"select * from {test_table_name}") + .result() + .to_arrow(create_bqstorage_client=False) + ) + rest_api_metadata = { + at.field(i).name: at.field(i).metadata for i in range(at.num_columns) + } + + assert rest_api_metadata == storage_api_metadata + assert rest_api_metadata["adatetime"] == { + b"ARROW:extension:name": b"google:sqlType:datetime" + } + assert rest_api_metadata["ageography"] == { + b"ARROW:extension:name": b"google:sqlType:geography", + b"ARROW:extension:metadata": b'{"encoding": "WKT"}', + } diff --git a/tests/system/test_job_retry.py b/tests/system/test_job_retry.py index 53f9f4943..520545493 100644 --- 
a/tests/system/test_job_retry.py +++ b/tests/system/test_job_retry.py @@ -1,72 +1,72 @@ -# # Copyright 2021 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # https://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import contextlib -# import threading -# import time - -# import google.api_core.exceptions -# import google.cloud.bigquery -# import pytest - - -# def thread(func): -# thread = threading.Thread(target=func, daemon=True) -# thread.start() -# return thread - - -# @pytest.mark.parametrize("job_retry_on_query", [True, False]) -# def test_query_retry_539(bigquery_client, dataset_id, job_retry_on_query): -# """ -# Test job_retry - -# See: https://github.com/googleapis/python-bigquery/issues/539 -# """ -# from google.api_core import exceptions -# from google.api_core.retry import if_exception_type, Retry - -# table_name = f"{dataset_id}.t539" - -# # Without a custom retry, we fail: -# with pytest.raises(google.api_core.exceptions.NotFound): -# bigquery_client.query(f"select count(*) from {table_name}").result() - -# retry_notfound = Retry(predicate=if_exception_type(exceptions.NotFound)) - -# job_retry = dict(job_retry=retry_notfound) if job_retry_on_query else {} -# job = bigquery_client.query(f"select count(*) from {table_name}", **job_retry) -# job_id = job.job_id - -# # We can already know that the job failed, but we're not supposed -# # to find out until we call result, which is where retry happend -# assert job.done() -# assert job.exception() is not None - -# @thread -# def create_table(): -# time.sleep(1) # Give the first retry attempt time to fail. -# with contextlib.closing(google.cloud.bigquery.Client()) as client: -# client.query(f"create table {table_name} (id int64)").result() - -# job_retry = {} if job_retry_on_query else dict(job_retry=retry_notfound) -# [[count]] = list(job.result(**job_retry)) -# assert count == 0 - -# # The job was retried, and thus got a new job id -# assert job.job_id != job_id - -# # Make sure we don't leave a thread behind: -# create_table.join() -# bigquery_client.query(f"drop table {table_name}").result() +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import contextlib +import threading +import time + +import google.api_core.exceptions +import google.cloud.bigquery +import pytest + + +def thread(func): + thread = threading.Thread(target=func, daemon=True) + thread.start() + return thread + + +@pytest.mark.parametrize("job_retry_on_query", [True, False]) +def test_query_retry_539(bigquery_client, dataset_id, job_retry_on_query): + """ + Test job_retry + + See: https://github.com/googleapis/python-bigquery/issues/539 + """ + from google.api_core import exceptions + from google.api_core.retry import if_exception_type, Retry + + table_name = f"{dataset_id}.t539" + + # Without a custom retry, we fail: + with pytest.raises(google.api_core.exceptions.NotFound): + bigquery_client.query(f"select count(*) from {table_name}").result() + + retry_notfound = Retry(predicate=if_exception_type(exceptions.NotFound)) + + job_retry = dict(job_retry=retry_notfound) if job_retry_on_query else {} + job = bigquery_client.query(f"select count(*) from {table_name}", **job_retry) + job_id = job.job_id + + # We can already know that the job failed, but we're not supposed + # to find out until we call result, which is where retry happend + assert job.done() + assert job.exception() is not None + + @thread + def create_table(): + time.sleep(1) # Give the first retry attempt time to fail. + with contextlib.closing(google.cloud.bigquery.Client()) as client: + client.query(f"create table {table_name} (id int64)").result() + + job_retry = {} if job_retry_on_query else dict(job_retry=retry_notfound) + [[count]] = list(job.result(**job_retry)) + assert count == 0 + + # The job was retried, and thus got a new job id + assert job.job_id != job_id + + # Make sure we don't leave a thread behind: + create_table.join() + bigquery_client.query(f"drop table {table_name}").result() diff --git a/tests/system/test_list_rows.py b/tests/system/test_list_rows.py index 065966055..4c08958c3 100644 --- a/tests/system/test_list_rows.py +++ b/tests/system/test_list_rows.py @@ -1,120 +1,120 @@ -# # Copyright 2021 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import datetime -# import decimal - -# from dateutil import relativedelta - -# from google.cloud import bigquery -# from google.cloud.bigquery import enums - - -# def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str): -# from google.cloud.bigquery.table import RowIterator - -# table = bigquery_client.create_table(table_id) - -# # It's a bit silly to list rows for an empty table, but this does -# # happen as the result of a DDL query from an IPython magic command. 
-# rows = bigquery_client.list_rows(table) -# assert isinstance(rows, RowIterator) -# assert tuple(rows) == () - - -# def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str): -# num_items = 7 -# page_size = 3 -# num_pages, num_last_page = divmod(num_items, page_size) - -# to_insert = [{"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)] -# bigquery_client.load_table_from_json(to_insert, table_id).result() - -# df = bigquery_client.list_rows( -# table_id, -# selected_fields=[bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)], -# page_size=page_size, -# ) -# pages = df.pages - -# for i in range(num_pages): -# page = next(pages) -# assert page.num_items == page_size -# page = next(pages) -# assert page.num_items == num_last_page - - -# def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str): -# rows = sorted( -# bigquery_client.list_rows(scalars_table), key=lambda row: row["rowindex"] -# ) -# row = rows[0] -# assert row["bool_col"] # True -# assert row["bytes_col"] == b"Hello, World!" -# assert row["date_col"] == datetime.date(2021, 7, 21) -# assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45) -# assert row["geography_col"] == "POINT(-122.0838511 37.3860517)" -# assert row["int64_col"] == 123456789 -# assert row["interval_col"] == relativedelta.relativedelta( -# years=7, months=11, days=9, hours=4, minutes=15, seconds=37, microseconds=123456 -# ) -# assert row["numeric_col"] == decimal.Decimal("1.23456789") -# assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819") -# assert row["float64_col"] == 1.25 -# assert row["string_col"] == "Hello, World!" -# assert row["time_col"] == datetime.time(11, 41, 43, 76160) -# assert row["timestamp_col"] == datetime.datetime( -# 2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc -# ) - -# nullrow = rows[1] -# for column, value in nullrow.items(): -# if column == "rowindex": -# assert value == 1 -# else: -# assert value is None - - -# def test_list_rows_scalars_extreme( -# bigquery_client: bigquery.Client, scalars_extreme_table: str -# ): -# rows = sorted( -# bigquery_client.list_rows(scalars_extreme_table), -# key=lambda row: row["rowindex"], -# ) -# row = rows[0] -# assert row["bool_col"] # True -# assert row["bytes_col"] == b"\r\n" -# assert row["date_col"] == datetime.date(9999, 12, 31) -# assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) -# assert row["geography_col"] == "POINT(-135 90)" -# assert row["int64_col"] == 9223372036854775807 -# assert row["interval_col"] == relativedelta.relativedelta( -# years=-10000, days=-3660000, hours=-87840000 -# ) -# assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28") -# assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37") -# assert row["float64_col"] == float("Inf") -# assert row["string_col"] == "Hello, World" -# assert row["time_col"] == datetime.time(23, 59, 59, 999999) -# assert row["timestamp_col"] == datetime.datetime( -# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc -# ) - -# nullrow = rows[4] -# for column, value in nullrow.items(): -# if column == "rowindex": -# assert value == 4 -# else: -# assert value is None +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import decimal + +from dateutil import relativedelta + +from google.cloud import bigquery +from google.cloud.bigquery import enums + + +def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str): + from google.cloud.bigquery.table import RowIterator + + table = bigquery_client.create_table(table_id) + + # It's a bit silly to list rows for an empty table, but this does + # happen as the result of a DDL query from an IPython magic command. + rows = bigquery_client.list_rows(table) + assert isinstance(rows, RowIterator) + assert tuple(rows) == () + + +def test_list_rows_page_size(bigquery_client: bigquery.Client, table_id: str): + num_items = 7 + page_size = 3 + num_pages, num_last_page = divmod(num_items, page_size) + + to_insert = [{"string_col": "item%d" % i, "rowindex": i} for i in range(num_items)] + bigquery_client.load_table_from_json(to_insert, table_id).result() + + df = bigquery_client.list_rows( + table_id, + selected_fields=[bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING)], + page_size=page_size, + ) + pages = df.pages + + for i in range(num_pages): + page = next(pages) + assert page.num_items == page_size + page = next(pages) + assert page.num_items == num_last_page + + +def test_list_rows_scalars(bigquery_client: bigquery.Client, scalars_table: str): + rows = sorted( + bigquery_client.list_rows(scalars_table), key=lambda row: row["rowindex"] + ) + row = rows[0] + assert row["bool_col"] # True + assert row["bytes_col"] == b"Hello, World!" + assert row["date_col"] == datetime.date(2021, 7, 21) + assert row["datetime_col"] == datetime.datetime(2021, 7, 21, 11, 39, 45) + assert row["geography_col"] == "POINT(-122.0838511 37.3860517)" + assert row["int64_col"] == 123456789 + assert row["interval_col"] == relativedelta.relativedelta( + years=7, months=11, days=9, hours=4, minutes=15, seconds=37, microseconds=123456 + ) + assert row["numeric_col"] == decimal.Decimal("1.23456789") + assert row["bignumeric_col"] == decimal.Decimal("10.111213141516171819") + assert row["float64_col"] == 1.25 + assert row["string_col"] == "Hello, World!" 
+ assert row["time_col"] == datetime.time(11, 41, 43, 76160) + assert row["timestamp_col"] == datetime.datetime( + 2021, 7, 21, 17, 43, 43, 945289, tzinfo=datetime.timezone.utc + ) + + nullrow = rows[1] + for column, value in nullrow.items(): + if column == "rowindex": + assert value == 1 + else: + assert value is None + + +def test_list_rows_scalars_extreme( + bigquery_client: bigquery.Client, scalars_extreme_table: str +): + rows = sorted( + bigquery_client.list_rows(scalars_extreme_table), + key=lambda row: row["rowindex"], + ) + row = rows[0] + assert row["bool_col"] # True + assert row["bytes_col"] == b"\r\n" + assert row["date_col"] == datetime.date(9999, 12, 31) + assert row["datetime_col"] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) + assert row["geography_col"] == "POINT(-135 90)" + assert row["int64_col"] == 9223372036854775807 + assert row["interval_col"] == relativedelta.relativedelta( + years=-10000, days=-3660000, hours=-87840000 + ) + assert row["numeric_col"] == decimal.Decimal(f"9.{'9' * 37}E+28") + assert row["bignumeric_col"] == decimal.Decimal(f"9.{'9' * 75}E+37") + assert row["float64_col"] == float("Inf") + assert row["string_col"] == "Hello, World" + assert row["time_col"] == datetime.time(23, 59, 59, 999999) + assert row["timestamp_col"] == datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ) + + nullrow = rows[4] + for column, value in nullrow.items(): + if column == "rowindex": + assert value == 4 + else: + assert value is None diff --git a/tests/system/test_magics.py b/tests/system/test_magics.py index c7f03320b..78c15cb50 100644 --- a/tests/system/test_magics.py +++ b/tests/system/test_magics.py @@ -1,83 +1,83 @@ -# # Copyright 2020 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # https://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# """System tests for Jupyter/IPython connector.""" - -# import re - -# import pytest -# import psutil - - -# IPython = pytest.importorskip("IPython") -# io = pytest.importorskip("IPython.utils.io") -# pandas = pytest.importorskip("pandas") -# tools = pytest.importorskip("IPython.testing.tools") -# interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") - - -# @pytest.fixture(scope="session") -# def ipython(): -# config = tools.default_config() -# config.TerminalInteractiveShell.simple_prompt = True -# shell = interactiveshell.TerminalInteractiveShell.instance(config=config) -# return shell - - -# @pytest.fixture() -# def ipython_interactive(ipython): -# """Activate IPython's builtin hooks - -# for the duration of the test scope. 
-# """ -# with ipython.builtin_trap: -# yield ipython - - -# def test_bigquery_magic(ipython_interactive): -# ip = IPython.get_ipython() -# current_process = psutil.Process() -# conn_count_start = len(current_process.connections()) - -# ip.extension_manager.load_extension("google.cloud.bigquery") -# sql = """ -# SELECT -# CONCAT( -# 'https://stackoverflow.com/questions/', -# CAST(id as STRING)) as url, -# view_count -# FROM `bigquery-public-data.stackoverflow.posts_questions` -# WHERE tags like '%google-bigquery%' -# ORDER BY view_count DESC -# LIMIT 10 -# """ -# with io.capture_output() as captured: -# result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) - -# conn_count_end = len(current_process.connections()) - -# lines = re.split("\n|\r", captured.stdout) -# # Removes blanks & terminal code (result of display clearing) -# updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) -# assert re.match("Executing query with job ID: .*", updates[0]) -# assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) -# assert re.match("Query complete after .*s", updates[-1]) -# assert isinstance(result, pandas.DataFrame) -# assert len(result) == 10 # verify row count -# assert list(result) == ["url", "view_count"] # verify column names - -# # NOTE: For some reason, the number of open sockets is sometimes one *less* -# # than expected when running system tests on Kokoro, thus using the <= assertion. -# # That's still fine, however, since the sockets are apparently not leaked. -# assert conn_count_end <= conn_count_start # system resources are released +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for Jupyter/IPython connector.""" + +import re + +import pytest +import psutil + + +IPython = pytest.importorskip("IPython") +io = pytest.importorskip("IPython.utils.io") +pandas = pytest.importorskip("pandas") +tools = pytest.importorskip("IPython.testing.tools") +interactiveshell = pytest.importorskip("IPython.terminal.interactiveshell") + + +@pytest.fixture(scope="session") +def ipython(): + config = tools.default_config() + config.TerminalInteractiveShell.simple_prompt = True + shell = interactiveshell.TerminalInteractiveShell.instance(config=config) + return shell + + +@pytest.fixture() +def ipython_interactive(ipython): + """Activate IPython's builtin hooks + + for the duration of the test scope. 
+ """ + with ipython.builtin_trap: + yield ipython + + +def test_bigquery_magic(ipython_interactive): + ip = IPython.get_ipython() + current_process = psutil.Process() + conn_count_start = len(current_process.connections()) + + ip.extension_manager.load_extension("google.cloud.bigquery") + sql = """ + SELECT + CONCAT( + 'https://stackoverflow.com/questions/', + CAST(id as STRING)) as url, + view_count + FROM `bigquery-public-data.stackoverflow.posts_questions` + WHERE tags like '%google-bigquery%' + ORDER BY view_count DESC + LIMIT 10 + """ + with io.capture_output() as captured: + result = ip.run_cell_magic("bigquery", "--use_rest_api", sql) + + conn_count_end = len(current_process.connections()) + + lines = re.split("\n|\r", captured.stdout) + # Removes blanks & terminal code (result of display clearing) + updates = list(filter(lambda x: bool(x) and x != "\x1b[2K", lines)) + assert re.match("Executing query with job ID: .*", updates[0]) + assert all(re.match("Query executing: .*s", line) for line in updates[1:-1]) + assert re.match("Query complete after .*s", updates[-1]) + assert isinstance(result, pandas.DataFrame) + assert len(result) == 10 # verify row count + assert list(result) == ["url", "view_count"] # verify column names + + # NOTE: For some reason, the number of open sockets is sometimes one *less* + # than expected when running system tests on Kokoro, thus using the <= assertion. + # That's still fine, however, since the sockets are apparently not leaked. + assert conn_count_end <= conn_count_start # system resources are released diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 7154c7b8d..34e4243c4 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -1,1301 +1,1301 @@ -# # Copyright 2021 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # https://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# """System tests for pandas connector.""" - -# import collections -# import datetime -# import decimal -# import json -# import io -# import operator -# import warnings - -# import google.api_core.retry -# import pkg_resources -# import pytest - -# from google.cloud import bigquery -# from google.cloud import bigquery_storage -# from google.cloud.bigquery import enums - -# from . import helpers - - -# pandas = pytest.importorskip("pandas", minversion="0.23.0") -# numpy = pytest.importorskip("numpy") - - -# PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version -# PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") - - -# class MissingDataError(Exception): -# pass - - -# def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id): -# """Test that a DataFrame with dtypes that map well to BigQuery types -# can be uploaded without specifying a schema. 
- -# https://github.com/googleapis/google-cloud-python/issues/9044 -# """ -# df_data = collections.OrderedDict( -# [ -# ("bool_col", pandas.Series([True, False, True], dtype="bool")), -# ( -# "ts_col", -# pandas.Series( -# [ -# datetime.datetime(2010, 1, 2, 3, 44, 50), -# datetime.datetime(2011, 2, 3, 14, 50, 59), -# datetime.datetime(2012, 3, 14, 15, 16), -# ], -# dtype="datetime64[ns]", -# ).dt.tz_localize(datetime.timezone.utc), -# ), -# ( -# "dt_col_no_tz", -# pandas.Series( -# [ -# datetime.datetime(2010, 1, 2, 3, 44, 50), -# datetime.datetime(2011, 2, 3, 14, 50, 59), -# datetime.datetime(2012, 3, 14, 15, 16), -# ], -# dtype="datetime64[ns]", -# ), -# ), -# ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), -# ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), -# ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), -# ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), -# ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), -# ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), -# ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), -# ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), -# ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), -# ( -# "date_col", -# pandas.Series( -# [ -# datetime.date(2010, 1, 2), -# datetime.date(2011, 2, 3), -# datetime.date(2012, 3, 14), -# ], -# dtype="dbdate", -# ), -# ), -# ( -# "time_col", -# pandas.Series( -# [ -# datetime.time(3, 44, 50), -# datetime.time(14, 50, 59), -# datetime.time(15, 16), -# ], -# dtype="dbtime", -# ), -# ), -# ("array_bool_col", pandas.Series([[True], [False], [True]])), -# ( -# "array_ts_col", -# pandas.Series( -# [ -# [ -# datetime.datetime( -# 2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc -# ), -# ], -# [ -# datetime.datetime( -# 2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc -# ), -# ], -# [ -# datetime.datetime( -# 2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc -# ), -# ], -# ], -# ), -# ), -# ( -# "array_dt_col_no_tz", -# pandas.Series( -# [ -# [datetime.datetime(2010, 1, 2, 3, 44, 50)], -# [datetime.datetime(2011, 2, 3, 14, 50, 59)], -# [datetime.datetime(2012, 3, 14, 15, 16)], -# ], -# ), -# ), -# ( -# "array_float32_col", -# pandas.Series( -# [numpy.array([_], dtype="float32") for _ in [1.0, 2.0, 3.0]] -# ), -# ), -# ( -# "array_float64_col", -# pandas.Series( -# [numpy.array([_], dtype="float64") for _ in [4.0, 5.0, 6.0]] -# ), -# ), -# ( -# "array_int8_col", -# pandas.Series( -# [numpy.array([_], dtype="int8") for _ in [-12, -11, -10]] -# ), -# ), -# ( -# "array_int16_col", -# pandas.Series([numpy.array([_], dtype="int16") for _ in [-9, -8, -7]]), -# ), -# ( -# "array_int32_col", -# pandas.Series([numpy.array([_], dtype="int32") for _ in [-6, -5, -4]]), -# ), -# ( -# "array_int64_col", -# pandas.Series([numpy.array([_], dtype="int64") for _ in [-3, -2, -1]]), -# ), -# ( -# "array_uint8_col", -# pandas.Series([numpy.array([_], dtype="uint8") for _ in [0, 1, 2]]), -# ), -# ( -# "array_uint16_col", -# pandas.Series([numpy.array([_], dtype="uint16") for _ in [3, 4, 5]]), -# ), -# ( -# "array_uint32_col", -# pandas.Series([numpy.array([_], dtype="uint32") for _ in [6, 7, 8]]), -# ), -# ] -# ) -# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - -# table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( -# bigquery_client.project, dataset_id -# ) - -# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) -# load_job.result() - -# table = bigquery_client.get_table(table_id) -# assert 
tuple(table.schema) == ( -# bigquery.SchemaField("bool_col", "BOOLEAN"), -# bigquery.SchemaField("ts_col", "TIMESTAMP"), -# bigquery.SchemaField("dt_col_no_tz", "DATETIME"), -# bigquery.SchemaField("float32_col", "FLOAT"), -# bigquery.SchemaField("float64_col", "FLOAT"), -# bigquery.SchemaField("int8_col", "INTEGER"), -# bigquery.SchemaField("int16_col", "INTEGER"), -# bigquery.SchemaField("int32_col", "INTEGER"), -# bigquery.SchemaField("int64_col", "INTEGER"), -# bigquery.SchemaField("uint8_col", "INTEGER"), -# bigquery.SchemaField("uint16_col", "INTEGER"), -# bigquery.SchemaField("uint32_col", "INTEGER"), -# bigquery.SchemaField("date_col", "DATE"), -# bigquery.SchemaField("time_col", "TIME"), -# bigquery.SchemaField("array_bool_col", "BOOLEAN", mode="REPEATED"), -# bigquery.SchemaField("array_ts_col", "TIMESTAMP", mode="REPEATED"), -# bigquery.SchemaField("array_dt_col_no_tz", "DATETIME", mode="REPEATED"), -# bigquery.SchemaField("array_float32_col", "FLOAT", mode="REPEATED"), -# bigquery.SchemaField("array_float64_col", "FLOAT", mode="REPEATED"), -# bigquery.SchemaField("array_int8_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_int16_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_int32_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_int64_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_uint8_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_uint16_col", "INTEGER", mode="REPEATED"), -# bigquery.SchemaField("array_uint32_col", "INTEGER", mode="REPEATED"), -# ) - -# assert numpy.array( -# sorted(map(list, bigquery_client.list_rows(table)), key=lambda r: r[5]), -# dtype="object", -# ).transpose().tolist() == [ -# # bool_col -# [True, False, True], -# # ts_col -# [ -# datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc), -# datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc), -# datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc), -# ], -# # dt_col_no_tz -# [ -# datetime.datetime(2010, 1, 2, 3, 44, 50), -# datetime.datetime(2011, 2, 3, 14, 50, 59), -# datetime.datetime(2012, 3, 14, 15, 16), -# ], -# # float32_col -# [1.0, 2.0, 3.0], -# # float64_col -# [4.0, 5.0, 6.0], -# # int8_col -# [-12, -11, -10], -# # int16_col -# [-9, -8, -7], -# # int32_col -# [-6, -5, -4], -# # int64_col -# [-3, -2, -1], -# # uint8_col -# [0, 1, 2], -# # uint16_col -# [3, 4, 5], -# # uint32_col -# [6, 7, 8], -# # date_col -# [ -# datetime.date(2010, 1, 2), -# datetime.date(2011, 2, 3), -# datetime.date(2012, 3, 14), -# ], -# # time_col -# [datetime.time(3, 44, 50), datetime.time(14, 50, 59), datetime.time(15, 16)], -# # array_bool_col -# [[True], [False], [True]], -# # array_ts_col -# [ -# [datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc)], -# [datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc)], -# [datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc)], -# ], -# # array_dt_col -# [ -# [datetime.datetime(2010, 1, 2, 3, 44, 50)], -# [datetime.datetime(2011, 2, 3, 14, 50, 59)], -# [datetime.datetime(2012, 3, 14, 15, 16)], -# ], -# # array_float32_col -# [[1.0], [2.0], [3.0]], -# # array_float64_col -# [[4.0], [5.0], [6.0]], -# # array_int8_col -# [[-12], [-11], [-10]], -# # array_int16_col -# [[-9], [-8], [-7]], -# # array_int32_col -# [[-6], [-5], [-4]], -# # array_int64_col -# [[-3], [-2], [-1]], -# # array_uint8_col -# [[0], [1], [2]], -# # array_uint16_col -# [[3], [4], [5]], -# # array_uint32_col -# [[6], [7], 
[8]], -# ] - - -# @pytest.mark.skipif( -# PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, -# reason="Only `pandas version >=1.0.0` is supported", -# ) -# def test_load_table_from_dataframe_w_nullable_int64_datatype( -# bigquery_client, dataset_id -# ): -# """Test that a DataFrame containing column with None-type values and int64 datatype -# can be uploaded if a BigQuery schema is specified. - -# https://github.com/googleapis/python-bigquery/issues/22 -# """ -# table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( -# bigquery_client.project, dataset_id -# ) -# table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) -# table = helpers.retry_403(bigquery_client.create_table)( -# bigquery.Table(table_id, schema=table_schema) -# ) - -# df_data = collections.OrderedDict( -# [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] -# ) -# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) -# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) -# load_job.result() -# table = bigquery_client.get_table(table_id) -# assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) -# assert table.num_rows == 4 - - -# @pytest.mark.skipif( -# PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, -# reason="Only `pandas version >=1.0.0` is supported", -# ) -# def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema( -# bigquery_client, dataset_id, table_id -# ): -# """Test that a DataFrame containing column with None-type values and int64 datatype -# can be uploaded without specifying a schema. - -# https://github.com/googleapis/python-bigquery/issues/22 -# """ - -# df_data = collections.OrderedDict( -# [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] -# ) -# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) -# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) -# load_job.result() -# table = bigquery_client.get_table(table_id) -# assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) -# assert table.num_rows == 4 - - -# def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): -# """Test that a DataFrame with null columns can be uploaded if a -# BigQuery schema is specified. - -# See: https://github.com/googleapis/google-cloud-python/issues/7370 -# """ -# # Schema with all scalar types. 
-# table_schema = ( -# bigquery.SchemaField("bool_col", "BOOLEAN"), -# bigquery.SchemaField("bytes_col", "BYTES"), -# bigquery.SchemaField("date_col", "DATE"), -# bigquery.SchemaField("dt_col", "DATETIME"), -# bigquery.SchemaField("float_col", "FLOAT"), -# bigquery.SchemaField("geo_col", "GEOGRAPHY"), -# bigquery.SchemaField("int_col", "INTEGER"), -# bigquery.SchemaField("num_col", "NUMERIC"), -# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), -# bigquery.SchemaField("str_col", "STRING"), -# bigquery.SchemaField("time_col", "TIME"), -# bigquery.SchemaField("ts_col", "TIMESTAMP"), -# ) - -# num_rows = 100 -# nulls = [None] * num_rows -# df_data = [ -# ("bool_col", nulls), -# ("bytes_col", nulls), -# ("date_col", nulls), -# ("dt_col", nulls), -# ("float_col", nulls), -# ("geo_col", nulls), -# ("int_col", nulls), -# ("num_col", nulls), -# ("bignum_col", nulls), -# ("str_col", nulls), -# ("time_col", nulls), -# ("ts_col", nulls), -# ] -# df_data = collections.OrderedDict(df_data) -# dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) - -# table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( -# bigquery_client.project, dataset_id -# ) - -# # Create the table before loading so that schema mismatch errors are -# # identified. -# table = helpers.retry_403(bigquery_client.create_table)( -# bigquery.Table(table_id, schema=table_schema) -# ) - -# job_config = bigquery.LoadJobConfig(schema=table_schema) -# load_job = bigquery_client.load_table_from_dataframe( -# dataframe, table_id, job_config=job_config -# ) -# load_job.result() - -# table = bigquery_client.get_table(table) -# assert tuple(table.schema) == table_schema -# assert table.num_rows == num_rows - - -# def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id): -# """Test that a DataFrame with required columns can be uploaded if a -# BigQuery schema is specified. - -# See: https://github.com/googleapis/google-cloud-python/issues/8093 -# """ -# table_schema = ( -# bigquery.SchemaField("name", "STRING", mode="REQUIRED"), -# bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), -# ) - -# records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] -# dataframe = pandas.DataFrame(records, columns=["name", "age"]) -# job_config = bigquery.LoadJobConfig(schema=table_schema) -# table_id = "{}.{}.load_table_from_dataframe_w_required".format( -# bigquery_client.project, dataset_id -# ) - -# # Create the table before loading so that schema mismatch errors are -# # identified. -# table = helpers.retry_403(bigquery_client.create_table)( -# bigquery.Table(table_id, schema=table_schema) -# ) - -# job_config = bigquery.LoadJobConfig(schema=table_schema) -# load_job = bigquery_client.load_table_from_dataframe( -# dataframe, table_id, job_config=job_config -# ) -# load_job.result() - -# table = bigquery_client.get_table(table) -# assert tuple(table.schema) == table_schema -# assert table.num_rows == 2 - - -# def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id): -# # Schema with all scalar types. 
-# # See: -# # https://github.com/googleapis/python-bigquery/issues/61 -# # https://issuetracker.google.com/issues/151765076 -# table_schema = ( -# bigquery.SchemaField("row_num", "INTEGER"), -# bigquery.SchemaField("bool_col", "BOOLEAN"), -# bigquery.SchemaField("bytes_col", "BYTES"), -# bigquery.SchemaField("date_col", "DATE"), -# bigquery.SchemaField("dt_col", "DATETIME"), -# bigquery.SchemaField("float_col", "FLOAT"), -# bigquery.SchemaField("geo_col", "GEOGRAPHY"), -# bigquery.SchemaField("int_col", "INTEGER"), -# bigquery.SchemaField("num_col", "NUMERIC"), -# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), -# bigquery.SchemaField("str_col", "STRING"), -# bigquery.SchemaField("time_col", "TIME"), -# bigquery.SchemaField("ts_col", "TIMESTAMP"), -# ) - -# df_data = [ -# ("row_num", [1, 2, 3]), -# ("bool_col", [True, None, False]), -# ("bytes_col", [b"abc", None, b"def"]), -# ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), -# ( -# "dt_col", -# [ -# datetime.datetime(1, 1, 1, 0, 0, 0), -# None, -# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), -# ], -# ), -# ("float_col", [float("-inf"), float("nan"), float("inf")]), -# ( -# "geo_col", -# ["POINT(30 10)", None, "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], -# ), -# ("int_col", [-9223372036854775808, None, 9223372036854775807]), -# ( -# "num_col", -# [ -# decimal.Decimal("-99999999999999999999999999999.999999999"), -# None, -# decimal.Decimal("99999999999999999999999999999.999999999"), -# ], -# ), -# ( -# "bignum_col", -# [ -# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), -# None, -# decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), -# ], -# ), -# ("str_col", ["abc", None, "def"]), -# ( -# "time_col", -# [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], -# ), -# ( -# "ts_col", -# [ -# datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), -# None, -# datetime.datetime( -# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc -# ), -# ], -# ), -# ] -# df_data = collections.OrderedDict(df_data) -# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - -# table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( -# bigquery_client.project, dataset_id -# ) - -# job_config = bigquery.LoadJobConfig(schema=table_schema) -# load_job = bigquery_client.load_table_from_dataframe( -# dataframe, table_id, job_config=job_config -# ) -# load_job.result() - -# table = bigquery_client.get_table(table_id) -# assert tuple(table.schema) == table_schema -# assert table.num_rows == 3 - -# result = bigquery_client.list_rows(table).to_dataframe() -# result.sort_values("row_num", inplace=True) - -# # Check that extreme DATE/DATETIME values are loaded correctly. -# # https://github.com/googleapis/python-bigquery/issues/1076 -# assert result["date_col"][0] == datetime.date(1, 1, 1) -# assert result["date_col"][2] == datetime.date(9999, 12, 31) -# assert result["dt_col"][0] == datetime.datetime(1, 1, 1, 0, 0, 0) -# assert result["dt_col"][2] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) -# assert result["ts_col"][0] == datetime.datetime( -# 1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc -# ) -# assert result["ts_col"][2] == datetime.datetime( -# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc -# ) - - -# def test_load_table_from_dataframe_w_struct_datatype(bigquery_client, dataset_id): -# """Test that a DataFrame with struct datatype can be uploaded if a -# BigQuery schema is specified. 
- -# https://github.com/googleapis/python-bigquery/issues/21 -# """ -# table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( -# bigquery_client.project, dataset_id -# ) -# table_schema = [ -# bigquery.SchemaField( -# "bar", -# "RECORD", -# fields=[ -# bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), -# bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), -# ], -# mode="REQUIRED", -# ), -# ] -# table = helpers.retry_403(bigquery_client.create_table)( -# bigquery.Table(table_id, schema=table_schema) -# ) - -# df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] -# dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) - -# load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) -# load_job.result() - -# table = bigquery_client.get_table(table_id) -# assert table.schema == table_schema -# assert table.num_rows == 3 - - -# def test_load_table_from_dataframe_w_explicit_schema_source_format_csv( -# bigquery_client, dataset_id -# ): -# from google.cloud.bigquery.job import SourceFormat - -# table_schema = ( -# bigquery.SchemaField("bool_col", "BOOLEAN"), -# bigquery.SchemaField("bytes_col", "BYTES"), -# bigquery.SchemaField("date_col", "DATE"), -# bigquery.SchemaField("dt_col", "DATETIME"), -# bigquery.SchemaField("float_col", "FLOAT"), -# bigquery.SchemaField("geo_col", "GEOGRAPHY"), -# bigquery.SchemaField("int_col", "INTEGER"), -# bigquery.SchemaField("num_col", "NUMERIC"), -# bigquery.SchemaField("bignum_col", "BIGNUMERIC"), -# bigquery.SchemaField("str_col", "STRING"), -# bigquery.SchemaField("time_col", "TIME"), -# bigquery.SchemaField("ts_col", "TIMESTAMP"), -# ) -# df_data = collections.OrderedDict( -# [ -# ("bool_col", [True, None, False]), -# ("bytes_col", ["abc", None, "def"]), -# ( -# "date_col", -# [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], -# ), -# ( -# "dt_col", -# [ -# datetime.datetime(1, 1, 1, 0, 0, 0), -# None, -# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), -# ], -# ), -# ("float_col", [float("-inf"), float("nan"), float("inf")]), -# ( -# "geo_col", -# [ -# "POINT(30 10)", -# None, -# "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", -# ], -# ), -# ("int_col", [-9223372036854775808, None, 9223372036854775807]), -# ( -# "num_col", -# [ -# decimal.Decimal("-99999999999999999999999999999.999999999"), -# None, -# decimal.Decimal("99999999999999999999999999999.999999999"), -# ], -# ), -# ( -# "bignum_col", -# [ -# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), -# None, -# decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), -# ], -# ), -# ("str_col", ["abc", None, "def"]), -# ( -# "time_col", -# [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], -# ), -# ( -# "ts_col", -# [ -# datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), -# None, -# datetime.datetime( -# 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc -# ), -# ], -# ), -# ] -# ) -# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - -# table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format( -# bigquery_client.project, dataset_id -# ) - -# job_config = bigquery.LoadJobConfig( -# schema=table_schema, source_format=SourceFormat.CSV -# ) -# load_job = bigquery_client.load_table_from_dataframe( -# dataframe, table_id, job_config=job_config -# ) -# load_job.result() - -# table = bigquery_client.get_table(table_id) -# assert tuple(table.schema) == table_schema -# assert table.num_rows == 3 - - -# def 
test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats( -# bigquery_client, dataset_id, table_id -# ): -# from google.cloud.bigquery.job import SourceFormat - -# table_schema = (bigquery.SchemaField("float_col", "FLOAT"),) -# df_data = collections.OrderedDict( -# [ -# ( -# "float_col", -# [ -# 0.14285714285714285, -# 0.51428571485748, -# 0.87128748, -# 1.807960649, -# 2.0679610649, -# 2.4406779661016949, -# 3.7148514257, -# 3.8571428571428572, -# 1.51251252e40, -# ], -# ), -# ] -# ) -# dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) - -# job_config = bigquery.LoadJobConfig( -# schema=table_schema, source_format=SourceFormat.CSV -# ) -# load_job = bigquery_client.load_table_from_dataframe( -# dataframe, table_id, job_config=job_config -# ) -# load_job.result() - -# table = bigquery_client.get_table(table_id) -# rows = bigquery_client.list_rows(table_id) -# floats = [r.values()[0] for r in rows] -# assert tuple(table.schema) == table_schema -# assert table.num_rows == 9 -# assert floats == df_data["float_col"] - - -# def test_query_results_to_dataframe(bigquery_client): -# QUERY = """ -# SELECT id, author, time_ts, dead -# FROM `bigquery-public-data.hacker_news.comments` -# LIMIT 10 -# """ - -# df = bigquery_client.query(QUERY).result().to_dataframe() - -# assert isinstance(df, pandas.DataFrame) -# assert len(df) == 10 # verify the number of rows -# column_names = ["id", "author", "time_ts", "dead"] -# assert list(df) == column_names # verify the column names -# exp_datatypes = { -# "id": int, -# "author": str, -# "time_ts": pandas.Timestamp, -# "dead": bool, -# } -# for _, row in df.iterrows(): -# for col in column_names: -# # all the schema fields are nullable, so None is acceptable -# if not pandas.isna(row[col]): -# assert isinstance(row[col], exp_datatypes[col]) - - -# def test_query_results_to_dataframe_w_bqstorage(bigquery_client): -# query = """ -# SELECT id, author, time_ts, dead -# FROM `bigquery-public-data.hacker_news.comments` -# LIMIT 10 -# """ - -# bqstorage_client = bigquery_storage.BigQueryReadClient( -# credentials=bigquery_client._credentials -# ) - -# df = bigquery_client.query(query).result().to_dataframe(bqstorage_client) - -# assert isinstance(df, pandas.DataFrame) -# assert len(df) == 10 # verify the number of rows -# column_names = ["id", "author", "time_ts", "dead"] -# assert list(df) == column_names -# exp_datatypes = { -# "id": int, -# "author": str, -# "time_ts": pandas.Timestamp, -# "dead": bool, -# } -# for index, row in df.iterrows(): -# for col in column_names: -# # all the schema fields are nullable, so None is acceptable -# if not pandas.isna(row[col]): -# assert isinstance(row[col], exp_datatypes[col]) - - -# def test_insert_rows_from_dataframe(bigquery_client, dataset_id): -# SF = bigquery.SchemaField -# schema = [ -# SF("float_col", "FLOAT", mode="REQUIRED"), -# SF("int_col", "INTEGER", mode="REQUIRED"), -# SF("bool_col", "BOOLEAN", mode="REQUIRED"), -# SF("string_col", "STRING", mode="NULLABLE"), -# SF("date_col", "DATE", mode="NULLABLE"), -# SF("time_col", "TIME", mode="NULLABLE"), -# ] - -# dataframe = pandas.DataFrame( -# [ -# { -# "float_col": 1.11, -# "bool_col": True, -# "string_col": "my string", -# "int_col": 10, -# "date_col": datetime.date(2021, 1, 1), -# "time_col": datetime.time(21, 1, 1), -# }, -# { -# "float_col": 2.22, -# "bool_col": False, -# "string_col": "another string", -# "int_col": 20, -# "date_col": datetime.date(2021, 1, 2), -# "time_col": datetime.time(21, 1, 2), -# }, -# { -# 
"float_col": 3.33, -# "bool_col": False, -# "string_col": "another string", -# "int_col": 30, -# "date_col": datetime.date(2021, 1, 3), -# "time_col": datetime.time(21, 1, 3), -# }, -# { -# "float_col": 4.44, -# "bool_col": True, -# "string_col": "another string", -# "int_col": 40, -# "date_col": datetime.date(2021, 1, 4), -# "time_col": datetime.time(21, 1, 4), -# }, -# { -# "float_col": 5.55, -# "bool_col": False, -# "string_col": "another string", -# "int_col": 50, -# "date_col": datetime.date(2021, 1, 5), -# "time_col": datetime.time(21, 1, 5), -# }, -# { -# "float_col": 6.66, -# "bool_col": True, -# # Include a NaN value, because pandas often uses NaN as a -# # NULL value indicator. -# "string_col": float("NaN"), -# "int_col": 60, -# "date_col": datetime.date(2021, 1, 6), -# "time_col": datetime.time(21, 1, 6), -# }, -# ] -# ) -# dataframe["date_col"] = dataframe["date_col"].astype("dbdate") -# dataframe["time_col"] = dataframe["time_col"].astype("dbtime") - -# table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe" -# table_arg = bigquery.Table(table_id, schema=schema) -# table = helpers.retry_403(bigquery_client.create_table)(table_arg) - -# chunk_errors = bigquery_client.insert_rows_from_dataframe( -# table, dataframe, chunk_size=3 -# ) -# for errors in chunk_errors: -# assert not errors -# expected = [ -# # Pandas often represents NULL values as NaN. Convert to None for -# # easier comparison. -# tuple(None if col != col else col for col in data_row) -# for data_row in dataframe.itertuples(index=False) -# ] - -# # Use query to fetch rows instead of listing directly from the table so -# # that we get values from the streaming buffer "within a few seconds". -# # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability -# @google.api_core.retry.Retry( -# predicate=google.api_core.retry.if_exception_type(MissingDataError) -# ) -# def get_rows(): -# rows = list( -# bigquery_client.query( -# "SELECT * FROM `{}.{}.{}`".format( -# table.project, table.dataset_id, table.table_id -# ) -# ) -# ) -# if len(rows) != len(expected): -# raise MissingDataError() -# return rows - -# rows = get_rows() -# sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) -# row_tuples = [r.values() for r in sorted_rows] - -# for row, expected_row in zip(row_tuples, expected): -# assert ( -# # Use Counter to verify the same number of values in each, because -# # column order does not matter. 
-# collections.Counter(row) -# == collections.Counter(expected_row) -# ) - - -# def test_nested_table_to_dataframe(bigquery_client, dataset_id): -# from google.cloud.bigquery.job import SourceFormat -# from google.cloud.bigquery.job import WriteDisposition - -# SF = bigquery.SchemaField -# schema = [ -# SF("string_col", "STRING", mode="NULLABLE"), -# SF( -# "record_col", -# "RECORD", -# mode="NULLABLE", -# fields=[ -# SF("nested_string", "STRING", mode="NULLABLE"), -# SF("nested_repeated", "INTEGER", mode="REPEATED"), -# SF( -# "nested_record", -# "RECORD", -# mode="NULLABLE", -# fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], -# ), -# ], -# ), -# SF("bigfloat_col", "FLOAT", mode="NULLABLE"), -# SF("smallfloat_col", "FLOAT", mode="NULLABLE"), -# ] -# record = { -# "nested_string": "another string value", -# "nested_repeated": [0, 1, 2], -# "nested_record": {"nested_nested_string": "some deep insight"}, -# } -# to_insert = [ -# { -# "string_col": "Some value", -# "record_col": record, -# "bigfloat_col": 3.14, -# "smallfloat_col": 2.72, -# } -# ] -# rows = [json.dumps(row) for row in to_insert] -# body = io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) -# table_id = f"{bigquery_client.project}.{dataset_id}.test_nested_table_to_dataframe" -# job_config = bigquery.LoadJobConfig() -# job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE -# job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON -# job_config.schema = schema -# # Load a table using a local JSON file from memory. -# bigquery_client.load_table_from_file(body, table_id, job_config=job_config).result() - -# df = bigquery_client.list_rows(table_id, selected_fields=schema).to_dataframe( -# dtypes={"smallfloat_col": "float16"} -# ) - -# assert isinstance(df, pandas.DataFrame) -# assert len(df) == 1 # verify the number of rows -# exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] -# assert list(df) == exp_columns # verify the column names -# row = df.iloc[0] -# # verify the row content -# assert row["string_col"] == "Some value" -# expected_keys = tuple(sorted(record.keys())) -# row_keys = tuple(sorted(row["record_col"].keys())) -# assert row_keys == expected_keys -# # Can't compare numpy arrays, which pyarrow encodes the embedded -# # repeated column to, so convert to list. -# assert list(row["record_col"]["nested_repeated"]) == [0, 1, 2] -# # verify that nested data can be accessed with indices/keys -# assert row["record_col"]["nested_repeated"][0] == 0 -# assert ( -# row["record_col"]["nested_record"]["nested_nested_string"] -# == "some deep insight" -# ) -# # verify dtypes -# assert df.dtypes["bigfloat_col"].name == "float64" -# assert df.dtypes["smallfloat_col"].name == "float16" - - -# def test_list_rows_max_results_w_bqstorage(bigquery_client): -# table_ref = bigquery.DatasetReference("bigquery-public-data", "utility_us").table( -# "country_code_iso" -# ) -# bqstorage_client = bigquery_storage.BigQueryReadClient( -# credentials=bigquery_client._credentials -# ) - -# row_iterator = bigquery_client.list_rows( -# table_ref, -# selected_fields=[bigquery.SchemaField("country_name", "STRING")], -# max_results=100, -# ) -# with pytest.warns( -# UserWarning, match="Cannot use bqstorage_client if max_results is set" -# ): -# dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) - -# assert len(dataframe.index) == 100 - - -# @pytest.mark.parametrize( -# ("max_results",), -# ( -# (None,), -# (10,), -# ), # Use BQ Storage API. # Use REST API. 
-# ) -# def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): -# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the -# # BigQuery Storage API and pyarrow. -# schema = [ -# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), -# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), -# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), -# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), -# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), -# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), -# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), -# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), -# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), -# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), -# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), -# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), -# ] - -# df = bigquery_client.list_rows( -# scalars_table, -# max_results=max_results, -# selected_fields=schema, -# ).to_dataframe() - -# assert df.dtypes["bool_col"].name == "boolean" -# assert df.dtypes["datetime_col"].name == "datetime64[ns]" -# assert df.dtypes["float64_col"].name == "float64" -# assert df.dtypes["int64_col"].name == "Int64" -# assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" -# assert df.dtypes["date_col"].name == "dbdate" -# assert df.dtypes["time_col"].name == "dbtime" - -# # decimal.Decimal is used to avoid loss of precision. -# assert df.dtypes["bignumeric_col"].name == "object" -# assert df.dtypes["numeric_col"].name == "object" - -# # pandas uses Python string and bytes objects. -# assert df.dtypes["bytes_col"].name == "object" -# assert df.dtypes["string_col"].name == "object" - - -# @pytest.mark.parametrize( -# ("max_results",), -# ( -# (None,), -# (10,), -# ), # Use BQ Storage API. # Use REST API. -# ) -# def test_list_rows_nullable_scalars_extreme_dtypes( -# bigquery_client, scalars_extreme_table, max_results -# ): -# # TODO(GH#836): Avoid INTERVAL columns until they are supported by the -# # BigQuery Storage API and pyarrow. -# schema = [ -# bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), -# bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), -# bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), -# bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), -# bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), -# bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), -# bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), -# bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), -# bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), -# bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), -# bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), -# bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), -# ] - -# df = bigquery_client.list_rows( -# scalars_extreme_table, -# max_results=max_results, -# selected_fields=schema, -# ).to_dataframe() - -# # Extreme values are out-of-bounds for pandas datetime64 values, which use -# # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must -# # be represented with object. 
-# # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations -# assert df.dtypes["date_col"].name == "object" -# assert df.dtypes["datetime_col"].name == "object" -# assert df.dtypes["timestamp_col"].name == "object" - -# # These pandas dtypes can handle the same ranges as BigQuery. -# assert df.dtypes["bool_col"].name == "boolean" -# assert df.dtypes["float64_col"].name == "float64" -# assert df.dtypes["int64_col"].name == "Int64" -# assert df.dtypes["time_col"].name == "dbtime" - -# # decimal.Decimal is used to avoid loss of precision. -# assert df.dtypes["numeric_col"].name == "object" -# assert df.dtypes["bignumeric_col"].name == "object" - -# # pandas uses Python string and bytes objects. -# assert df.dtypes["bytes_col"].name == "object" -# assert df.dtypes["string_col"].name == "object" - - -# def test_upload_time_and_datetime_56(bigquery_client, dataset_id): -# df = pandas.DataFrame( -# dict( -# dt=[ -# datetime.datetime(2020, 1, 8, 8, 0, 0), -# datetime.datetime( -# 2020, -# 1, -# 8, -# 8, -# 0, -# 0, -# tzinfo=datetime.timezone(datetime.timedelta(hours=-7)), -# ), -# ], -# t=[datetime.time(0, 0, 10, 100001), None], -# ) -# ) -# table = f"{dataset_id}.test_upload_time_and_datetime" -# bigquery_client.load_table_from_dataframe(df, table).result() -# data = list(map(list, bigquery_client.list_rows(table))) -# assert data == [ -# [ -# datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc), -# datetime.time(0, 0, 10, 100001), -# ], -# [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None], -# ] - -# from google.cloud.bigquery import job, schema - -# table = f"{dataset_id}.test_upload_time_and_datetime_dt" -# config = job.LoadJobConfig( -# schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")] -# ) - -# bigquery_client.load_table_from_dataframe(df, table, job_config=config).result() -# data = list(map(list, bigquery_client.list_rows(table))) -# assert data == [ -# [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)], -# [datetime.datetime(2020, 1, 8, 15, 0), None], -# ] - - -# def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): -# wkt = pytest.importorskip("shapely.wkt") -# bigquery_client.query( -# f"create table {dataset_id}.lake (name string, geog geography)" -# ).result() -# bigquery_client.query( -# f""" -# insert into {dataset_id}.lake (name, geog) values -# ('foo', st_geogfromtext('point(0 0)')), -# ('bar', st_geogfromtext('point(0 1)')), -# ('baz', null) -# """ -# ).result() -# df = bigquery_client.query( -# f"select * from {dataset_id}.lake order by name" -# ).to_dataframe(geography_as_object=True) -# assert list(df["name"]) == ["bar", "baz", "foo"] -# assert df["geog"][0] == wkt.loads("point(0 1)") -# assert pandas.isna(df["geog"][1]) -# assert df["geog"][2] == wkt.loads("point(0 0)") - - -# def test_to_geodataframe(bigquery_client, dataset_id): -# geopandas = pytest.importorskip("geopandas") -# from shapely import wkt - -# bigquery_client.query( -# f"create table {dataset_id}.geolake (name string, geog geography)" -# ).result() -# bigquery_client.query( -# f""" -# insert into {dataset_id}.geolake (name, geog) values -# ('foo', st_geogfromtext('point(0 0)')), -# ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), -# ('baz', null) -# """ -# ).result() -# df = bigquery_client.query( -# f"select * from {dataset_id}.geolake order by name" -# ).to_geodataframe() -# assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") -# assert 
pandas.isna(df["geog"][1]) -# assert df["geog"][2] == wkt.loads("point(0 0)") -# assert isinstance(df, geopandas.GeoDataFrame) -# assert isinstance(df["geog"], geopandas.GeoSeries) - -# with warnings.catch_warnings(): -# # Computing the area on a GeoDataFrame that uses a geographic Coordinate -# # Reference System (CRS) produces a warning that we are not interested in. -# # We do not mind if the computed area is incorrect with respect to the -# # GeoDataFrame data, as long as it matches the expected "incorrect" value. -# warnings.filterwarnings("ignore", category=UserWarning) -# assert df.area[0] == 0.5 -# assert pandas.isna(df.area[1]) -# assert df.area[2] == 0.0 - -# assert df.crs.srs == "EPSG:4326" -# assert df.crs.name == "WGS 84" -# assert df.geog.crs.srs == "EPSG:4326" -# assert df.geog.crs.name == "WGS 84" - - -# def test_load_geodataframe(bigquery_client, dataset_id): -# geopandas = pytest.importorskip("geopandas") -# import pandas -# from shapely import wkt -# from google.cloud.bigquery.schema import SchemaField - -# df = geopandas.GeoDataFrame( -# pandas.DataFrame( -# dict( -# name=["foo", "bar"], -# geo1=[None, None], -# geo2=[None, wkt.loads("Point(1 1)")], -# ) -# ), -# geometry="geo1", -# ) - -# table_id = f"{dataset_id}.lake_from_gp" -# bigquery_client.load_table_from_dataframe(df, table_id).result() - -# table = bigquery_client.get_table(table_id) -# assert table.schema == [ -# SchemaField("name", "STRING", "NULLABLE"), -# SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), -# SchemaField("geo2", "GEOGRAPHY", "NULLABLE"), -# ] -# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ -# ["bar", None, "POINT(1 1)"], -# ["foo", None, None], -# ] - - -# def test_load_dataframe_w_shapely(bigquery_client, dataset_id): -# wkt = pytest.importorskip("shapely.wkt") -# from google.cloud.bigquery.schema import SchemaField - -# df = pandas.DataFrame( -# dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) -# ) - -# table_id = f"{dataset_id}.lake_from_shapes" -# bigquery_client.load_table_from_dataframe(df, table_id).result() - -# table = bigquery_client.get_table(table_id) -# assert table.schema == [ -# SchemaField("name", "STRING", "NULLABLE"), -# SchemaField("geo", "GEOGRAPHY", "NULLABLE"), -# ] -# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ -# ["bar", "POINT(1 1)"], -# ["foo", None], -# ] - -# bigquery_client.load_table_from_dataframe(df, table_id).result() -# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ -# ["bar", "POINT(1 1)"], -# ["bar", "POINT(1 1)"], -# ["foo", None], -# ["foo", None], -# ] - - -# def test_load_dataframe_w_wkb(bigquery_client, dataset_id): -# wkt = pytest.importorskip("shapely.wkt") -# from shapely import wkb -# from google.cloud.bigquery.schema import SchemaField - -# df = pandas.DataFrame( -# dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) -# ) - -# table_id = f"{dataset_id}.lake_from_wkb" -# # We create the table first, to inform the interpretation of the wkb data -# bigquery_client.query( -# f"create table {table_id} (name string, geo GEOGRAPHY)" -# ).result() -# bigquery_client.load_table_from_dataframe(df, table_id).result() - -# table = bigquery_client.get_table(table_id) -# assert table.schema == [ -# SchemaField("name", "STRING", "NULLABLE"), -# SchemaField("geo", "GEOGRAPHY", "NULLABLE"), -# ] -# assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ -# ["bar", "POINT(1 1)"], -# ["foo", None], -# ] +# Copyright 2021 Google LLC +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""System tests for pandas connector.""" + +import collections +import datetime +import decimal +import json +import io +import operator +import warnings + +import google.api_core.retry +import pkg_resources +import pytest + +from google.cloud import bigquery +from google.cloud import bigquery_storage +from google.cloud.bigquery import enums + +from . import helpers + + +pandas = pytest.importorskip("pandas", minversion="0.23.0") +numpy = pytest.importorskip("numpy") + + +PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version +PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") + + +class MissingDataError(Exception): + pass + + +def test_load_table_from_dataframe_w_automatic_schema(bigquery_client, dataset_id): + """Test that a DataFrame with dtypes that map well to BigQuery types + can be uploaded without specifying a schema. + + https://github.com/googleapis/google-cloud-python/issues/9044 + """ + df_data = collections.OrderedDict( + [ + ("bool_col", pandas.Series([True, False, True], dtype="bool")), + ( + "ts_col", + pandas.Series( + [ + datetime.datetime(2010, 1, 2, 3, 44, 50), + datetime.datetime(2011, 2, 3, 14, 50, 59), + datetime.datetime(2012, 3, 14, 15, 16), + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + ), + ( + "dt_col_no_tz", + pandas.Series( + [ + datetime.datetime(2010, 1, 2, 3, 44, 50), + datetime.datetime(2011, 2, 3, 14, 50, 59), + datetime.datetime(2012, 3, 14, 15, 16), + ], + dtype="datetime64[ns]", + ), + ), + ("float32_col", pandas.Series([1.0, 2.0, 3.0], dtype="float32")), + ("float64_col", pandas.Series([4.0, 5.0, 6.0], dtype="float64")), + ("int8_col", pandas.Series([-12, -11, -10], dtype="int8")), + ("int16_col", pandas.Series([-9, -8, -7], dtype="int16")), + ("int32_col", pandas.Series([-6, -5, -4], dtype="int32")), + ("int64_col", pandas.Series([-3, -2, -1], dtype="int64")), + ("uint8_col", pandas.Series([0, 1, 2], dtype="uint8")), + ("uint16_col", pandas.Series([3, 4, 5], dtype="uint16")), + ("uint32_col", pandas.Series([6, 7, 8], dtype="uint32")), + ( + "date_col", + pandas.Series( + [ + datetime.date(2010, 1, 2), + datetime.date(2011, 2, 3), + datetime.date(2012, 3, 14), + ], + dtype="dbdate", + ), + ), + ( + "time_col", + pandas.Series( + [ + datetime.time(3, 44, 50), + datetime.time(14, 50, 59), + datetime.time(15, 16), + ], + dtype="dbtime", + ), + ), + ("array_bool_col", pandas.Series([[True], [False], [True]])), + ( + "array_ts_col", + pandas.Series( + [ + [ + datetime.datetime( + 2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc + ), + ], + [ + datetime.datetime( + 2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc + ), + ], + [ + datetime.datetime( + 2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc + ), + ], + ], + ), + ), + ( + "array_dt_col_no_tz", + pandas.Series( + [ + [datetime.datetime(2010, 1, 2, 3, 44, 50)], + [datetime.datetime(2011, 2, 3, 14, 50, 59)], + [datetime.datetime(2012, 3, 14, 15, 16)], + ], + ), + ), + ( + "array_float32_col", + pandas.Series( 
+ [numpy.array([_], dtype="float32") for _ in [1.0, 2.0, 3.0]] + ), + ), + ( + "array_float64_col", + pandas.Series( + [numpy.array([_], dtype="float64") for _ in [4.0, 5.0, 6.0]] + ), + ), + ( + "array_int8_col", + pandas.Series( + [numpy.array([_], dtype="int8") for _ in [-12, -11, -10]] + ), + ), + ( + "array_int16_col", + pandas.Series([numpy.array([_], dtype="int16") for _ in [-9, -8, -7]]), + ), + ( + "array_int32_col", + pandas.Series([numpy.array([_], dtype="int32") for _ in [-6, -5, -4]]), + ), + ( + "array_int64_col", + pandas.Series([numpy.array([_], dtype="int64") for _ in [-3, -2, -1]]), + ), + ( + "array_uint8_col", + pandas.Series([numpy.array([_], dtype="uint8") for _ in [0, 1, 2]]), + ), + ( + "array_uint16_col", + pandas.Series([numpy.array([_], dtype="uint16") for _ in [3, 4, 5]]), + ), + ( + "array_uint32_col", + pandas.Series([numpy.array([_], dtype="uint32") for _ in [6, 7, 8]]), + ), + ] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_automatic_schema".format( + bigquery_client.project, dataset_id + ) + + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + bigquery.SchemaField("dt_col_no_tz", "DATETIME"), + bigquery.SchemaField("float32_col", "FLOAT"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("int8_col", "INTEGER"), + bigquery.SchemaField("int16_col", "INTEGER"), + bigquery.SchemaField("int32_col", "INTEGER"), + bigquery.SchemaField("int64_col", "INTEGER"), + bigquery.SchemaField("uint8_col", "INTEGER"), + bigquery.SchemaField("uint16_col", "INTEGER"), + bigquery.SchemaField("uint32_col", "INTEGER"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("array_bool_col", "BOOLEAN", mode="REPEATED"), + bigquery.SchemaField("array_ts_col", "TIMESTAMP", mode="REPEATED"), + bigquery.SchemaField("array_dt_col_no_tz", "DATETIME", mode="REPEATED"), + bigquery.SchemaField("array_float32_col", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("array_float64_col", "FLOAT", mode="REPEATED"), + bigquery.SchemaField("array_int8_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int16_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int32_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_int64_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint8_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint16_col", "INTEGER", mode="REPEATED"), + bigquery.SchemaField("array_uint32_col", "INTEGER", mode="REPEATED"), + ) + + assert numpy.array( + sorted(map(list, bigquery_client.list_rows(table)), key=lambda r: r[5]), + dtype="object", + ).transpose().tolist() == [ + # bool_col + [True, False, True], + # ts_col + [ + datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc), + datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc), + datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc), + ], + # dt_col_no_tz + [ + datetime.datetime(2010, 1, 2, 3, 44, 50), + datetime.datetime(2011, 2, 3, 14, 50, 59), + datetime.datetime(2012, 3, 14, 15, 16), + ], + # float32_col + [1.0, 2.0, 3.0], + # float64_col + [4.0, 5.0, 6.0], + # int8_col + [-12, -11, -10], + # int16_col + [-9, -8, -7], + # int32_col + [-6, -5, 
-4], + # int64_col + [-3, -2, -1], + # uint8_col + [0, 1, 2], + # uint16_col + [3, 4, 5], + # uint32_col + [6, 7, 8], + # date_col + [ + datetime.date(2010, 1, 2), + datetime.date(2011, 2, 3), + datetime.date(2012, 3, 14), + ], + # time_col + [datetime.time(3, 44, 50), datetime.time(14, 50, 59), datetime.time(15, 16)], + # array_bool_col + [[True], [False], [True]], + # array_ts_col + [ + [datetime.datetime(2010, 1, 2, 3, 44, 50, tzinfo=datetime.timezone.utc)], + [datetime.datetime(2011, 2, 3, 14, 50, 59, tzinfo=datetime.timezone.utc)], + [datetime.datetime(2012, 3, 14, 15, 16, tzinfo=datetime.timezone.utc)], + ], + # array_dt_col + [ + [datetime.datetime(2010, 1, 2, 3, 44, 50)], + [datetime.datetime(2011, 2, 3, 14, 50, 59)], + [datetime.datetime(2012, 3, 14, 15, 16)], + ], + # array_float32_col + [[1.0], [2.0], [3.0]], + # array_float64_col + [[4.0], [5.0], [6.0]], + # array_int8_col + [[-12], [-11], [-10]], + # array_int16_col + [[-9], [-8], [-7]], + # array_int32_col + [[-6], [-5], [-4]], + # array_int64_col + [[-3], [-2], [-1]], + # array_uint8_col + [[0], [1], [2]], + # array_uint16_col + [[3], [4], [5]], + # array_uint32_col + [[6], [7], [8]], + ] + + +@pytest.mark.skipif( + PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, + reason="Only `pandas version >=1.0.0` is supported", +) +def test_load_table_from_dataframe_w_nullable_int64_datatype( + bigquery_client, dataset_id +): + """Test that a DataFrame containing column with None-type values and int64 datatype + can be uploaded if a BigQuery schema is specified. + + https://github.com/googleapis/python-bigquery/issues/22 + """ + table_id = "{}.{}.load_table_from_dataframe_w_nullable_int64_datatype".format( + bigquery_client.project, dataset_id + ) + table_schema = (bigquery.SchemaField("x", "INTEGER", mode="NULLABLE"),) + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + df_data = collections.OrderedDict( + [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) + assert table.num_rows == 4 + + +@pytest.mark.skipif( + PANDAS_INSTALLED_VERSION < PANDAS_INT64_VERSION, + reason="Only `pandas version >=1.0.0` is supported", +) +def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema( + bigquery_client, dataset_id, table_id +): + """Test that a DataFrame containing column with None-type values and int64 datatype + can be uploaded without specifying a schema. + + https://github.com/googleapis/python-bigquery/issues/22 + """ + + df_data = collections.OrderedDict( + [("x", pandas.Series([1, 2, None, 4], dtype="Int64"))] + ) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == (bigquery.SchemaField("x", "INTEGER"),) + assert table.num_rows == 4 + + +def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): + """Test that a DataFrame with null columns can be uploaded if a + BigQuery schema is specified. + + See: https://github.com/googleapis/google-cloud-python/issues/7370 + """ + # Schema with all scalar types. 
+ table_schema = ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + + num_rows = 100 + nulls = [None] * num_rows + df_data = [ + ("bool_col", nulls), + ("bytes_col", nulls), + ("date_col", nulls), + ("dt_col", nulls), + ("float_col", nulls), + ("geo_col", nulls), + ("int_col", nulls), + ("num_col", nulls), + ("bignum_col", nulls), + ("str_col", nulls), + ("time_col", nulls), + ("ts_col", nulls), + ] + df_data = collections.OrderedDict(df_data) + dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( + bigquery_client.project, dataset_id + ) + + # Create the table before loading so that schema mismatch errors are + # identified. + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table) + assert tuple(table.schema) == table_schema + assert table.num_rows == num_rows + + +def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id): + """Test that a DataFrame with required columns can be uploaded if a + BigQuery schema is specified. + + See: https://github.com/googleapis/google-cloud-python/issues/8093 + """ + table_schema = ( + bigquery.SchemaField("name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + ) + + records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}] + dataframe = pandas.DataFrame(records, columns=["name", "age"]) + job_config = bigquery.LoadJobConfig(schema=table_schema) + table_id = "{}.{}.load_table_from_dataframe_w_required".format( + bigquery_client.project, dataset_id + ) + + # Create the table before loading so that schema mismatch errors are + # identified. + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table) + assert tuple(table.schema) == table_schema + assert table.num_rows == 2 + + +def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id): + # Schema with all scalar types. 
+ # See: + # https://github.com/googleapis/python-bigquery/issues/61 + # https://issuetracker.google.com/issues/151765076 + table_schema = ( + bigquery.SchemaField("row_num", "INTEGER"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + + df_data = [ + ("row_num", [1, 2, 3]), + ("bool_col", [True, None, False]), + ("bytes_col", [b"abc", None, b"def"]), + ("date_col", [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)]), + ( + "dt_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0), + None, + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ), + ("float_col", [float("-inf"), float("nan"), float("inf")]), + ( + "geo_col", + ["POINT(30 10)", None, "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], + ), + ("int_col", [-9223372036854775808, None, 9223372036854775807]), + ( + "num_col", + [ + decimal.Decimal("-99999999999999999999999999999.999999999"), + None, + decimal.Decimal("99999999999999999999999999999.999999999"), + ], + ), + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ), + ("str_col", ["abc", None, "def"]), + ( + "time_col", + [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], + ), + ( + "ts_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + None, + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ), + ] + df_data = collections.OrderedDict(df_data) + dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( + bigquery_client.project, dataset_id + ) + + job_config = bigquery.LoadJobConfig(schema=table_schema) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == table_schema + assert table.num_rows == 3 + + result = bigquery_client.list_rows(table).to_dataframe() + result.sort_values("row_num", inplace=True) + + # Check that extreme DATE/DATETIME values are loaded correctly. + # https://github.com/googleapis/python-bigquery/issues/1076 + assert result["date_col"][0] == datetime.date(1, 1, 1) + assert result["date_col"][2] == datetime.date(9999, 12, 31) + assert result["dt_col"][0] == datetime.datetime(1, 1, 1, 0, 0, 0) + assert result["dt_col"][2] == datetime.datetime(9999, 12, 31, 23, 59, 59, 999999) + assert result["ts_col"][0] == datetime.datetime( + 1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc + ) + assert result["ts_col"][2] == datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ) + + +def test_load_table_from_dataframe_w_struct_datatype(bigquery_client, dataset_id): + """Test that a DataFrame with struct datatype can be uploaded if a + BigQuery schema is specified. 
+ + https://github.com/googleapis/python-bigquery/issues/21 + """ + table_id = "{}.{}.load_table_from_dataframe_w_struct_datatype".format( + bigquery_client.project, dataset_id + ) + table_schema = [ + bigquery.SchemaField( + "bar", + "RECORD", + fields=[ + bigquery.SchemaField("id", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"), + ], + mode="REQUIRED", + ), + ] + table = helpers.retry_403(bigquery_client.create_table)( + bigquery.Table(table_id, schema=table_schema) + ) + + df_data = [{"id": 1, "age": 21}, {"id": 2, "age": 22}, {"id": 2, "age": 23}] + dataframe = pandas.DataFrame(data={"bar": df_data}, columns=["bar"]) + + load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert table.schema == table_schema + assert table.num_rows == 3 + + +def test_load_table_from_dataframe_w_explicit_schema_source_format_csv( + bigquery_client, dataset_id +): + from google.cloud.bigquery.job import SourceFormat + + table_schema = ( + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("dt_col", "DATETIME"), + bigquery.SchemaField("float_col", "FLOAT"), + bigquery.SchemaField("geo_col", "GEOGRAPHY"), + bigquery.SchemaField("int_col", "INTEGER"), + bigquery.SchemaField("num_col", "NUMERIC"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), + bigquery.SchemaField("str_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("ts_col", "TIMESTAMP"), + ) + df_data = collections.OrderedDict( + [ + ("bool_col", [True, None, False]), + ("bytes_col", ["abc", None, "def"]), + ( + "date_col", + [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], + ), + ( + "dt_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0), + None, + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + ), + ("float_col", [float("-inf"), float("nan"), float("inf")]), + ( + "geo_col", + [ + "POINT(30 10)", + None, + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", + ], + ), + ("int_col", [-9223372036854775808, None, 9223372036854775807]), + ( + "num_col", + [ + decimal.Decimal("-99999999999999999999999999999.999999999"), + None, + decimal.Decimal("99999999999999999999999999999.999999999"), + ], + ), + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ), + ("str_col", ["abc", None, "def"]), + ( + "time_col", + [datetime.time(0, 0, 0), None, datetime.time(23, 59, 59, 999999)], + ), + ( + "ts_col", + [ + datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc), + None, + datetime.datetime( + 9999, 12, 31, 23, 59, 59, 999999, tzinfo=datetime.timezone.utc + ), + ], + ), + ] + ) + dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + + table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema_csv".format( + bigquery_client.project, dataset_id + ) + + job_config = bigquery.LoadJobConfig( + schema=table_schema, source_format=SourceFormat.CSV + ) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + assert tuple(table.schema) == table_schema + assert table.num_rows == 3 + + +def test_load_table_from_dataframe_w_explicit_schema_source_format_csv_floats( + bigquery_client, dataset_id, table_id +): + from 
google.cloud.bigquery.job import SourceFormat + + table_schema = (bigquery.SchemaField("float_col", "FLOAT"),) + df_data = collections.OrderedDict( + [ + ( + "float_col", + [ + 0.14285714285714285, + 0.51428571485748, + 0.87128748, + 1.807960649, + 2.0679610649, + 2.4406779661016949, + 3.7148514257, + 3.8571428571428572, + 1.51251252e40, + ], + ), + ] + ) + dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) + + job_config = bigquery.LoadJobConfig( + schema=table_schema, source_format=SourceFormat.CSV + ) + load_job = bigquery_client.load_table_from_dataframe( + dataframe, table_id, job_config=job_config + ) + load_job.result() + + table = bigquery_client.get_table(table_id) + rows = bigquery_client.list_rows(table_id) + floats = [r.values()[0] for r in rows] + assert tuple(table.schema) == table_schema + assert table.num_rows == 9 + assert floats == df_data["float_col"] + + +def test_query_results_to_dataframe(bigquery_client): + QUERY = """ + SELECT id, author, time_ts, dead + FROM `bigquery-public-data.hacker_news.comments` + LIMIT 10 + """ + + df = bigquery_client.query(QUERY).result().to_dataframe() + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 10 # verify the number of rows + column_names = ["id", "author", "time_ts", "dead"] + assert list(df) == column_names # verify the column names + exp_datatypes = { + "id": int, + "author": str, + "time_ts": pandas.Timestamp, + "dead": bool, + } + for _, row in df.iterrows(): + for col in column_names: + # all the schema fields are nullable, so None is acceptable + if not pandas.isna(row[col]): + assert isinstance(row[col], exp_datatypes[col]) + + +def test_query_results_to_dataframe_w_bqstorage(bigquery_client): + query = """ + SELECT id, author, time_ts, dead + FROM `bigquery-public-data.hacker_news.comments` + LIMIT 10 + """ + + bqstorage_client = bigquery_storage.BigQueryReadClient( + credentials=bigquery_client._credentials + ) + + df = bigquery_client.query(query).result().to_dataframe(bqstorage_client) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 10 # verify the number of rows + column_names = ["id", "author", "time_ts", "dead"] + assert list(df) == column_names + exp_datatypes = { + "id": int, + "author": str, + "time_ts": pandas.Timestamp, + "dead": bool, + } + for index, row in df.iterrows(): + for col in column_names: + # all the schema fields are nullable, so None is acceptable + if not pandas.isna(row[col]): + assert isinstance(row[col], exp_datatypes[col]) + + +def test_insert_rows_from_dataframe(bigquery_client, dataset_id): + SF = bigquery.SchemaField + schema = [ + SF("float_col", "FLOAT", mode="REQUIRED"), + SF("int_col", "INTEGER", mode="REQUIRED"), + SF("bool_col", "BOOLEAN", mode="REQUIRED"), + SF("string_col", "STRING", mode="NULLABLE"), + SF("date_col", "DATE", mode="NULLABLE"), + SF("time_col", "TIME", mode="NULLABLE"), + ] + + dataframe = pandas.DataFrame( + [ + { + "float_col": 1.11, + "bool_col": True, + "string_col": "my string", + "int_col": 10, + "date_col": datetime.date(2021, 1, 1), + "time_col": datetime.time(21, 1, 1), + }, + { + "float_col": 2.22, + "bool_col": False, + "string_col": "another string", + "int_col": 20, + "date_col": datetime.date(2021, 1, 2), + "time_col": datetime.time(21, 1, 2), + }, + { + "float_col": 3.33, + "bool_col": False, + "string_col": "another string", + "int_col": 30, + "date_col": datetime.date(2021, 1, 3), + "time_col": datetime.time(21, 1, 3), + }, + { + "float_col": 4.44, + "bool_col": True, + "string_col": "another 
string", + "int_col": 40, + "date_col": datetime.date(2021, 1, 4), + "time_col": datetime.time(21, 1, 4), + }, + { + "float_col": 5.55, + "bool_col": False, + "string_col": "another string", + "int_col": 50, + "date_col": datetime.date(2021, 1, 5), + "time_col": datetime.time(21, 1, 5), + }, + { + "float_col": 6.66, + "bool_col": True, + # Include a NaN value, because pandas often uses NaN as a + # NULL value indicator. + "string_col": float("NaN"), + "int_col": 60, + "date_col": datetime.date(2021, 1, 6), + "time_col": datetime.time(21, 1, 6), + }, + ] + ) + dataframe["date_col"] = dataframe["date_col"].astype("dbdate") + dataframe["time_col"] = dataframe["time_col"].astype("dbtime") + + table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe" + table_arg = bigquery.Table(table_id, schema=schema) + table = helpers.retry_403(bigquery_client.create_table)(table_arg) + + chunk_errors = bigquery_client.insert_rows_from_dataframe( + table, dataframe, chunk_size=3 + ) + for errors in chunk_errors: + assert not errors + expected = [ + # Pandas often represents NULL values as NaN. Convert to None for + # easier comparison. + tuple(None if col != col else col for col in data_row) + for data_row in dataframe.itertuples(index=False) + ] + + # Use query to fetch rows instead of listing directly from the table so + # that we get values from the streaming buffer "within a few seconds". + # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability + @google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type(MissingDataError) + ) + def get_rows(): + rows = list( + bigquery_client.query( + "SELECT * FROM `{}.{}.{}`".format( + table.project, table.dataset_id, table.table_id + ) + ) + ) + if len(rows) != len(expected): + raise MissingDataError() + return rows + + rows = get_rows() + sorted_rows = sorted(rows, key=operator.attrgetter("int_col")) + row_tuples = [r.values() for r in sorted_rows] + + for row, expected_row in zip(row_tuples, expected): + assert ( + # Use Counter to verify the same number of values in each, because + # column order does not matter. 
+ collections.Counter(row) + == collections.Counter(expected_row) + ) + + +def test_nested_table_to_dataframe(bigquery_client, dataset_id): + from google.cloud.bigquery.job import SourceFormat + from google.cloud.bigquery.job import WriteDisposition + + SF = bigquery.SchemaField + schema = [ + SF("string_col", "STRING", mode="NULLABLE"), + SF( + "record_col", + "RECORD", + mode="NULLABLE", + fields=[ + SF("nested_string", "STRING", mode="NULLABLE"), + SF("nested_repeated", "INTEGER", mode="REPEATED"), + SF( + "nested_record", + "RECORD", + mode="NULLABLE", + fields=[SF("nested_nested_string", "STRING", mode="NULLABLE")], + ), + ], + ), + SF("bigfloat_col", "FLOAT", mode="NULLABLE"), + SF("smallfloat_col", "FLOAT", mode="NULLABLE"), + ] + record = { + "nested_string": "another string value", + "nested_repeated": [0, 1, 2], + "nested_record": {"nested_nested_string": "some deep insight"}, + } + to_insert = [ + { + "string_col": "Some value", + "record_col": record, + "bigfloat_col": 3.14, + "smallfloat_col": 2.72, + } + ] + rows = [json.dumps(row) for row in to_insert] + body = io.BytesIO("{}\n".format("\n".join(rows)).encode("ascii")) + table_id = f"{bigquery_client.project}.{dataset_id}.test_nested_table_to_dataframe" + job_config = bigquery.LoadJobConfig() + job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE + job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON + job_config.schema = schema + # Load a table using a local JSON file from memory. + bigquery_client.load_table_from_file(body, table_id, job_config=job_config).result() + + df = bigquery_client.list_rows(table_id, selected_fields=schema).to_dataframe( + dtypes={"smallfloat_col": "float16"} + ) + + assert isinstance(df, pandas.DataFrame) + assert len(df) == 1 # verify the number of rows + exp_columns = ["string_col", "record_col", "bigfloat_col", "smallfloat_col"] + assert list(df) == exp_columns # verify the column names + row = df.iloc[0] + # verify the row content + assert row["string_col"] == "Some value" + expected_keys = tuple(sorted(record.keys())) + row_keys = tuple(sorted(row["record_col"].keys())) + assert row_keys == expected_keys + # Can't compare numpy arrays, which pyarrow encodes the embedded + # repeated column to, so convert to list. + assert list(row["record_col"]["nested_repeated"]) == [0, 1, 2] + # verify that nested data can be accessed with indices/keys + assert row["record_col"]["nested_repeated"][0] == 0 + assert ( + row["record_col"]["nested_record"]["nested_nested_string"] + == "some deep insight" + ) + # verify dtypes + assert df.dtypes["bigfloat_col"].name == "float64" + assert df.dtypes["smallfloat_col"].name == "float16" + + +def test_list_rows_max_results_w_bqstorage(bigquery_client): + table_ref = bigquery.DatasetReference("bigquery-public-data", "utility_us").table( + "country_code_iso" + ) + bqstorage_client = bigquery_storage.BigQueryReadClient( + credentials=bigquery_client._credentials + ) + + row_iterator = bigquery_client.list_rows( + table_ref, + selected_fields=[bigquery.SchemaField("country_name", "STRING")], + max_results=100, + ) + with pytest.warns( + UserWarning, match="Cannot use bqstorage_client if max_results is set" + ): + dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client) + + assert len(dataframe.index) == 100 + + +@pytest.mark.parametrize( + ("max_results",), + ( + (None,), + (10,), + ), # Use BQ Storage API. # Use REST API. 
+) +def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results): + # TODO(GH#836): Avoid INTERVAL columns until they are supported by the + # BigQuery Storage API and pyarrow. + schema = [ + bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), + bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), + bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), + bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), + bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), + bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), + bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), + bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), + bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), + bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), + bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), + bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), + ] + + df = bigquery_client.list_rows( + scalars_table, + max_results=max_results, + selected_fields=schema, + ).to_dataframe() + + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["datetime_col"].name == "datetime64[ns]" + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]" + assert df.dtypes["date_col"].name == "dbdate" + assert df.dtypes["time_col"].name == "dbtime" + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["bignumeric_col"].name == "object" + assert df.dtypes["numeric_col"].name == "object" + + # pandas uses Python string and bytes objects. + assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["string_col"].name == "object" + + +@pytest.mark.parametrize( + ("max_results",), + ( + (None,), + (10,), + ), # Use BQ Storage API. # Use REST API. +) +def test_list_rows_nullable_scalars_extreme_dtypes( + bigquery_client, scalars_extreme_table, max_results +): + # TODO(GH#836): Avoid INTERVAL columns until they are supported by the + # BigQuery Storage API and pyarrow. + schema = [ + bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), + bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), + bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), + bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), + bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), + bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), + bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), + bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), + bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), + bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), + bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), + bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), + ] + + df = bigquery_client.list_rows( + scalars_extreme_table, + max_results=max_results, + selected_fields=schema, + ).to_dataframe() + + # Extreme values are out-of-bounds for pandas datetime64 values, which use + # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must + # be represented with object. 
+ # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations + assert df.dtypes["date_col"].name == "object" + assert df.dtypes["datetime_col"].name == "object" + assert df.dtypes["timestamp_col"].name == "object" + + # These pandas dtypes can handle the same ranges as BigQuery. + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["int64_col"].name == "Int64" + assert df.dtypes["time_col"].name == "dbtime" + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["numeric_col"].name == "object" + assert df.dtypes["bignumeric_col"].name == "object" + + # pandas uses Python string and bytes objects. + assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["string_col"].name == "object" + + +def test_upload_time_and_datetime_56(bigquery_client, dataset_id): + df = pandas.DataFrame( + dict( + dt=[ + datetime.datetime(2020, 1, 8, 8, 0, 0), + datetime.datetime( + 2020, + 1, + 8, + 8, + 0, + 0, + tzinfo=datetime.timezone(datetime.timedelta(hours=-7)), + ), + ], + t=[datetime.time(0, 0, 10, 100001), None], + ) + ) + table = f"{dataset_id}.test_upload_time_and_datetime" + bigquery_client.load_table_from_dataframe(df, table).result() + data = list(map(list, bigquery_client.list_rows(table))) + assert data == [ + [ + datetime.datetime(2020, 1, 8, 8, 0, tzinfo=datetime.timezone.utc), + datetime.time(0, 0, 10, 100001), + ], + [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=datetime.timezone.utc), None], + ] + + from google.cloud.bigquery import job, schema + + table = f"{dataset_id}.test_upload_time_and_datetime_dt" + config = job.LoadJobConfig( + schema=[schema.SchemaField("dt", "DATETIME"), schema.SchemaField("t", "TIME")] + ) + + bigquery_client.load_table_from_dataframe(df, table, job_config=config).result() + data = list(map(list, bigquery_client.list_rows(table))) + assert data == [ + [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)], + [datetime.datetime(2020, 1, 8, 15, 0), None], + ] + + +def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + bigquery_client.query( + f"create table {dataset_id}.lake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.lake (name, geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('point(0 1)')), + ('baz', null) + """ + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.lake order by name" + ).to_dataframe(geography_as_object=True) + assert list(df["name"]) == ["bar", "baz", "foo"] + assert df["geog"][0] == wkt.loads("point(0 1)") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") + + +def test_to_geodataframe(bigquery_client, dataset_id): + geopandas = pytest.importorskip("geopandas") + from shapely import wkt + + bigquery_client.query( + f"create table {dataset_id}.geolake (name string, geog geography)" + ).result() + bigquery_client.query( + f""" + insert into {dataset_id}.geolake (name, geog) values + ('foo', st_geogfromtext('point(0 0)')), + ('bar', st_geogfromtext('polygon((0 0, 1 0, 1 1, 0 0))')), + ('baz', null) + """ + ).result() + df = bigquery_client.query( + f"select * from {dataset_id}.geolake order by name" + ).to_geodataframe() + assert df["geog"][0] == wkt.loads("polygon((0 0, 1 0, 1 1, 0 0))") + assert pandas.isna(df["geog"][1]) + assert df["geog"][2] == wkt.loads("point(0 0)") + assert isinstance(df, 
geopandas.GeoDataFrame) + assert isinstance(df["geog"], geopandas.GeoSeries) + + with warnings.catch_warnings(): + # Computing the area on a GeoDataFrame that uses a geographic Coordinate + # Reference System (CRS) produces a warning that we are not interested in. + # We do not mind if the computed area is incorrect with respect to the + # GeoDataFrame data, as long as it matches the expected "incorrect" value. + warnings.filterwarnings("ignore", category=UserWarning) + assert df.area[0] == 0.5 + assert pandas.isna(df.area[1]) + assert df.area[2] == 0.0 + + assert df.crs.srs == "EPSG:4326" + assert df.crs.name == "WGS 84" + assert df.geog.crs.srs == "EPSG:4326" + assert df.geog.crs.name == "WGS 84" + + +def test_load_geodataframe(bigquery_client, dataset_id): + geopandas = pytest.importorskip("geopandas") + import pandas + from shapely import wkt + from google.cloud.bigquery.schema import SchemaField + + df = geopandas.GeoDataFrame( + pandas.DataFrame( + dict( + name=["foo", "bar"], + geo1=[None, None], + geo2=[None, wkt.loads("Point(1 1)")], + ) + ), + geometry="geo1", + ) + + table_id = f"{dataset_id}.lake_from_gp" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo1", "GEOGRAPHY", "NULLABLE"), + SchemaField("geo2", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", None, "POINT(1 1)"], + ["foo", None, None], + ] + + +def test_load_dataframe_w_shapely(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + from google.cloud.bigquery.schema import SchemaField + + df = pandas.DataFrame( + dict(name=["foo", "bar"], geo=[None, wkt.loads("Point(1 1)")]) + ) + + table_id = f"{dataset_id}.lake_from_shapes" + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["foo", None], + ] + + bigquery_client.load_table_from_dataframe(df, table_id).result() + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["bar", "POINT(1 1)"], + ["foo", None], + ["foo", None], + ] + + +def test_load_dataframe_w_wkb(bigquery_client, dataset_id): + wkt = pytest.importorskip("shapely.wkt") + from shapely import wkb + from google.cloud.bigquery.schema import SchemaField + + df = pandas.DataFrame( + dict(name=["foo", "bar"], geo=[None, wkb.dumps(wkt.loads("Point(1 1)"))]) + ) + + table_id = f"{dataset_id}.lake_from_wkb" + # We create the table first, to inform the interpretation of the wkb data + bigquery_client.query( + f"create table {table_id} (name string, geo GEOGRAPHY)" + ).result() + bigquery_client.load_table_from_dataframe(df, table_id).result() + + table = bigquery_client.get_table(table_id) + assert table.schema == [ + SchemaField("name", "STRING", "NULLABLE"), + SchemaField("geo", "GEOGRAPHY", "NULLABLE"), + ] + assert sorted(map(list, bigquery_client.list_rows(table_id))) == [ + ["bar", "POINT(1 1)"], + ["foo", None], + ] diff --git a/tests/system/test_query.py b/tests/system/test_query.py index 5dc7c7875..723f927d7 100644 --- a/tests/system/test_query.py +++ b/tests/system/test_query.py @@ -1,503 +1,503 @@ -# # Copyright 2021 Google LLC -# # -# # Licensed under the 
Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. - -# import concurrent.futures -# import datetime -# import decimal -# from typing import Tuple - -# from google.api_core import exceptions -# import pytest - -# from google.cloud import bigquery -# from google.cloud.bigquery.query import ArrayQueryParameter -# from google.cloud.bigquery.query import ScalarQueryParameter -# from google.cloud.bigquery.query import ScalarQueryParameterType -# from google.cloud.bigquery.query import StructQueryParameter -# from google.cloud.bigquery.query import StructQueryParameterType - - -# @pytest.fixture(params=["INSERT", "QUERY"]) -# def query_api_method(request): -# return request.param - - -# @pytest.fixture(scope="session") -# def table_with_9999_columns_10_rows(bigquery_client, project_id, dataset_id): -# """Generate a table of maximum width via CREATE TABLE AS SELECT. - -# The first column is named 'rowval', and has a value from 1..rowcount -# Subsequent columns are named col_ and contain the value N*rowval, where -# N is between 1 and 9999 inclusive. -# """ -# table_id = "many_columns" -# row_count = 10 -# col_projections = ",".join(f"r * {n} as col_{n}" for n in range(1, 10000)) -# sql = f""" -# CREATE TABLE `{project_id}.{dataset_id}.{table_id}` -# AS -# SELECT -# r as rowval, -# {col_projections} -# FROM -# UNNEST(GENERATE_ARRAY(1,{row_count},1)) as r -# """ -# query_job = bigquery_client.query(sql) -# query_job.result() - -# return f"{project_id}.{dataset_id}.{table_id}" - - -# def test_query_many_columns( -# bigquery_client, table_with_9999_columns_10_rows, query_api_method -# ): -# # Test working with the widest schema BigQuery supports, 10k columns. -# query_job = bigquery_client.query( -# f"SELECT * FROM `{table_with_9999_columns_10_rows}`", -# api_method=query_api_method, -# ) -# rows = list(query_job) -# assert len(rows) == 10 - -# # check field representations adhere to expected values. -# for row in rows: -# rowval = row["rowval"] -# for column in range(1, 10000): -# assert row[f"col_{column}"] == rowval * column - - -# def test_query_w_timeout(bigquery_client, query_api_method): -# job_config = bigquery.QueryJobConfig() -# job_config.use_query_cache = False - -# query_job = bigquery_client.query( -# "SELECT * FROM `bigquery-public-data.github_repos.commits`;", -# location="US", -# job_config=job_config, -# api_method=query_api_method, -# ) - -# with pytest.raises(concurrent.futures.TimeoutError): -# query_job.result(timeout=1) - -# # Even though the query takes >1 second, the call to getQueryResults -# # should succeed. -# assert not query_job.done(timeout=1) -# assert bigquery_client.cancel_job(query_job) is not None - - -# def test_query_statistics(bigquery_client, query_api_method): -# """ -# A system test to exercise some of the extended query statistics. - -# Note: We construct a query that should need at least three stages by -# specifying a JOIN query. Exact plan and stats are effectively -# non-deterministic, so we're largely interested in confirming values -# are present. 
-# """ - -# job_config = bigquery.QueryJobConfig() -# job_config.use_query_cache = False - -# query_job = bigquery_client.query( -# """ -# SELECT -# COUNT(1) -# FROM -# ( -# SELECT -# year, -# wban_number -# FROM `bigquery-public-data.samples.gsod` -# LIMIT 1000 -# ) lside -# INNER JOIN -# ( -# SELECT -# year, -# state -# FROM `bigquery-public-data.samples.natality` -# LIMIT 1000 -# ) rside -# ON -# lside.year = rside.year -# """, -# location="US", -# job_config=job_config, -# api_method=query_api_method, -# ) - -# # run the job to completion -# query_job.result() - -# # Must reload job to get stats if jobs.query was used. -# if query_api_method == "QUERY": -# query_job.reload() - -# # Assert top-level stats -# assert not query_job.cache_hit -# assert query_job.destination is not None -# assert query_job.done -# assert not query_job.dry_run -# assert query_job.num_dml_affected_rows is None -# assert query_job.priority == "INTERACTIVE" -# assert query_job.total_bytes_billed > 1 -# assert query_job.total_bytes_processed > 1 -# assert query_job.statement_type == "SELECT" -# assert query_job.slot_millis > 1 - -# # Make assertions on the shape of the query plan. -# plan = query_job.query_plan -# assert len(plan) >= 3 -# first_stage = plan[0] -# assert first_stage.start is not None -# assert first_stage.end is not None -# assert first_stage.entry_id is not None -# assert first_stage.name is not None -# assert first_stage.parallel_inputs > 0 -# assert first_stage.completed_parallel_inputs > 0 -# assert first_stage.shuffle_output_bytes > 0 -# assert first_stage.status == "COMPLETE" - -# # Query plan is a digraph. Ensure it has inter-stage links, -# # but not every stage has inputs. -# stages_with_inputs = 0 -# for entry in plan: -# if len(entry.input_stages) > 0: -# stages_with_inputs = stages_with_inputs + 1 -# assert stages_with_inputs > 0 -# assert len(plan) > stages_with_inputs - - -# @pytest.mark.parametrize( -# ("sql", "expected", "query_parameters"), -# ( -# ( -# "SELECT @question", -# "What is the answer to life, the universe, and everything?", -# [ -# ScalarQueryParameter( -# name="question", -# type_="STRING", -# value="What is the answer to life, the universe, and everything?", -# ) -# ], -# ), -# ( -# "SELECT @answer", -# 42, -# [ScalarQueryParameter(name="answer", type_="INT64", value=42)], -# ), -# ( -# "SELECT @pi", -# 3.1415926, -# [ScalarQueryParameter(name="pi", type_="FLOAT64", value=3.1415926)], -# ), -# ( -# "SELECT @pi_numeric_param", -# decimal.Decimal("3.141592654"), -# [ -# ScalarQueryParameter( -# name="pi_numeric_param", -# type_="NUMERIC", -# value=decimal.Decimal("3.141592654"), -# ) -# ], -# ), -# ( -# "SELECT @bignum_param", -# decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), -# [ -# ScalarQueryParameter( -# name="bignum_param", -# type_="BIGNUMERIC", -# value=decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), -# ) -# ], -# ), -# ( -# "SELECT @truthy", -# True, -# [ScalarQueryParameter(name="truthy", type_="BOOL", value=True)], -# ), -# ( -# "SELECT @beef", -# b"DEADBEEF", -# [ScalarQueryParameter(name="beef", type_="BYTES", value=b"DEADBEEF")], -# ), -# ( -# "SELECT @naive", -# datetime.datetime(2016, 12, 5, 12, 41, 9), -# [ -# ScalarQueryParameter( -# name="naive", -# type_="DATETIME", -# value=datetime.datetime(2016, 12, 5, 12, 41, 9), -# ) -# ], -# ), -# ( -# "SELECT @naive_date", -# datetime.date(2016, 12, 5), -# [ -# ScalarQueryParameter( -# name="naive_date", type_="DATE", value=datetime.date(2016, 12, 5) -# ) -# ], -# ), -# ( -# "SELECT @naive_time", 
-# datetime.time(12, 41, 9, 62500), -# [ -# ScalarQueryParameter( -# name="naive_time", -# type_="TIME", -# value=datetime.time(12, 41, 9, 62500), -# ) -# ], -# ), -# ( -# "SELECT @zoned", -# datetime.datetime(2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc), -# [ -# ScalarQueryParameter( -# name="zoned", -# type_="TIMESTAMP", -# value=datetime.datetime( -# 2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc -# ), -# ) -# ], -# ), -# ( -# "SELECT @array_param", -# [1, 2], -# [ -# ArrayQueryParameter( -# name="array_param", array_type="INT64", values=[1, 2] -# ) -# ], -# ), -# ( -# "SELECT (@hitchhiker.question, @hitchhiker.answer)", -# ({"_field_1": "What is the answer?", "_field_2": 42}), -# [ -# StructQueryParameter( -# "hitchhiker", -# ScalarQueryParameter( -# name="question", -# type_="STRING", -# value="What is the answer?", -# ), -# ScalarQueryParameter( -# name="answer", -# type_="INT64", -# value=42, -# ), -# ), -# ], -# ), -# ( -# "SELECT " -# "((@rectangle.bottom_right.x - @rectangle.top_left.x) " -# "* (@rectangle.top_left.y - @rectangle.bottom_right.y))", -# 100, -# [ -# StructQueryParameter( -# "rectangle", -# StructQueryParameter( -# "top_left", -# ScalarQueryParameter("x", "INT64", 12), -# ScalarQueryParameter("y", "INT64", 102), -# ), -# StructQueryParameter( -# "bottom_right", -# ScalarQueryParameter("x", "INT64", 22), -# ScalarQueryParameter("y", "INT64", 92), -# ), -# ) -# ], -# ), -# ( -# "SELECT ?", -# [ -# {"name": "Phred Phlyntstone", "age": 32}, -# {"name": "Bharney Rhubbyl", "age": 31}, -# ], -# [ -# ArrayQueryParameter( -# name=None, -# array_type="RECORD", -# values=[ -# StructQueryParameter( -# None, -# ScalarQueryParameter( -# name="name", type_="STRING", value="Phred Phlyntstone" -# ), -# ScalarQueryParameter(name="age", type_="INT64", value=32), -# ), -# StructQueryParameter( -# None, -# ScalarQueryParameter( -# name="name", type_="STRING", value="Bharney Rhubbyl" -# ), -# ScalarQueryParameter(name="age", type_="INT64", value=31), -# ), -# ], -# ) -# ], -# ), -# ( -# "SELECT @empty_array_param", -# [], -# [ -# ArrayQueryParameter( -# name="empty_array_param", -# values=[], -# array_type=StructQueryParameterType( -# ScalarQueryParameterType(name="foo", type_="INT64"), -# ScalarQueryParameterType(name="bar", type_="STRING"), -# ), -# ) -# ], -# ), -# ( -# "SELECT @roles", -# { -# "hero": {"name": "Phred Phlyntstone", "age": 32}, -# "sidekick": {"name": "Bharney Rhubbyl", "age": 31}, -# }, -# [ -# StructQueryParameter( -# "roles", -# StructQueryParameter( -# "hero", -# ScalarQueryParameter( -# name="name", type_="STRING", value="Phred Phlyntstone" -# ), -# ScalarQueryParameter(name="age", type_="INT64", value=32), -# ), -# StructQueryParameter( -# "sidekick", -# ScalarQueryParameter( -# name="name", type_="STRING", value="Bharney Rhubbyl" -# ), -# ScalarQueryParameter(name="age", type_="INT64", value=31), -# ), -# ), -# ], -# ), -# ( -# "SELECT ?", -# {"friends": ["Jack", "Jill"]}, -# [ -# StructQueryParameter( -# None, -# ArrayQueryParameter( -# name="friends", array_type="STRING", values=["Jack", "Jill"] -# ), -# ) -# ], -# ), -# ), -# ) -# def test_query_parameters( -# bigquery_client, query_api_method, sql, expected, query_parameters -# ): -# jconfig = bigquery.QueryJobConfig() -# jconfig.query_parameters = query_parameters -# query_job = bigquery_client.query( -# sql, -# job_config=jconfig, -# api_method=query_api_method, -# ) -# rows = list(query_job.result()) -# assert len(rows) == 1 -# assert len(rows[0]) == 1 -# assert rows[0][0] == expected - - 
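All of the parameter cases above are funneled through `QueryJobConfig.query_parameters`; stripped of the parametrization, the pattern looks roughly like the sketch below, which assumes an authenticated client equivalent to the `bigquery_client` fixture and is illustrative only.

# Illustrative sketch: binding a single named scalar parameter.
from google.cloud import bigquery

client = bigquery.Client()

job_config = bigquery.QueryJobConfig()
job_config.query_parameters = [
    bigquery.ScalarQueryParameter("answer", "INT64", 42),
]
rows = list(client.query("SELECT @answer", job_config=job_config).result())
assert rows[0][0] == 42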
-# def test_dry_run( -# bigquery_client: bigquery.Client, -# query_api_method: str, -# scalars_table_multi_location: Tuple[str, str], -# ): -# location, full_table_id = scalars_table_multi_location -# query_config = bigquery.QueryJobConfig() -# query_config.dry_run = True - -# query_string = f"SELECT * FROM {full_table_id}" -# query_job = bigquery_client.query( -# query_string, -# location=location, -# job_config=query_config, -# api_method=query_api_method, -# ) - -# # Note: `query_job.result()` is not necessary on a dry run query. All -# # necessary information is returned in the initial response. -# assert query_job.dry_run is True -# assert query_job.total_bytes_processed > 0 -# assert len(query_job.schema) > 0 - - -# def test_query_error_w_api_method_query(bigquery_client: bigquery.Client): -# """No job is returned from jobs.query if the query fails.""" - -# with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): -# bigquery_client.query( -# "SELECT * FROM not_a_real_dataset.doesnt_exist", api_method="QUERY" -# ) - - -# def test_query_error_w_api_method_default(bigquery_client: bigquery.Client): -# """Test that an exception is not thrown until fetching the results. - -# For backwards compatibility, jobs.insert is the default API method. With -# jobs.insert, a failed query job is "sucessfully" created. An exception is -# thrown when fetching the results. -# """ - -# query_job = bigquery_client.query("SELECT * FROM not_a_real_dataset.doesnt_exist") - -# with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): -# query_job.result() - - -# def test_session(bigquery_client: bigquery.Client, query_api_method: str): -# initial_config = bigquery.QueryJobConfig() -# initial_config.create_session = True -# initial_query = """ -# CREATE TEMPORARY TABLE numbers(id INT64) -# AS -# SELECT * FROM UNNEST([1, 2, 3, 4, 5]) AS id; -# """ -# initial_job = bigquery_client.query( -# initial_query, job_config=initial_config, api_method=query_api_method -# ) -# initial_job.result() -# session_id = initial_job.session_info.session_id -# assert session_id is not None - -# second_config = bigquery.QueryJobConfig() -# second_config.connection_properties = [ -# bigquery.ConnectionProperty("session_id", session_id), -# ] -# second_job = bigquery_client.query( -# "SELECT COUNT(*) FROM numbers;", job_config=second_config -# ) -# rows = list(second_job.result()) - -# assert len(rows) == 1 -# assert rows[0][0] == 5 +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
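The `query_api_method` fixture defined below drives every test in this module through both REST code paths; roughly, and assuming an authenticated client, the choice is passed straight to `Client.query` as in this illustrative sketch.

# Illustrative sketch: "INSERT" (the default) creates the job via jobs.insert,
# while "QUERY" issues the same SQL through the jobs.query endpoint.
from google.cloud import bigquery

client = bigquery.Client()
query_job = client.query("SELECT 1 AS x", api_method="QUERY")
assert list(query_job.result())[0]["x"] == 1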
+ +import concurrent.futures +import datetime +import decimal +from typing import Tuple + +from google.api_core import exceptions +import pytest + +from google.cloud import bigquery +from google.cloud.bigquery.query import ArrayQueryParameter +from google.cloud.bigquery.query import ScalarQueryParameter +from google.cloud.bigquery.query import ScalarQueryParameterType +from google.cloud.bigquery.query import StructQueryParameter +from google.cloud.bigquery.query import StructQueryParameterType + + +@pytest.fixture(params=["INSERT", "QUERY"]) +def query_api_method(request): + return request.param + + +@pytest.fixture(scope="session") +def table_with_9999_columns_10_rows(bigquery_client, project_id, dataset_id): + """Generate a table of maximum width via CREATE TABLE AS SELECT. + + The first column is named 'rowval', and has a value from 1..rowcount + Subsequent columns are named col_ and contain the value N*rowval, where + N is between 1 and 9999 inclusive. + """ + table_id = "many_columns" + row_count = 10 + col_projections = ",".join(f"r * {n} as col_{n}" for n in range(1, 10000)) + sql = f""" + CREATE TABLE `{project_id}.{dataset_id}.{table_id}` + AS + SELECT + r as rowval, + {col_projections} + FROM + UNNEST(GENERATE_ARRAY(1,{row_count},1)) as r + """ + query_job = bigquery_client.query(sql) + query_job.result() + + return f"{project_id}.{dataset_id}.{table_id}" + + +def test_query_many_columns( + bigquery_client, table_with_9999_columns_10_rows, query_api_method +): + # Test working with the widest schema BigQuery supports, 10k columns. + query_job = bigquery_client.query( + f"SELECT * FROM `{table_with_9999_columns_10_rows}`", + api_method=query_api_method, + ) + rows = list(query_job) + assert len(rows) == 10 + + # check field representations adhere to expected values. + for row in rows: + rowval = row["rowval"] + for column in range(1, 10000): + assert row[f"col_{column}"] == rowval * column + + +def test_query_w_timeout(bigquery_client, query_api_method): + job_config = bigquery.QueryJobConfig() + job_config.use_query_cache = False + + query_job = bigquery_client.query( + "SELECT * FROM `bigquery-public-data.github_repos.commits`;", + location="US", + job_config=job_config, + api_method=query_api_method, + ) + + with pytest.raises(concurrent.futures.TimeoutError): + query_job.result(timeout=1) + + # Even though the query takes >1 second, the call to getQueryResults + # should succeed. + assert not query_job.done(timeout=1) + assert bigquery_client.cancel_job(query_job) is not None + + +def test_query_statistics(bigquery_client, query_api_method): + """ + A system test to exercise some of the extended query statistics. + + Note: We construct a query that should need at least three stages by + specifying a JOIN query. Exact plan and stats are effectively + non-deterministic, so we're largely interested in confirming values + are present. + """ + + job_config = bigquery.QueryJobConfig() + job_config.use_query_cache = False + + query_job = bigquery_client.query( + """ + SELECT + COUNT(1) + FROM + ( + SELECT + year, + wban_number + FROM `bigquery-public-data.samples.gsod` + LIMIT 1000 + ) lside + INNER JOIN + ( + SELECT + year, + state + FROM `bigquery-public-data.samples.natality` + LIMIT 1000 + ) rside + ON + lside.year = rside.year + """, + location="US", + job_config=job_config, + api_method=query_api_method, + ) + + # run the job to completion + query_job.result() + + # Must reload job to get stats if jobs.query was used. 
+ if query_api_method == "QUERY": + query_job.reload() + + # Assert top-level stats + assert not query_job.cache_hit + assert query_job.destination is not None + assert query_job.done + assert not query_job.dry_run + assert query_job.num_dml_affected_rows is None + assert query_job.priority == "INTERACTIVE" + assert query_job.total_bytes_billed > 1 + assert query_job.total_bytes_processed > 1 + assert query_job.statement_type == "SELECT" + assert query_job.slot_millis > 1 + + # Make assertions on the shape of the query plan. + plan = query_job.query_plan + assert len(plan) >= 3 + first_stage = plan[0] + assert first_stage.start is not None + assert first_stage.end is not None + assert first_stage.entry_id is not None + assert first_stage.name is not None + assert first_stage.parallel_inputs > 0 + assert first_stage.completed_parallel_inputs > 0 + assert first_stage.shuffle_output_bytes > 0 + assert first_stage.status == "COMPLETE" + + # Query plan is a digraph. Ensure it has inter-stage links, + # but not every stage has inputs. + stages_with_inputs = 0 + for entry in plan: + if len(entry.input_stages) > 0: + stages_with_inputs = stages_with_inputs + 1 + assert stages_with_inputs > 0 + assert len(plan) > stages_with_inputs + + +@pytest.mark.parametrize( + ("sql", "expected", "query_parameters"), + ( + ( + "SELECT @question", + "What is the answer to life, the universe, and everything?", + [ + ScalarQueryParameter( + name="question", + type_="STRING", + value="What is the answer to life, the universe, and everything?", + ) + ], + ), + ( + "SELECT @answer", + 42, + [ScalarQueryParameter(name="answer", type_="INT64", value=42)], + ), + ( + "SELECT @pi", + 3.1415926, + [ScalarQueryParameter(name="pi", type_="FLOAT64", value=3.1415926)], + ), + ( + "SELECT @pi_numeric_param", + decimal.Decimal("3.141592654"), + [ + ScalarQueryParameter( + name="pi_numeric_param", + type_="NUMERIC", + value=decimal.Decimal("3.141592654"), + ) + ], + ), + ( + "SELECT @bignum_param", + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + [ + ScalarQueryParameter( + name="bignum_param", + type_="BIGNUMERIC", + value=decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + ) + ], + ), + ( + "SELECT @truthy", + True, + [ScalarQueryParameter(name="truthy", type_="BOOL", value=True)], + ), + ( + "SELECT @beef", + b"DEADBEEF", + [ScalarQueryParameter(name="beef", type_="BYTES", value=b"DEADBEEF")], + ), + ( + "SELECT @naive", + datetime.datetime(2016, 12, 5, 12, 41, 9), + [ + ScalarQueryParameter( + name="naive", + type_="DATETIME", + value=datetime.datetime(2016, 12, 5, 12, 41, 9), + ) + ], + ), + ( + "SELECT @naive_date", + datetime.date(2016, 12, 5), + [ + ScalarQueryParameter( + name="naive_date", type_="DATE", value=datetime.date(2016, 12, 5) + ) + ], + ), + ( + "SELECT @naive_time", + datetime.time(12, 41, 9, 62500), + [ + ScalarQueryParameter( + name="naive_time", + type_="TIME", + value=datetime.time(12, 41, 9, 62500), + ) + ], + ), + ( + "SELECT @zoned", + datetime.datetime(2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc), + [ + ScalarQueryParameter( + name="zoned", + type_="TIMESTAMP", + value=datetime.datetime( + 2016, 12, 5, 12, 41, 9, tzinfo=datetime.timezone.utc + ), + ) + ], + ), + ( + "SELECT @array_param", + [1, 2], + [ + ArrayQueryParameter( + name="array_param", array_type="INT64", values=[1, 2] + ) + ], + ), + ( + "SELECT (@hitchhiker.question, @hitchhiker.answer)", + ({"_field_1": "What is the answer?", "_field_2": 42}), + [ + StructQueryParameter( + "hitchhiker", + ScalarQueryParameter( + 
name="question", + type_="STRING", + value="What is the answer?", + ), + ScalarQueryParameter( + name="answer", + type_="INT64", + value=42, + ), + ), + ], + ), + ( + "SELECT " + "((@rectangle.bottom_right.x - @rectangle.top_left.x) " + "* (@rectangle.top_left.y - @rectangle.bottom_right.y))", + 100, + [ + StructQueryParameter( + "rectangle", + StructQueryParameter( + "top_left", + ScalarQueryParameter("x", "INT64", 12), + ScalarQueryParameter("y", "INT64", 102), + ), + StructQueryParameter( + "bottom_right", + ScalarQueryParameter("x", "INT64", 22), + ScalarQueryParameter("y", "INT64", 92), + ), + ) + ], + ), + ( + "SELECT ?", + [ + {"name": "Phred Phlyntstone", "age": 32}, + {"name": "Bharney Rhubbyl", "age": 31}, + ], + [ + ArrayQueryParameter( + name=None, + array_type="RECORD", + values=[ + StructQueryParameter( + None, + ScalarQueryParameter( + name="name", type_="STRING", value="Phred Phlyntstone" + ), + ScalarQueryParameter(name="age", type_="INT64", value=32), + ), + StructQueryParameter( + None, + ScalarQueryParameter( + name="name", type_="STRING", value="Bharney Rhubbyl" + ), + ScalarQueryParameter(name="age", type_="INT64", value=31), + ), + ], + ) + ], + ), + ( + "SELECT @empty_array_param", + [], + [ + ArrayQueryParameter( + name="empty_array_param", + values=[], + array_type=StructQueryParameterType( + ScalarQueryParameterType(name="foo", type_="INT64"), + ScalarQueryParameterType(name="bar", type_="STRING"), + ), + ) + ], + ), + ( + "SELECT @roles", + { + "hero": {"name": "Phred Phlyntstone", "age": 32}, + "sidekick": {"name": "Bharney Rhubbyl", "age": 31}, + }, + [ + StructQueryParameter( + "roles", + StructQueryParameter( + "hero", + ScalarQueryParameter( + name="name", type_="STRING", value="Phred Phlyntstone" + ), + ScalarQueryParameter(name="age", type_="INT64", value=32), + ), + StructQueryParameter( + "sidekick", + ScalarQueryParameter( + name="name", type_="STRING", value="Bharney Rhubbyl" + ), + ScalarQueryParameter(name="age", type_="INT64", value=31), + ), + ), + ], + ), + ( + "SELECT ?", + {"friends": ["Jack", "Jill"]}, + [ + StructQueryParameter( + None, + ArrayQueryParameter( + name="friends", array_type="STRING", values=["Jack", "Jill"] + ), + ) + ], + ), + ), +) +def test_query_parameters( + bigquery_client, query_api_method, sql, expected, query_parameters +): + jconfig = bigquery.QueryJobConfig() + jconfig.query_parameters = query_parameters + query_job = bigquery_client.query( + sql, + job_config=jconfig, + api_method=query_api_method, + ) + rows = list(query_job.result()) + assert len(rows) == 1 + assert len(rows[0]) == 1 + assert rows[0][0] == expected + + +def test_dry_run( + bigquery_client: bigquery.Client, + query_api_method: str, + scalars_table_multi_location: Tuple[str, str], +): + location, full_table_id = scalars_table_multi_location + query_config = bigquery.QueryJobConfig() + query_config.dry_run = True + + query_string = f"SELECT * FROM {full_table_id}" + query_job = bigquery_client.query( + query_string, + location=location, + job_config=query_config, + api_method=query_api_method, + ) + + # Note: `query_job.result()` is not necessary on a dry run query. All + # necessary information is returned in the initial response. 
+ assert query_job.dry_run is True + assert query_job.total_bytes_processed > 0 + assert len(query_job.schema) > 0 + + +def test_query_error_w_api_method_query(bigquery_client: bigquery.Client): + """No job is returned from jobs.query if the query fails.""" + + with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): + bigquery_client.query( + "SELECT * FROM not_a_real_dataset.doesnt_exist", api_method="QUERY" + ) + + +def test_query_error_w_api_method_default(bigquery_client: bigquery.Client): + """Test that an exception is not thrown until fetching the results. + + For backwards compatibility, jobs.insert is the default API method. With + jobs.insert, a failed query job is "sucessfully" created. An exception is + thrown when fetching the results. + """ + + query_job = bigquery_client.query("SELECT * FROM not_a_real_dataset.doesnt_exist") + + with pytest.raises(exceptions.NotFound, match="not_a_real_dataset"): + query_job.result() + + +def test_session(bigquery_client: bigquery.Client, query_api_method: str): + initial_config = bigquery.QueryJobConfig() + initial_config.create_session = True + initial_query = """ + CREATE TEMPORARY TABLE numbers(id INT64) + AS + SELECT * FROM UNNEST([1, 2, 3, 4, 5]) AS id; + """ + initial_job = bigquery_client.query( + initial_query, job_config=initial_config, api_method=query_api_method + ) + initial_job.result() + session_id = initial_job.session_info.session_id + assert session_id is not None + + second_config = bigquery.QueryJobConfig() + second_config.connection_properties = [ + bigquery.ConnectionProperty("session_id", session_id), + ] + second_job = bigquery_client.query( + "SELECT COUNT(*) FROM numbers;", job_config=second_config + ) + rows = list(second_job.result()) + + assert len(rows) == 1 + assert rows[0][0] == 5 diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 842af6d55..c2219a56a 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -25,7 +25,6 @@ bigquery_storage = None -@unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): def tearDown(self): from google.cloud.bigquery import _helpers From fd0c40c19a83a3bc62fda54cc2a0d4e33731ee77 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 8 Jul 2022 11:54:21 -0700 Subject: [PATCH 04/47] add test pyarrow skips --- google/cloud/bigquery/_pandas_helpers.py | 4 +-- google/cloud/bigquery/client.py | 4 +-- google/cloud/bigquery/exceptions.py | 5 +++ google/cloud/bigquery/table.py | 4 +-- tests/unit/test_table.py | 39 ++++++++++++++++++++++-- 5 files changed, 48 insertions(+), 8 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 3dd3d5a80..5d237815e 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -47,8 +47,8 @@ date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype try: - import pyarrow # type: ignore - import pyarrow.parquet # type: ignore + import pyarrow # type: ignore + import pyarrow.parquet # type: ignore except ImportError: # pragma: NO COVER pyarrow = None diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 142f1e305..85e4b8c1e 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -46,7 +46,7 @@ import warnings try: - import pyarrow # type: ignore + import pyarrow # type: ignore _PYARROW_VERSION = 
packaging.version.parse(pyarrow.__version__) except ImportError: # pragma: NO COVER @@ -70,7 +70,7 @@ DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO, ) except ImportError: - DEFAULT_BQSTORAGE_CLIENT_INFO = None # type: ignore + DEFAULT_BQSTORAGE_CLIENT_INFO = None # type: ignore from google.cloud.bigquery import _job_helpers diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py index ed2e59bb8..2bab97fea 100644 --- a/google/cloud/bigquery/exceptions.py +++ b/google/cloud/bigquery/exceptions.py @@ -16,5 +16,10 @@ class BigQueryError(Exception): """Base class for all custom exceptions defined by the BigQuery client.""" + +class LegacyBigQueryStorageError(BigQueryError): + """Raised when too old a version of BigQuery Storage extra is detected at runtime.""" + + class LegacyPyarrowError(BigQueryError): """Raised when too old a version of pyarrow package is detected at runtime.""" diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 9d0cd6544..885acbe92 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -30,7 +30,7 @@ pandas = None try: - import pyarrow # type: ignore + import pyarrow # type: ignore except ImportError: # pragma: NO COVER pyarrow = None @@ -66,7 +66,7 @@ # they are not None, avoiding false "no attribute" errors. import pandas import pyarrow - import geopandas # type: ignore + import geopandas # type: ignore from google.cloud import bigquery_storage from google.cloud.bigquery.dataset import DatasetReference diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index b5f2e58c6..9f2ca96da 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -21,8 +21,7 @@ import warnings import mock -import pyarrow -import pyarrow.types +import pkg_resources import pytest import google.api_core.exceptions @@ -34,6 +33,15 @@ grpc as big_query_read_grpc_transport, ) +try: + import pyarrow + import pyarrow.types + + PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) +except ImportError: # pragma: NO COVER + pyarrow = None + PYARROW_VERSION = pkg_resources.parse_version("0.0.1") + try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -1899,6 +1907,13 @@ def test_total_rows_eq_zero(self): row_iterator = self._make_one() self.assertEqual(row_iterator.total_rows, 0) + @mock.patch("google.cloud.bigquery.table.pyarrow", new=None) + def test_to_arrow_error_if_pyarrow_is_none(self): + row_iterator = self._make_one() + with self.assertRaises(ValueError): + row_iterator.to_arrow() + + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow(self): row_iterator = self._make_one() tbl = row_iterator.to_arrow() @@ -2442,6 +2457,7 @@ def test_to_arrow(self): ], ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_nulls(self): from google.cloud.bigquery.schema import SchemaField @@ -2474,6 +2490,7 @@ def test_to_arrow_w_nulls(self): self.assertEqual(names, ["Donkey", "Diddy", "Dixie", None]) self.assertEqual(ages, [32, 29, None, 111]) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_unknown_type(self): from google.cloud.bigquery.schema import SchemaField @@ -2516,6 +2533,7 @@ def test_to_arrow_w_unknown_type(self): warning = warned[0] self.assertTrue("sport" in str(warning)) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_empty_table(self): from google.cloud.bigquery.schema import SchemaField @@ -2554,6 +2572,7 @@ def 
test_to_arrow_w_empty_table(self): self.assertEqual(child_field.type.value_type[0].name, "name") self.assertEqual(child_field.type.value_type[1].name, "age") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -2630,6 +2649,7 @@ def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): self.assertFalse(matches) mock_client._ensure_bqstorage_client.assert_not_called() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_bqstorage(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2707,6 +2727,7 @@ def test_to_arrow_w_bqstorage(self): # Don't close the client if it was passed in. bqstorage_client._transport.grpc_channel.close.assert_not_called() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_bqstorage_creates_client(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2734,6 +2755,7 @@ def test_to_arrow_w_bqstorage_creates_client(self): mock_client._ensure_bqstorage_client.assert_called_once() bqstorage_client._transport.grpc_channel.close.assert_called_once() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): from google.cloud.bigquery.schema import SchemaField @@ -2760,6 +2782,7 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): self.assertIsInstance(tbl, pyarrow.Table) self.assertEqual(tbl.num_rows, 2) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_w_bqstorage_no_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2796,6 +2819,7 @@ def test_to_arrow_w_bqstorage_no_streams(self): self.assertEqual(actual_table.schema[1].name, "colC") self.assertEqual(actual_table.schema[2].name, "colB") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm_gui") @mock.patch("tqdm.tqdm_notebook") @@ -2930,6 +2954,7 @@ def test_to_dataframe_iterable_with_dtypes(self): self.assertEqual(df_2["age"][0], 33) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_iterable_w_bqstorage(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3094,6 +3119,7 @@ def test_to_dataframe(self): self.assertEqual(df.age.dtype.name, "Int64") @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): from google.cloud.bigquery.schema import SchemaField @@ -3121,6 +3147,7 @@ def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): from google.cloud.bigquery.schema import SchemaField @@ -3144,6 +3171,7 @@ def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm_gui") @mock.patch("tqdm.tqdm_notebook") @@ -3620,6 +3648,7 @@ def test_to_dataframe_w_bqstorage_no_streams(self): self.assertTrue(got.empty) 
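The hunks in this patch repeat one pattern: pyarrow is imported in a try/except at module scope, and each pyarrow-dependent test carries a matching skip decorator. In isolation the pattern looks roughly like the sketch below; the test body is only illustrative.

# Illustrative sketch of the optional-pyarrow guard applied throughout test_table.py.
import unittest

try:
    import pyarrow
except ImportError:  # pragma: NO COVER
    pyarrow = None


class TestOptionalPyarrow(unittest.TestCase):
    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
    def test_build_small_table(self):
        # Runs only when the optional pyarrow extra is installed.
        tbl = pyarrow.table({"col": [1, 2, 3]})
        self.assertEqual(tbl.num_rows, 3)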
@unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_logs_session(self): from google.cloud.bigquery.table import Table @@ -3641,6 +3670,7 @@ def test_to_dataframe_w_bqstorage_logs_session(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_empty_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3692,6 +3722,7 @@ def test_to_dataframe_w_bqstorage_empty_streams(self): self.assertTrue(got.empty) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_nonempty(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3768,6 +3799,7 @@ def test_to_dataframe_w_bqstorage_nonempty(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3818,6 +3850,7 @@ def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): self.assertTrue(got.index.is_unique) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm") def test_to_dataframe_w_bqstorage_updates_progress_bar(self, tqdm_mock): @@ -3893,6 +3926,7 @@ def blocking_to_arrow(*args, **kwargs): tqdm_mock().close.assert_called_once() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_exits_on_keyboardinterrupt(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -4062,6 +4096,7 @@ def test_to_dataframe_w_bqstorage_snapshot(self): row_iterator.to_dataframe(bqstorage_client) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut From 1fac4d2e06bcd3c99a49d882da2242d606a9c6b5 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Sun, 10 Jul 2022 21:45:21 -0700 Subject: [PATCH 05/47] replace storage checks --- docs/snippets.py | 5 ++ google/cloud/bigquery/__init__.py | 3 + google/cloud/bigquery/client.py | 16 ++++- google/cloud/bigquery/job/query.py | 4 +- google/cloud/bigquery/table.py | 28 +++++--- setup.py | 19 ++++- tests/system/test_client.py | 25 ++++++- tests/system/test_pandas.py | 2 +- tests/unit/job/test_query_pandas.py | 28 ++++++-- tests/unit/test_dbapi_connection.py | 22 +++++- tests/unit/test_dbapi_cursor.py | 27 ++++++- tests/unit/test_magics.py | 78 +++++++++++++++++++- tests/unit/test_table.py | 106 +++++++++++++++++++++++++++- 13 files changed, 337 insertions(+), 26 deletions(-) diff --git a/docs/snippets.py b/docs/snippets.py index 238fd52c3..05e4fa378 100644 --- a/docs/snippets.py +++ b/docs/snippets.py @@ -31,6 +31,11 @@ except (ImportError, AttributeError): pandas = None +try: + import pyarrow +except (ImportError, AttributeError): + 
pyarrow = None + from google.api_core.exceptions import InternalServerError from google.api_core.exceptions import ServiceUnavailable from google.api_core.exceptions import TooManyRequests diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 5a4520476..2ae30a081 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -42,6 +42,7 @@ from google.cloud.bigquery.enums import KeyResultStatementKind from google.cloud.bigquery.enums import SqlTypeNames from google.cloud.bigquery.enums import StandardSqlTypeNames +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -195,6 +196,8 @@ "WriteDisposition", # EncryptionConfiguration "EncryptionConfiguration", + # Custom exceptions + "LegacyBigQueryStorageError", ] diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 85e4b8c1e..28647d51f 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -80,6 +80,7 @@ from google.cloud.bigquery._helpers import _str_or_none from google.cloud.bigquery._helpers import _verify_job_config_type from google.cloud.bigquery._helpers import _get_bigquery_host +from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS from google.cloud.bigquery._helpers import _DEFAULT_HOST from google.cloud.bigquery._http import Connection from google.cloud.bigquery import _pandas_helpers @@ -88,6 +89,7 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import AutoRowIDs +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job from google.cloud.bigquery.job import ( @@ -535,8 +537,20 @@ def _ensure_bqstorage_client( Returns: A BigQuery Storage API client. """ - from google.cloud import bigquery_storage # type: ignore + try: + from google.cloud import bigquery_storage # type: ignore + except ImportError: + warnings.warn( + "Cannot create BigQuery Storage client, the dependency " + "google-cloud-bigquery-storage is not installed." + ) + return None + try: + BQ_STORAGE_VERSIONS.verify_version() + except LegacyBigQueryStorageError as exc: + warnings.warn(str(exc)) + return None if bqstorage_client is None: bqstorage_client = bigquery_storage.BigQueryReadClient( credentials=self._credentials, diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 7b0097c3d..599b70940 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1537,7 +1537,7 @@ def do_get_result(): def to_arrow( self, progress_bar_type: str = None, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, ) -> "pyarrow.Table": @@ -1611,7 +1611,7 @@ def to_arrow( # that should only exist here in the QueryJob method. 
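
The net effect of the `_ensure_bqstorage_client` change above is graceful degradation: a missing or outdated google-cloud-bigquery-storage installation now produces a warning and a None return instead of an unhandled ImportError. Callers are expected to tolerate the None result; a hedged usage sketch with a hypothetical table ID, not code taken from the patch:

from google.cloud import bigquery

client = bigquery.Client()
table_id = "my-project.my_dataset.my_table"  # hypothetical table

bqstorage_client = client._ensure_bqstorage_client()
if bqstorage_client is None:
    # Dependency missing or too old; the warning has already been emitted,
    # so fall back to the REST-based download.
    df = client.list_rows(table_id).to_dataframe(create_bqstorage_client=False)
else:
    df = client.list_rows(table_id).to_dataframe(bqstorage_client=bqstorage_client)
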
def to_dataframe( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 885acbe92..42729c0dc 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -54,6 +54,7 @@ import google.cloud._helpers # type: ignore from google.cloud.bigquery import _helpers from google.cloud.bigquery import _pandas_helpers +from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields @@ -67,7 +68,7 @@ import pandas import pyarrow import geopandas # type: ignore - from google.cloud import bigquery_storage + from google.cloud import bigquery_storage # type: ignore from google.cloud.bigquery.dataset import DatasetReference @@ -1592,6 +1593,17 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): if self.max_results is not None: return False + try: + from google.cloud import bigquery_storage # noqa: F401 + except ImportError: + return False + + try: + _helpers.BQ_STORAGE_VERSIONS.verify_version() + except LegacyBigQueryStorageError as exc: + warnings.warn(str(exc)) + return False + return True def _get_next_page_response(self): @@ -1628,7 +1640,7 @@ def total_rows(self): def _maybe_warn_max_results( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"], + bqstorage_client: "bigquery_storage.BigQueryReadClient", ): """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set. @@ -1718,7 +1730,7 @@ def to_arrow_iterable( def to_arrow( self, progress_bar_type: str = None, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, create_bqstorage_client: bool = True, ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a @@ -1826,7 +1838,7 @@ def to_arrow( def to_dataframe_iterable( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, dtypes: Dict[str, Any] = None, max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore ) -> "pandas.DataFrame": @@ -1902,7 +1914,7 @@ def to_dataframe_iterable( # changes to job.QueryJob.to_dataframe() def to_dataframe( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, @@ -2263,7 +2275,7 @@ def to_geodataframe( def to_dataframe_iterable( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, dtypes: Optional[Dict[str, Any]] = None, max_queue_size: Optional[int] = None, ) -> Iterator["pandas.DataFrame"]: @@ -2284,7 +2296,7 @@ def to_dataframe_iterable( Returns: An iterator yielding a single empty :class:`~pandas.DataFrame`. - Raises: + Raises: ValueError: If the :mod:`pandas` library cannot be imported. 
""" @@ -2293,7 +2305,7 @@ def to_dataframe_iterable( def to_arrow_iterable( self, - bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, + bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, max_queue_size: Optional[int] = None, ) -> Iterator["pyarrow.RecordBatch"]: """Create an iterable of pandas DataFrames, to process the table as a stream. diff --git a/setup.py b/setup.py index 32479d8f7..355ffa2d2 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ # Until this issue is closed # https://github.com/googleapis/google-cloud-python/issues/10566 "google-api-core[grpc] >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0", - "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", + # "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", "proto-plus >= 1.15.0, <2.0.0dev", # NOTE: Maintainers, please do not require google-cloud-core>=2.x.x # Until this issue is closed @@ -49,7 +49,17 @@ extras = { # Keep the no-op bqstorage extra for backward compatibility. # See: https://github.com/googleapis/python-bigquery/issues/757 - "bqstorage": [], + "bqstorage": [ + "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", + # Due to an issue in pip's dependency resolver, the `grpc` extra is not + # installed, even though `google-cloud-bigquery-storage` specifies it + # as `google-api-core[grpc]`. We thus need to explicitly specify it here. + # See: https://github.com/googleapis/python-bigquery/issues/83 The + # grpc.Channel.close() method isn't added until 1.32.0. + # https://github.com/grpc/grpc/pull/15254 + "grpcio >= 1.38.1, < 2.0dev", + "pyarrow >= 1.0.0, < 5.0dev", + ], "pandas": [ "pandas>=1.0.0", "pyarrow == 5.0.0, < 9.0dev", @@ -69,6 +79,11 @@ all_extras = [] for extra in extras: + # Exclude this extra from all to avoid overly strict dependencies on core + # libraries such as pyarrow. + # https://github.com/googleapis/python-bigquery/issues/563 + if extra in {"bignumeric_type"}: + continue all_extras.extend(extras[extra]) extras["all"] = all_extras diff --git a/tests/system/test_client.py b/tests/system/test_client.py index c99ee1c72..881aa6ef5 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -37,19 +37,17 @@ from google.api_core.exceptions import TooManyRequests from google.api_core.iam import Policy from google.cloud import bigquery +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.table import Table from google.cloud._helpers import UTC from google.cloud.bigquery import dbapi, enums -from google.cloud import bigquery_storage from google.cloud import storage from google.cloud.datacatalog_v1 import types as datacatalog_types from google.cloud.datacatalog_v1 import PolicyTagManagerClient import psutil import pytest -import pyarrow -import pyarrow.types from test_utils.retry import RetryErrors from test_utils.retry import RetryInstanceState from test_utils.retry import RetryResult @@ -57,6 +55,16 @@ from . 
import helpers +try: + from google.cloud import bigquery_storage +except ImportError: # pragma: NO COVER + bigquery_storage = None + +try: + import pyarrow + import pyarrow.types +except ImportError: # pragma: NO COVER + pyarrow = None JOB_TIMEOUT = 120 # 2 minutes DATA_PATH = pathlib.Path(__file__).parent.parent / "data" @@ -1473,6 +1481,10 @@ def test_dbapi_fetchall_from_script(self): row_tuples = [r.values() for r in rows] self.assertEqual(row_tuples, [(5, "foo"), (6, "bar"), (7, "baz")]) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_dbapi_fetch_w_bqstorage_client_large_result_set(self): bqstorage_client = bigquery_storage.BigQueryReadClient( credentials=Config.CLIENT._credentials @@ -1531,6 +1543,9 @@ def test_dbapi_dry_run_query(self): self.assertEqual(list(rows), []) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_dbapi_connection_does_not_leak_sockets(self): current_process = psutil.Process() conn_count_start = len(current_process.connections()) @@ -1998,6 +2013,10 @@ def test_create_table_rows_fetch_nested_schema(self): self.assertEqual(found[7], e_favtime) self.assertEqual(found[8], decimal.Decimal(expected["FavoriteNumber"])) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_nested_table_to_arrow(self): from google.cloud.bigquery.job import SourceFormat from google.cloud.bigquery.job import WriteDisposition diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 34e4243c4..1e177b129 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -27,7 +27,7 @@ import pytest from google.cloud import bigquery -from google.cloud import bigquery_storage +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery import enums from . 
import helpers diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 84aab3aca..306050f5b 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -17,13 +17,20 @@ import json import mock -import pyarrow import pytest -from google.cloud import bigquery_storage import google.cloud.bigquery_storage_v1.reader import google.cloud.bigquery_storage_v1.services.big_query_read.client +try: + import pyarrow +except (ImportError, AttributeError): # pragma: NO COVER + pyarrow = None +try: + from google.cloud import bigquery_storage +except (ImportError, AttributeError): # pragma: NO COVER + bigquery_storage = None + try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -89,6 +96,9 @@ def test__contains_order_by(query, expected): assert not mut._contains_order_by(query) +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) @pytest.mark.parametrize( "query", ( @@ -179,6 +189,7 @@ def test_to_dataframe_bqstorage_preserve_order(query, table_read_options_kwarg): ) +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") def test_to_arrow(): from google.cloud.bigquery.job import QueryJob as target_class @@ -265,6 +276,7 @@ def test_to_arrow(): ] +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") def test_to_arrow_max_results_no_progress_bar(): from google.cloud.bigquery import table from google.cloud.bigquery.job import QueryJob as target_class @@ -300,6 +312,7 @@ def test_to_arrow_max_results_no_progress_bar(): assert tbl.num_rows == 2 +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_arrow_w_tqdm_w_query_plan(): from google.cloud.bigquery import table @@ -355,7 +368,7 @@ def test_to_arrow_w_tqdm_w_query_plan(): timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None ) - +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_arrow_w_tqdm_w_pending_status(): from google.cloud.bigquery import table @@ -407,7 +420,7 @@ def test_to_arrow_w_tqdm_w_pending_status(): timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None ) - +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_arrow_w_tqdm_wo_query_plan(): from google.cloud.bigquery import table @@ -510,6 +523,9 @@ def test_to_dataframe_ddl_query(): assert len(df) == 0 +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test_to_dataframe_bqstorage(table_read_options_kwarg): from google.cloud.bigquery.job import QueryJob as target_class @@ -584,6 +600,9 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg): bqstorage_client.read_rows.assert_called_once_with(stream_id) +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test_to_dataframe_bqstorage_no_pyarrow_compression(): from google.cloud.bigquery.job import QueryJob as target_class @@ -629,6 +648,7 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression(): ) +@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") def test_to_dataframe_column_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class diff --git a/tests/unit/test_dbapi_connection.py b/tests/unit/test_dbapi_connection.py index e96ab55d7..e18ff1e4a 100644 --- 
a/tests/unit/test_dbapi_connection.py +++ b/tests/unit/test_dbapi_connection.py @@ -17,7 +17,10 @@ import mock -from google.cloud import bigquery_storage +try: + from google.cloud import bigquery_storage +except ImportError: # pragma: NO COVER + bigquery_storage = None class TestConnection(unittest.TestCase): @@ -37,11 +40,16 @@ def _mock_client(self): return mock_client def _mock_bqstorage_client(self): + # Assumption: bigquery_storage exists. It's the test's responisbility to + # not use this helper or skip itself if bqstroage is not installed. mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) mock_client._transport = mock.Mock(spec=["channel"]) mock_client._transport.grpc_channel = mock.Mock(spec=["close"]) return mock_client + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_ctor_wo_bqstorage_client(self): from google.cloud.bigquery.dbapi import Connection @@ -82,6 +90,9 @@ def test_connect_wo_client(self, mock_client): self.assertIsNotNone(connection._client) self.assertIsNotNone(connection._bqstorage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_connect_w_client(self): from google.cloud.bigquery.dbapi import connect from google.cloud.bigquery.dbapi import Connection @@ -97,6 +108,9 @@ def test_connect_w_client(self): self.assertIs(connection._client, mock_client) self.assertIs(connection._bqstorage_client, mock_bqstorage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_connect_w_both_clients(self): from google.cloud.bigquery.dbapi import connect from google.cloud.bigquery.dbapi import Connection @@ -130,6 +144,9 @@ def test_raises_error_if_closed(self): ): getattr(connection, method)() + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_close_closes_all_created_bigquery_clients(self): client = self._mock_client() bqstorage_client = self._mock_bqstorage_client() @@ -152,6 +169,9 @@ def test_close_closes_all_created_bigquery_clients(self): self.assertTrue(client.close.called) self.assertTrue(bqstorage_client._transport.grpc_channel.close.called) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_close_does_not_close_bigquery_clients_passed_to_it(self): client = self._mock_client() bqstorage_client = self._mock_bqstorage_client() diff --git a/tests/unit/test_dbapi_cursor.py b/tests/unit/test_dbapi_cursor.py index d672c0f6c..24b9b987c 100644 --- a/tests/unit/test_dbapi_cursor.py +++ b/tests/unit/test_dbapi_cursor.py @@ -18,8 +18,17 @@ import pytest +try: + import pyarrow +except ImportError: # pragma: NO COVER + pyarrow = None + from google.api_core import exceptions -from google.cloud import bigquery_storage + +try: + from google.cloud import bigquery_storage +except ImportError: # pragma: NO COVER + bigquery_storage = None from tests.unit.helpers import _to_pyarrow @@ -269,6 +278,10 @@ def test_fetchall_w_row(self): self.assertEqual(len(rows), 1) self.assertEqual(rows[0], (1,)) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_success(self): from google.cloud.bigquery import dbapi from google.cloud.bigquery import table @@ -322,6 +335,10 @@ def test_fetchall_w_bqstorage_client_fetch_success(self): self.assertEqual(sorted_row_data, 
expected_row_data) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_no_rows(self): from google.cloud.bigquery import dbapi @@ -344,6 +361,10 @@ def test_fetchall_w_bqstorage_client_fetch_no_rows(self): # check the data returned self.assertEqual(rows, []) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_error_no_fallback(self): from google.cloud.bigquery import dbapi from google.cloud.bigquery import table @@ -375,6 +396,10 @@ def fake_ensure_bqstorage_client(bqstorage_client=None, **kwargs): # the default client was not used mock_client.list_rows.assert_not_called() + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_no_arrow_compression(self): from google.cloud.bigquery import dbapi from google.cloud.bigquery import table diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index ea8fe568f..064cdd069 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -76,6 +76,19 @@ def ipython_ns_cleanup(): del ip.user_ns[name] +@pytest.fixture(scope="session") +def missing_bq_storage(): + """Provide a patcher that can make the bigquery storage import to fail.""" + + def fail_if(name, globals, locals, fromlist, level): + # NOTE: *very* simplified, assuming a straightforward absolute import + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist + ) + + return maybe_fail_import(predicate=fail_if) + + @pytest.fixture(scope="session") def missing_grpcio_lib(): """Provide a patcher that can make the gapic library import to fail.""" @@ -311,6 +324,9 @@ def test__make_bqstorage_client_false(): assert got is None +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test__make_bqstorage_client_true(): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True @@ -322,6 +338,54 @@ def test__make_bqstorage_client_true(): assert isinstance(got, bigquery_storage.BigQueryReadClient) +def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage): + credentials_mock = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + test_client = bigquery.Client( + project="test_project", credentials=credentials_mock, location="test_location" + ) + + with pytest.raises(ImportError) as exc_context, missing_bq_storage: + magics._make_bqstorage_client(test_client, True, {}) + + error_msg = str(exc_context.value) + assert "google-cloud-bigquery-storage" in error_msg + assert "pyarrow" in error_msg + + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) +def test__make_bqstorage_client_true_obsolete_dependency(): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + credentials_mock = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + test_client = bigquery.Client( + project="test_project", credentials=credentials_mock, location="test_location" + ) + + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), 
+ ) + with patcher, warnings.catch_warnings(record=True) as warned: + got = magics._make_bqstorage_client(test_client, True, {}) + + assert got is None + + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." + + +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) + @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib): credentials_mock = mock.create_autospec( @@ -377,6 +441,9 @@ def test_extension_load(): @pytest.mark.usefixtures("ipython_interactive") @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test_bigquery_magic_without_optional_arguments(monkeypatch): ip = IPython.get_ipython() ip.extension_manager.load_extension("google.cloud.bigquery") @@ -539,9 +606,10 @@ def test_bigquery_magic_clears_display_in_non_verbose_mode(): @pytest.mark.usefixtures("ipython_interactive") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test_bigquery_magic_with_bqstorage_from_argument(monkeypatch): - pandas = pytest.importorskip("pandas") - ip = IPython.get_ipython() ip.extension_manager.load_extension("google.cloud.bigquery") mock_credentials = mock.create_autospec( @@ -604,6 +672,9 @@ def warning_match(warning): @pytest.mark.usefixtures("ipython_interactive") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) def test_bigquery_magic_with_rest_client_requested(monkeypatch): pandas = pytest.importorskip("pandas") @@ -831,6 +902,9 @@ def test_bigquery_magic_w_table_id_and_destination_var(ipython_ns_cleanup): @pytest.mark.usefixtures("ipython_interactive") +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_bigquery_magic_w_table_id_and_bqstorage_client(): ip = IPython.get_ipython() diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 9f2ca96da..628cb19aa 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -25,10 +25,19 @@ import pytest import google.api_core.exceptions +from test_utils.imports import maybe_fail_import from google.cloud.bigquery.table import TableReference -from google.cloud import bigquery_storage +try: + from google.cloud import bigquery_storage + from google.cloud.bigquery_storage_v1.services.big_query_read.transports import ( + grpc as big_query_read_grpc_transport, + ) +except ImportError: # pragma: NO COVER + bigquery_storage = None + big_query_read_grpc_transport = None + from google.cloud.bigquery_storage_v1.services.big_query_read.transports import ( grpc as big_query_read_grpc_transport, ) @@ -2201,6 +2210,49 @@ def test__validate_bqstorage_returns_false_if_max_results_set(self): ) self.assertFalse(result) + def test__validate_bqstorage_returns_false_if_missing_dependency(self): + iterator = self._make_one(first_page_response=None) # not cached + + def fail_bqstorage_import(name, globals, locals, fromlist, level): + # NOTE: *very* simplified, assuming a straightforward absolute import + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist + ) + + no_bqstorage = 
maybe_fail_import(predicate=fail_bqstorage_import) + + with no_bqstorage: + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + + self.assertFalse(result) + + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test__validate_bqstorage_returns_false_w_warning_if_obsolete_version(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + iterator = self._make_one(first_page_response=None) # not cached + + patcher = mock.patch( + "google.cloud.bigquery.table._helpers.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + ) + with patcher, warnings.catch_warnings(record=True) as warned: + result = iterator._validate_bqstorage( + bqstorage_client=None, create_bqstorage_client=True + ) + + self.assertFalse(result) + + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." + + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_iterable(self): from google.cloud.bigquery.schema import SchemaField @@ -2613,6 +2665,10 @@ def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): ) mock_client._ensure_bqstorage_client.assert_not_called() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -2650,6 +2706,9 @@ def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_w_bqstorage(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2728,6 +2787,9 @@ def test_to_arrow_w_bqstorage(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_w_bqstorage_creates_client(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2783,6 +2845,9 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): self.assertEqual(tbl.num_rows, 2) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_w_bqstorage_no_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -2954,6 +3019,9 @@ def test_to_dataframe_iterable_with_dtypes(self): self.assertEqual(df_2["age"][0], 33) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_iterable_w_bqstorage(self): from google.cloud.bigquery import schema @@ -3594,6 +3662,9 @@ def test_to_dataframe_max_results_w_create_bqstorage_client_no_warning(self): mock_client._ensure_bqstorage_client.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is 
None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_dataframe_w_bqstorage_creates_client(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3622,6 +3693,9 @@ def test_to_dataframe_w_bqstorage_creates_client(self): bqstorage_client._transport.grpc_channel.close.assert_called_once() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_dataframe_w_bqstorage_no_streams(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -3647,6 +3721,9 @@ def test_to_dataframe_w_bqstorage_no_streams(self): self.assertEqual(list(got), column_names) self.assertTrue(got.empty) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_logs_session(self): @@ -3670,6 +3747,9 @@ def test_to_dataframe_w_bqstorage_logs_session(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_empty_streams(self): from google.cloud.bigquery import schema @@ -3722,6 +3802,9 @@ def test_to_dataframe_w_bqstorage_empty_streams(self): self.assertTrue(got.empty) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_nonempty(self): from google.cloud.bigquery import schema @@ -3799,6 +3882,9 @@ def test_to_dataframe_w_bqstorage_nonempty(self): bqstorage_client._transport.grpc_channel.close.assert_not_called() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): from google.cloud.bigquery import schema @@ -3926,6 +4012,9 @@ def blocking_to_arrow(*args, **kwargs): tqdm_mock().close.assert_called_once() @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_w_bqstorage_exits_on_keyboardinterrupt(self): from google.cloud.bigquery import schema @@ -4043,6 +4132,9 @@ def test_to_dataframe_tabledata_list_w_multiple_pages_return_unique_index(self): self.assertTrue(df.index.is_unique) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_dataframe_w_bqstorage_raises_auth_error(self): from google.cloud.bigquery import table as mut @@ -4061,6 +4153,9 @@ def test_to_dataframe_w_bqstorage_raises_auth_error(self): with pytest.raises(google.api_core.exceptions.Forbidden): row_iterator.to_dataframe(bqstorage_client=bqstorage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_dataframe_w_bqstorage_partition(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -4078,6 +4173,9 @@ def 
test_to_dataframe_w_bqstorage_partition(self): with pytest.raises(ValueError): row_iterator.to_dataframe(bqstorage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_dataframe_w_bqstorage_snapshot(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut @@ -4096,6 +4194,9 @@ def test_to_dataframe_w_bqstorage_snapshot(self): row_iterator.to_dataframe(bqstorage_client) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): from google.cloud.bigquery import schema @@ -4799,6 +4900,9 @@ def test_set_expiration_w_none(self): assert time_partitioning._properties["expirationMs"] is None +@pytest.mark.skipif( + bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" +) @pytest.mark.parametrize( "table_path", ( From 61c69e9c6c5e3f80a031ea6b3c081d94fb54ff48 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 11 Jul 2022 11:52:01 -0700 Subject: [PATCH 06/47] update tests --- tests/unit/job/test_query_pandas.py | 8 ++- tests/unit/test__helpers.py | 1 + tests/unit/test__pandas_helpers.py | 9 +++- tests/unit/test_client.py | 82 ++++++++++++++++++++++++++++- tests/unit/test_dbapi_connection.py | 8 +-- tests/unit/test_magics.py | 1 - tests/unit/test_table.py | 7 ++- 7 files changed, 103 insertions(+), 13 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 306050f5b..bcdc96f40 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -19,8 +19,8 @@ import mock import pytest -import google.cloud.bigquery_storage_v1.reader -import google.cloud.bigquery_storage_v1.services.big_query_read.client +# import google.cloud.bigquery_storage_v1.reader +# import google.cloud.bigquery_storage_v1.services.big_query_read.client try: import pyarrow @@ -28,6 +28,8 @@ pyarrow = None try: from google.cloud import bigquery_storage + import google.cloud.bigquery_storage_v1.reader + import google.cloud.bigquery_storage_v1.services.big_query_read.client except (ImportError, AttributeError): # pragma: NO COVER bigquery_storage = None @@ -368,6 +370,7 @@ def test_to_arrow_w_tqdm_w_query_plan(): timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None ) + @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_arrow_w_tqdm_w_pending_status(): @@ -420,6 +423,7 @@ def test_to_arrow_w_tqdm_w_pending_status(): timeout=_PROGRESS_BAR_UPDATE_INTERVAL, max_results=None ) + @pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_arrow_w_tqdm_wo_query_plan(): diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index c2219a56a..842af6d55 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -25,6 +25,7 @@ bigquery_storage = None +@unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): def tearDown(self): from google.cloud.bigquery import _helpers diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 5780fb9b6..72957ab9b 100644 --- 
a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -46,12 +46,19 @@ import pytest from google import api_core -from google.cloud import bigquery_storage + from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT +try: + from google.cloud import bigquery_storage + + _helpers.BQ_STORAGE_VERSIONS.verify_version() +except ImportError: # pragma: NO COVER + bigquery_storage = None + PANDAS_MINIUM_VERSION = pkg_resources.parse_version("1.0.0") if pandas is not None: diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 7f3bb6032..fe275d06f 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -63,11 +63,16 @@ from google.api_core import client_info import google.cloud._helpers from google.cloud import bigquery -from google.cloud import bigquery_storage + from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.retry import DEFAULT_TIMEOUT from google.cloud.bigquery import ParquetOptions +try: + from google.cloud import bigquery_storage +except (ImportError, AttributeError): # pragma: NO COVER + bigquery_storage = None +from test_utils.imports import maybe_fail_import from tests.unit.helpers import make_connection PANDAS_MINIUM_VERSION = pkg_resources.parse_version("1.0.0") @@ -620,6 +625,9 @@ def test_get_dataset(self): self.assertEqual(dataset.dataset_id, self.DS_ID) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_ensure_bqstorage_client_creating_new_instance(self): mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) mock_client_instance = object() @@ -642,6 +650,55 @@ def test_ensure_bqstorage_client_creating_new_instance(self): client_info=mock.sentinel.client_info, ) + def test_ensure_bqstorage_client_missing_dependency(self): + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + def fail_bqstorage_import(name, globals, locals, fromlist, level): + # NOTE: *very* simplified, assuming a straightforward absolute import + return "bigquery_storage" in name or ( + fromlist is not None and "bigquery_storage" in fromlist + ) + + no_bqstorage = maybe_fail_import(predicate=fail_bqstorage_import) + + with no_bqstorage, warnings.catch_warnings(record=True) as warned: + bqstorage_client = client._ensure_bqstorage_client() + + self.assertIsNone(bqstorage_client) + matching_warnings = [ + warning + for warning in warned + if "not installed" in str(warning) + and "google-cloud-bigquery-storage" in str(warning) + ] + assert matching_warnings, "Missing dependency warning not raised." + + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_ensure_bqstorage_client_obsolete_dependency(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + ) + with patcher, warnings.catch_warnings(record=True) as warned: + bqstorage_client = client._ensure_bqstorage_client() + + self.assertIsNone(bqstorage_client) + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." 
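
Both the client tests and the table tests rely on `maybe_fail_import` from the shared `test_utils` package to simulate an uninstalled dependency without changing the environment. Its behavior amounts to a conditional patch of `builtins.__import__`; the following is only an illustrative sketch under that assumption, with a hypothetical helper name:

import builtins
from unittest import mock


def fail_imports_matching(predicate):
    """Return a patcher that raises ImportError for imports matching ``predicate``."""
    real_import = builtins.__import__

    def guarded_import(name, globals=None, locals=None, fromlist=(), level=0):
        if predicate(name, globals, locals, fromlist, level):
            raise ImportError("simulated missing dependency: " + name)
        return real_import(name, globals, locals, fromlist, level)

    return mock.patch("builtins.__import__", new=guarded_import)


# Usage mirrors the tests above, e.g.:
#   with fail_imports_matching(lambda name, *args: "bigquery_storage" in name):
#       client._ensure_bqstorage_client()  # warns and returns None
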
+ + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_ensure_bqstorage_client_existing_client_check_passes(self): creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) @@ -653,6 +710,29 @@ def test_ensure_bqstorage_client_existing_client_check_passes(self): self.assertIs(bqstorage_client, mock_storage_client) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) + def test_ensure_bqstorage_client_existing_client_check_fails(self): + from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError + + creds = _make_credentials() + client = self._make_one(project=self.PROJECT, credentials=creds) + mock_storage_client = mock.sentinel.mock_storage_client + + patcher = mock.patch( + "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", + side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + ) + with patcher, warnings.catch_warnings(record=True) as warned: + bqstorage_client = client._ensure_bqstorage_client(mock_storage_client) + + self.assertIsNone(bqstorage_client) + matching_warnings = [ + warning for warning in warned if "BQ Storage too old" in str(warning) + ] + assert matching_warnings, "Obsolete dependency warning not raised." + def test_create_routine_w_minimal_resource(self): from google.cloud.bigquery.routine import Routine from google.cloud.bigquery.routine import RoutineReference diff --git a/tests/unit/test_dbapi_connection.py b/tests/unit/test_dbapi_connection.py index e18ff1e4a..67777f923 100644 --- a/tests/unit/test_dbapi_connection.py +++ b/tests/unit/test_dbapi_connection.py @@ -41,15 +41,12 @@ def _mock_client(self): def _mock_bqstorage_client(self): # Assumption: bigquery_storage exists. It's the test's responisbility to - # not use this helper or skip itself if bqstroage is not installed. + # not use this helper or skip itself if bqstorage is not installed. 
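
The warning assertions repeated in these tests share one idiom: record every warning, filter by a distinctive substring, and assert that at least one matched. A generic, self-contained sketch of the idiom (the warning text here is illustrative):

import warnings


def test_obsolete_dependency_warning():
    with warnings.catch_warnings(record=True) as warned:
        warnings.simplefilter("always")
        warnings.warn("BQ Storage too old")  # stands in for the code under test

    matching = [w for w in warned if "BQ Storage too old" in str(w.message)]
    assert matching, "Obsolete dependency warning not raised."
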
mock_client = mock.create_autospec(bigquery_storage.BigQueryReadClient) mock_client._transport = mock.Mock(spec=["channel"]) mock_client._transport.grpc_channel = mock.Mock(spec=["close"]) return mock_client - @unittest.skipIf( - bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" - ) def test_ctor_wo_bqstorage_client(self): from google.cloud.bigquery.dbapi import Connection @@ -61,6 +58,9 @@ def test_ctor_wo_bqstorage_client(self): self.assertIs(connection._client, mock_client) self.assertIs(connection._bqstorage_client, None) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_ctor_w_bqstorage_client(self): from google.cloud.bigquery.dbapi import Connection diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 064cdd069..5d09ec99b 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -385,7 +385,6 @@ def test__make_bqstorage_client_true_obsolete_dependency(): @pytest.mark.skipif( bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) - @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib): credentials_mock = mock.create_autospec( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 628cb19aa..c28562358 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -38,10 +38,6 @@ bigquery_storage = None big_query_read_grpc_transport = None -from google.cloud.bigquery_storage_v1.services.big_query_read.transports import ( - grpc as big_query_read_grpc_transport, -) - try: import pyarrow import pyarrow.types @@ -2353,6 +2349,9 @@ def test_to_arrow_iterable(self): [[{"name": "Bepples Phlyntstone", "age": 0}, {"name": "Dino", "age": 4}]], ) + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_iterable_w_bqstorage(self): from google.cloud.bigquery import schema from google.cloud.bigquery import table as mut From e31e4ef7bb927c5769daeef21dc3bab6e044f35f Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 11 Jul 2022 13:34:34 -0700 Subject: [PATCH 07/47] update tests --- google/cloud/bigquery/client.py | 2 +- google/cloud/bigquery/table.py | 4 ++-- setup.py | 2 +- tests/system/test_client.py | 3 +-- tests/system/test_pandas.py | 12 +++++++++--- tests/unit/test_table.py | 19 +++++++++++++------ 6 files changed, 27 insertions(+), 15 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 28647d51f..5c2ff1a97 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -538,7 +538,7 @@ def _ensure_bqstorage_client( A BigQuery Storage API client. """ try: - from google.cloud import bigquery_storage # type: ignore + from google.cloud import bigquery_storage # type: ignore except ImportError: warnings.warn( "Cannot create BigQuery Storage client, the dependency " diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 42729c0dc..27da6ae17 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1603,7 +1603,7 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): except LegacyBigQueryStorageError as exc: warnings.warn(str(exc)) return False - + return True def _get_next_page_response(self): @@ -2296,7 +2296,7 @@ def to_dataframe_iterable( Returns: An iterator yielding a single empty :class:`~pandas.DataFrame`. 
- Raises: + Raises: ValueError: If the :mod:`pandas` library cannot be imported. """ diff --git a/setup.py b/setup.py index 4e1ee479c..d4de1beb9 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ # grpc.Channel.close() method isn't added until 1.32.0. # https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.38.1, < 2.0dev", - "pyarrow >= 1.0.0, < 5.0dev", + "pyarrow == 5.0.0, < 9.0dev", ], "pandas": [ "pandas>=1.0.0", diff --git a/tests/system/test_client.py b/tests/system/test_client.py index 881aa6ef5..ec5e1ebba 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -37,7 +37,6 @@ from google.api_core.exceptions import TooManyRequests from google.api_core.iam import Policy from google.cloud import bigquery -from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery.table import Table @@ -56,7 +55,7 @@ from . import helpers try: - from google.cloud import bigquery_storage + from google.cloud import bigquery_storage except ImportError: # pragma: NO COVER bigquery_storage = None diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 1e177b129..609138deb 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -36,6 +36,9 @@ pandas = pytest.importorskip("pandas", minversion="0.23.0") numpy = pytest.importorskip("numpy") +bigquery_storage = pytest.importorskip( + "google.cloud.bigquery_storage", minversion="2.0.0" +) PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version PANDAS_INT64_VERSION = pkg_resources.parse_version("1.0.0") @@ -373,11 +376,12 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): bigquery.SchemaField("geo_col", "GEOGRAPHY"), bigquery.SchemaField("int_col", "INTEGER"), bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("bignum_col", "BIGNUMERIC"), bigquery.SchemaField("str_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) + if _BIGNUMERIC_SUPPORT: + table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) num_rows = 100 nulls = [None] * num_rows @@ -390,11 +394,12 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): ("geo_col", nulls), ("int_col", nulls), ("num_col", nulls), - ("bignum_col", nulls), ("str_col", nulls), ("time_col", nulls), ("ts_col", nulls), ] + if _BIGNUMERIC_SUPPORT: + df_data.append(("bignum_col", nulls)) df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) @@ -469,11 +474,12 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id bigquery.SchemaField("geo_col", "GEOGRAPHY"), bigquery.SchemaField("int_col", "INTEGER"), bigquery.SchemaField("num_col", "NUMERIC"), - bigquery.SchemaField("bignum_col", "BIGNUMERIC"), bigquery.SchemaField("str_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) + if _BIGNUMERIC_SUPPORT: + table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) df_data = [ ("row_num", [1, 2, 3]), diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index c28562358..1cdb4af09 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -64,6 +64,8 @@ from google.cloud.bigquery.dataset import DatasetReference +PYARROW_TIMESTAMP_VERSION = pkg_resources.parse_version("2.0.0") + def _mock_client(): from 
google.cloud.bigquery import client @@ -2349,6 +2351,7 @@ def test_to_arrow_iterable(self): [[{"name": "Bepples Phlyntstone", "age": 0}, {"name": "Dino", "age": 4}]], ) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) @@ -2624,6 +2627,9 @@ def test_to_arrow_w_empty_table(self): self.assertEqual(child_field.type.value_type[1].name, "age") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -2664,10 +2670,6 @@ def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): ) mock_client._ensure_bqstorage_client.assert_not_called() - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - @unittest.skipIf( - bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" - ) def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): from google.cloud.bigquery.schema import SchemaField @@ -3201,7 +3203,10 @@ def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): df = row_iterator.to_dataframe(create_bqstorage_client=False) - tzinfo = datetime.timezone.utc + tzinfo = None + if PYARROW_VERSION >= PYARROW_TIMESTAMP_VERSION: + tzinfo = datetime.timezone.utc + self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(len(df), 2) # verify the number of rows self.assertEqual(list(df.columns), ["some_timestamp"]) @@ -3238,7 +3243,6 @@ def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): ) @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm_gui") @mock.patch("tqdm.tqdm_notebook") @@ -3935,6 +3939,9 @@ def test_to_dataframe_w_bqstorage_multiple_streams_return_unique_index(self): self.assertTrue(got.index.is_unique) @unittest.skipIf(pandas is None, "Requires `pandas`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm") From b4f7160c5179ce245b73c7a26f3edc8123c95023 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 11 Jul 2022 13:58:27 -0700 Subject: [PATCH 08/47] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d4de1beb9..2cb30e917 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ "db-dtypes>=0.3.0,<2.0.0dev", ], "bignumeric_type": ["pyarrow == 5.0.0, < 9.0dev"], - # "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ From 2bb64614e9b2795d6ffb9b3001074033d3e56fa7 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 11 Jul 2022 21:05:39 -0700 Subject: [PATCH 09/47] update system tests --- tests/system/test_pandas.py | 19 +++++++++++-------- tests/unit/test_dbapi_cursor.py | 2 -- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 609138deb..e5dbe8719 100644 --- a/tests/system/test_pandas.py +++ 
b/tests/system/test_pandas.py @@ -508,14 +508,6 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id decimal.Decimal("99999999999999999999999999999.999999999"), ], ), - ( - "bignum_col", - [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - None, - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ], - ), ("str_col", ["abc", None, "def"]), ( "time_col", @@ -532,6 +524,17 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id ], ), ] + if _BIGNUMERIC_SUPPORT: + df_data.append( + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], + ) + ) df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) diff --git a/tests/unit/test_dbapi_cursor.py b/tests/unit/test_dbapi_cursor.py index 24b9b987c..b550bbce0 100644 --- a/tests/unit/test_dbapi_cursor.py +++ b/tests/unit/test_dbapi_cursor.py @@ -338,7 +338,6 @@ def test_fetchall_w_bqstorage_client_fetch_success(self): @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_no_rows(self): from google.cloud.bigquery import dbapi @@ -364,7 +363,6 @@ def test_fetchall_w_bqstorage_client_fetch_no_rows(self): @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_fetchall_w_bqstorage_client_fetch_error_no_fallback(self): from google.cloud.bigquery import dbapi from google.cloud.bigquery import table From 3a87275120e039bfbacd9559693d00b6762ff3b6 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:18:09 -0700 Subject: [PATCH 10/47] update verify_pandas_imports --- google/cloud/bigquery/_pandas_helpers.py | 8 +++++++- setup.py | 4 ++-- testing/constraints-3.6.txt | 2 +- testing/constraints-3.7.txt | 2 +- testing/constraints-3.9.txt | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 5d237815e..cdfd689f3 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -49,8 +49,11 @@ try: import pyarrow # type: ignore import pyarrow.parquet # type: ignore -except ImportError: # pragma: NO COVER + + pyarrow_import_exception = None +except ImportError as exc: # pragma: NO COVER pyarrow = None + pyarrow_import_exception = exc try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` @@ -99,6 +102,7 @@ def _to_wkb(v): _NO_PANDAS_ERROR = "Please install the 'pandas' package to use this function." _NO_DB_TYPES_ERROR = "Please install the 'db-dtypes' package to use this function." +_NO_PYARROW_ERROR = "Please install the 'pyarrow' package to use this function." _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", @@ -1014,6 +1018,8 @@ def dataframe_to_json_generator(dataframe): def verify_pandas_imports(): + if pyarrow is None: + raise ValueError(_NO_PYARROW_ERROR) from pyarrow_import_exception if pandas is None: raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: diff --git a/setup.py b/setup.py index 2cb30e917..7592ae966 100644 --- a/setup.py +++ b/setup.py @@ -58,11 +58,11 @@ # grpc.Channel.close() method isn't added until 1.32.0. 
# https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.38.1, < 2.0dev", - "pyarrow == 5.0.0, < 9.0dev", + "pyarrow >= 5.0.0, < 9.0dev", ], "pandas": [ "pandas>=1.0.0", - "pyarrow == 5.0.0, < 9.0dev", + "pyarrow >= 5.0.0, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], "bignumeric_type": ["pyarrow == 5.0.0, < 9.0dev"], diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index c6dce2259..dc7295134 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -21,7 +21,7 @@ proto-plus==1.15.0 protobuf==3.12.0 pyarrow==5.0.0 python-dateutil==2.7.2 -pyarrow==5.0.0 +pyarrow>=5.0.0 requests==2.18.0 Shapely==1.6.0 six==1.13.0 diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 38b88208e..612126904 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -19,7 +19,7 @@ opentelemetry-sdk==1.1.0 pandas==1.1.0 proto-plus==1.15.0 protobuf==3.12.0 -pyarrow==5.0.0 +pyarrow>=5.0.0 python-dateutil==2.7.3 requests==2.18.0 Shapely==1.6.4.post2 diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 33798cac5..8224e563a 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,4 +4,4 @@ # # NOTE: Not comprehensive yet, will eventually be maintained semi-automatically by # the renovate bot. -pyarrow==5.0.0 +pyarrow>=5.0.0 From e0a9a2a35300327d1bd007587867c46e3eaad248 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:47:02 -0700 Subject: [PATCH 11/47] add pyarrow guards --- tests/unit/test__helpers.py | 34 ++++++++++++++++++++++++++++++ tests/unit/test__pandas_helpers.py | 26 +++++++++++++---------- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 842af6d55..4fb86f665 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -24,6 +24,11 @@ except ImportError: # pragma: NO COVER bigquery_storage = None +try: + import pyarrow +except ImportError: # pragma: NO COVER + pyarrow = None + @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): @@ -95,6 +100,7 @@ def test_is_read_session_optional_false(self): assert not versions.is_read_session_optional +@unittest.skipIf(pyarrow is None, "Requires `pyarrow`") class TestPyarrowVersions(unittest.TestCase): def tearDown(self): from google.cloud.bigquery import _helpers @@ -107,6 +113,34 @@ def _object_under_test(self): return _helpers.PyarrowVersions() + def _call_try_import(self, **kwargs): + from google.cloud.bigquery import _helpers + + _helpers.PYARROW_VERSIONS._installed_version = None + return _helpers.PYARROW_VERSIONS.try_import(**kwargs) + + def test_try_import_raises_no_error_w_recent_pyarrow(self): + from google.cloud.bigquery.exceptions import LegacyPyarrowError + + with mock.patch("pyarrow.__version__", new="5.0.0"): + try: + pyarrow = self._call_try_import(raise_if_error=True) + self.assertIsNotNone(pyarrow) + except LegacyPyarrowError: # pragma: NO COVER + self.fail("Legacy error raised with a non-legacy dependency version.") + + def test_try_import_returns_none_w_legacy_pyarrow(self): + with mock.patch("pyarrow.__version__", new="2.0.0"): + pyarrow = self._call_try_import() + self.assertIsNone(pyarrow) + + def test_try_import_raises_error_w_legacy_pyarrow(self): + from google.cloud.bigquery.exceptions import LegacyPyarrowError + + with mock.patch("pyarrow.__version__", new="2.0.0"): + with 
self.assertRaises(LegacyPyarrowError): + self._call_try_import(raise_if_error=True) + def test_installed_version_returns_cached(self): versions = self._object_under_test() versions._installed_version = object() diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 72957ab9b..edf599fc3 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -30,14 +30,6 @@ except ImportError: # pragma: NO COVER pandas = None -try: - import pyarrow - import pyarrow.types -except ImportError: # pragma: NO COVER - # Mock out pyarrow when missing, because methods from pyarrow.types are - # used in test parameterization. - pyarrow = mock.Mock() - try: import geopandas except ImportError: # pragma: NO COVER @@ -47,10 +39,18 @@ from google import api_core +from google.cloud.bigquery import exceptions from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +if pyarrow: + import pyarrow.types +else: # pragma: NO COVER + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. + pyarrow = mock.Mock() try: from google.cloud import bigquery_storage @@ -1145,10 +1145,14 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test): @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch): - monkeypatch.setattr(module_under_test, "pyarrow", None) - with pytest.raises(ValueError) as exc_context: + mock_pyarrow_import = mock.Mock() + mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError( + "pyarrow not installed" + ) + monkeypatch.setattr(_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import) + + with pytest.raises(exceptions.LegacyPyarrowError): module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None) - assert "pyarrow is required" in str(exc_context.value) @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") From f3dbaeaceb136d8bf064cdd83565625e96f6f681 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 15 Jul 2022 11:52:49 -0700 Subject: [PATCH 12/47] add datetime check --- google/cloud/bigquery/_pandas_helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index cdfd689f3..d8265e0e9 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -134,7 +134,8 @@ def __init__(self): def pyarrow_datetime(): - return pyarrow.timestamp("us", tz=None) + if pyarrow: + return pyarrow.timestamp("us", tz=None) def pyarrow_numeric(): From 91fccefa468e973e889cd80fadf26743d50cae8a Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 15 Jul 2022 12:03:32 -0700 Subject: [PATCH 13/47] change pyarrow import --- tests/unit/test__pandas_helpers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index edf599fc3..f429305a2 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -44,10 +44,10 @@ from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT -pyarrow = _helpers.PYARROW_VERSIONS.try_import() -if pyarrow: 
+try: + import pyarrow import pyarrow.types -else: # pragma: NO COVER +except ImportError: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. pyarrow = mock.Mock() From ac78a3380ce0a21044362b0e181500a0ba21bfba Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 15 Jul 2022 15:49:11 -0700 Subject: [PATCH 14/47] update --- google/cloud/bigquery/_pandas_helpers.py | 24 ++++++++++++------------ tests/unit/job/test_query_pandas.py | 9 +++++---- tests/unit/test__pandas_helpers.py | 6 +++--- tests/unit/test_dbapi__helpers.py | 6 ++++++ tests/unit/test_table.py | 14 +++++++------- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index d8265e0e9..bd2e95d47 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -47,13 +47,10 @@ date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype try: - import pyarrow # type: ignore - import pyarrow.parquet # type: ignore - - pyarrow_import_exception = None -except ImportError as exc: # pragma: NO COVER + import pyarrow + import pyarrow.parquet +except ImportError: # pragma: NO COVER pyarrow = None - pyarrow_import_exception = exc try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` @@ -93,6 +90,7 @@ def _to_wkb(v): from google.cloud.bigquery import schema +pyarrow = _helpers.PYARROW_VERSIONS.try_import() _LOGGER = logging.getLogger(__name__) @@ -104,6 +102,11 @@ def _to_wkb(v): _NO_DB_TYPES_ERROR = "Please install the 'db-dtypes' package to use this function." _NO_PYARROW_ERROR = "Please install the 'pyarrow' package to use this function." +_NO_BQSTORAGE_ERROR = ( + "The google-cloud-bigquery-storage library is not installed, " + "please install google-cloud-bigquery-storage to use bqstorage features." +) + _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", "datetime64[ns, UTC]": "TIMESTAMP", @@ -691,10 +694,9 @@ def dataframe_to_parquet( This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. 
""" - global pyarrow - if pyarrow is None: - raise ValueError("pyarrow is required for BigQuery schema conversion.") - import pyarrow.parquet # type: ignore + pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) + + import pyarrow.parquet kwargs = ( {"use_compliant_nested_type": parquet_use_compliant_nested_type} @@ -1019,8 +1021,6 @@ def dataframe_to_json_generator(dataframe): def verify_pandas_imports(): - if pyarrow is None: - raise ValueError(_NO_PYARROW_ERROR) from pyarrow_import_exception if pandas is None: raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index bcdc96f40..54894a6a7 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -22,10 +22,7 @@ # import google.cloud.bigquery_storage_v1.reader # import google.cloud.bigquery_storage_v1.services.big_query_read.client -try: - import pyarrow -except (ImportError, AttributeError): # pragma: NO COVER - pyarrow = None + try: from google.cloud import bigquery_storage import google.cloud.bigquery_storage_v1.reader @@ -33,6 +30,8 @@ except (ImportError, AttributeError): # pragma: NO COVER bigquery_storage = None +from google.cloud.bigquery import _helpers + try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -56,6 +55,8 @@ pandas = pytest.importorskip("pandas") +pyarrow = _helpers.PYARROW_VERSIONS.try_import() + @pytest.fixture def table_read_options_kwarg(): diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index f429305a2..edf599fc3 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -44,10 +44,10 @@ from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT -try: - import pyarrow +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +if pyarrow: import pyarrow.types -except ImportError: # pragma: NO COVER +else: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. 
pyarrow = mock.Mock() diff --git a/tests/unit/test_dbapi__helpers.py b/tests/unit/test_dbapi__helpers.py index 7cc1f11c3..fae0c17e9 100644 --- a/tests/unit/test_dbapi__helpers.py +++ b/tests/unit/test_dbapi__helpers.py @@ -21,6 +21,11 @@ import pytest +try: + import pyarrow +except ImportError: # pragma: NO COVER + pyarrow = None + import google.cloud._helpers from google.cloud.bigquery import query, table from google.cloud.bigquery.dbapi import _helpers @@ -210,6 +215,7 @@ def test_empty_iterable(self): result = _helpers.to_bq_table_rows(rows_iterable) self.assertEqual(list(result), []) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_non_empty_iterable(self): rows_iterable = [ dict( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 1cdb4af09..d9f55412c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -28,6 +28,7 @@ from test_utils.imports import maybe_fail_import from google.cloud.bigquery.table import TableReference +from google.cloud.bigquery.dataset import DatasetReference try: from google.cloud import bigquery_storage @@ -38,14 +39,15 @@ bigquery_storage = None big_query_read_grpc_transport = None -try: - import pyarrow +from google.cloud.bigquery import _helpers + + +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +PYARROW_VERSION = pkg_resources.parse_version("0.0.1") +if pyarrow: import pyarrow.types PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) -except ImportError: # pragma: NO COVER - pyarrow = None - PYARROW_VERSION = pkg_resources.parse_version("0.0.1") try: import pandas @@ -62,8 +64,6 @@ except (ImportError, AttributeError): # pragma: NO COVER tqdm = None -from google.cloud.bigquery.dataset import DatasetReference - PYARROW_TIMESTAMP_VERSION = pkg_resources.parse_version("2.0.0") From 0d89234012a228281f94c76f51e6f9e7370bf25e Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Wed, 20 Jul 2022 19:51:37 -0700 Subject: [PATCH 15/47] add pyarrow skips --- tests/unit/test__pandas_helpers.py | 4 +++- tests/unit/test_table.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index edf599fc3..9e175c091 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -93,7 +93,6 @@ def is_datetime(type_): )(type_) -@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def is_numeric(type_): # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type return all_( @@ -129,6 +128,7 @@ def all_(*functions): return functools.partial(do_all, functions) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_is_datetime(): assert is_datetime(pyarrow.timestamp("us", tz=None)) assert not is_datetime(pyarrow.timestamp("ms", tz=None)) @@ -1830,6 +1830,7 @@ def test_table_data_listpage_to_dataframe_skips_stop_iteration(module_under_test assert isinstance(dataframe, pandas.DataFrame) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def test_bq_to_arrow_field_type_override(module_under_test): # When loading pandas data, we may need to override the type # decision based on data contents, because GEOGRAPHY data can be @@ -1862,6 +1863,7 @@ def test_bq_to_arrow_field_type_override(module_under_test): ), ], ) +@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`") def 
test_bq_to_arrow_field_metadata(module_under_test, field_type, metadata): assert ( module_under_test.bq_to_arrow_field( diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index d9f55412c..b35a21c0b 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1927,6 +1927,7 @@ def test_to_arrow(self): self.assertIsInstance(tbl, pyarrow.Table) self.assertEqual(tbl.num_rows, 0) + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow_iterable(self): row_iterator = self._make_one() arrow_iter = row_iterator.to_arrow_iterable() @@ -2430,6 +2431,7 @@ def test_to_arrow_iterable_w_bqstorage(self): # Don't close the client if it was passed in. bqstorage_client._transport.grpc_channel.close.assert_not_called() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_to_arrow(self): from google.cloud.bigquery.schema import SchemaField @@ -2670,6 +2672,10 @@ def test_to_arrow_max_results_w_explicit_bqstorage_client_warning(self): ) mock_client._ensure_bqstorage_client.assert_not_called() + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") + @unittest.skipIf( + bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" + ) def test_to_arrow_max_results_w_create_bqstorage_client_no_warning(self): from google.cloud.bigquery.schema import SchemaField From 79dd4ccd8b0e6e0716dd1b73ece54d586fe001c6 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Thu, 21 Jul 2022 09:01:34 -0700 Subject: [PATCH 16/47] fix types --- google/cloud/bigquery/_pandas_helpers.py | 4 ++-- google/cloud/bigquery/table.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index bd2e95d47..f2b090073 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -47,8 +47,8 @@ date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype try: - import pyarrow - import pyarrow.parquet + import pyarrow # type: ignore + import pyarrow.parquet # type: ignore except ImportError: # pragma: NO COVER pyarrow = None diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 27da6ae17..87ba82a6b 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1640,7 +1640,7 @@ def total_rows(self): def _maybe_warn_max_results( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient", + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"], ): """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set. 
From 37d7a25db41fe6a620171943ee53f10ad4344eaf Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Thu, 21 Jul 2022 10:17:27 -0700 Subject: [PATCH 17/47] lint --- google/cloud/bigquery/_pandas_helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index f2b090073..6ece9f58f 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -47,8 +47,8 @@ date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype try: - import pyarrow # type: ignore - import pyarrow.parquet # type: ignore + import pyarrow # type: ignore + import pyarrow.parquet # type: ignore except ImportError: # pragma: NO COVER pyarrow = None From 9dedf7819333f8c55e5482f9f4b768b0de180eec Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:23:23 -0700 Subject: [PATCH 18/47] Update google/cloud/bigquery/client.py Co-authored-by: Tim Swast --- google/cloud/bigquery/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 5c2ff1a97..833d1d99d 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2600,7 +2600,7 @@ def load_table_from_dataframe( ) if pyarrow is None and job_config.source_format == job.SourceFormat.PARQUET: - # pyarrow is now the only supported parquet engine. + # pyarrow is now the only supported parquet engine. raise ValueError("This method requires pyarrow to be installed") if location is None: From 933963e0c0d4a62ff1fb43754f6c50ed97e35c9f Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:24:34 -0700 Subject: [PATCH 19/47] update pyarrow version --- google/cloud/bigquery/_pandas_helpers.py | 3 +-- setup.py | 5 ++--- testing/constraints-3.6.txt | 3 +-- testing/constraints-3.7.txt | 2 +- tests/unit/job/test_query_pandas.py | 7 ++----- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 6ece9f58f..74216c86a 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -48,7 +48,6 @@ try: import pyarrow # type: ignore - import pyarrow.parquet # type: ignore except ImportError: # pragma: NO COVER pyarrow = None @@ -696,7 +695,7 @@ def dataframe_to_parquet( """ pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) - import pyarrow.parquet + import pyarrow.parquet # type: ignore kwargs = ( {"use_compliant_nested_type": parquet_use_compliant_nested_type} diff --git a/setup.py b/setup.py index d8d04331a..65d8326e6 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,6 @@ # Until this issue is closed # https://github.com/googleapis/google-cloud-python/issues/10566 "google-api-core[grpc] >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0", - # "google-cloud-bigquery-storage >= 2.0.0, <3.0.0dev", "proto-plus >= 1.15.0, <2.0.0dev", # NOTE: Maintainers, please do not require google-cloud-core>=2.x.x # Until this issue is closed @@ -58,11 +57,11 @@ # grpc.Channel.close() method isn't added until 1.32.0. 
# https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.38.1, < 2.0dev", - "pyarrow >= 5.0.0, < 9.0dev", + "pyarrow >= 3.0.0, < 9.0dev", ], "pandas": [ "pandas>=1.0.0", - "pyarrow >= 5.0.0, < 9.0dev", + "pyarrow >= 3.0.0, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], "bignumeric_type": ["pyarrow == 5.0.0, < 9.0dev"], diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index dc7295134..47b842a6d 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -19,9 +19,8 @@ opentelemetry-sdk==1.1.0 pandas==1.0.0 proto-plus==1.15.0 protobuf==3.12.0 -pyarrow==5.0.0 +pyarrow==3.0.0 python-dateutil==2.7.2 -pyarrow>=5.0.0 requests==2.18.0 Shapely==1.6.0 six==1.13.0 diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 674ec5d01..c5803387e 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -19,7 +19,7 @@ opentelemetry-sdk==1.1.0 pandas==1.1.0 proto-plus==1.15.0 protobuf==3.12.0 -pyarrow>=5.0.0 +pyarrow==3.0.0 python-dateutil==2.7.3 requests==2.18.0 Shapely==1.6.4.post2 diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 54894a6a7..a6a49e09f 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -19,9 +19,6 @@ import mock import pytest -# import google.cloud.bigquery_storage_v1.reader -# import google.cloud.bigquery_storage_v1.services.big_query_read.client - try: from google.cloud import bigquery_storage @@ -54,8 +51,8 @@ from .helpers import _make_job_resource pandas = pytest.importorskip("pandas") - -pyarrow = _helpers.PYARROW_VERSIONS.try_import() +pyarrow = pytest.importorskip("pyarrow") +# pyarrow = _helpers.PYARROW_VERSIONS.try_import() @pytest.fixture From 45eed33875a272fcf77871a91e8ca1bbc95ebdb5 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 12:53:35 -0700 Subject: [PATCH 20/47] update test --- tests/unit/test_table_pandas.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 943baa326..236005660 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -16,7 +16,17 @@ import decimal from unittest import mock -import pyarrow +from google.cloud.bigquery import _helpers +import pkg_resources + +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +PYARROW_VERSION = pkg_resources.parse_version("0.0.1") +if pyarrow: + import pyarrow.types + + PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) + + import pytest from google.cloud import bigquery From af00605b40af4ac2217e67cf2c603bff0e22da52 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 13:35:17 -0700 Subject: [PATCH 21/47] lint --- tests/unit/job/test_query_pandas.py | 2 -- tests/unit/test_table_pandas.py | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index a6a49e09f..31fa0dfa7 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -27,8 +27,6 @@ except (ImportError, AttributeError): # pragma: NO COVER bigquery_storage = None -from google.cloud.bigquery import _helpers - try: import pandas except (ImportError, AttributeError): # pragma: NO COVER diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 236005660..b10cfbce9 100644 --- 
a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -19,6 +19,10 @@ from google.cloud.bigquery import _helpers import pkg_resources +import pytest + +from google.cloud import bigquery + pyarrow = _helpers.PYARROW_VERSIONS.try_import() PYARROW_VERSION = pkg_resources.parse_version("0.0.1") if pyarrow: @@ -26,11 +30,6 @@ PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) - -import pytest - -from google.cloud import bigquery - pandas = pytest.importorskip("pandas") From ef20ab51fbe22e108ba8b704a35886250af3777e Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 14:13:22 -0700 Subject: [PATCH 22/47] update pyarrow req --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 65d8326e6..23fc70345 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ "pyarrow >= 3.0.0, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": ["pyarrow == 5.0.0, < 9.0dev"], + "bignumeric_type": ["pyarrow >= 3.0.0, < 9.0dev"], "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], From 95aceca409ef21f4fe046424d7528ce7300aff34 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Mon, 1 Aug 2022 14:23:33 -0700 Subject: [PATCH 23/47] update noxfile --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index c6f7c76b1..a98050425 100644 --- a/noxfile.py +++ b/noxfile.py @@ -108,7 +108,7 @@ def unit_noextras(session): # Install optional dependencies that are out-of-date. # https://github.com/googleapis/python-bigquery/issues/933 # There is no pyarrow 1.0.0 package for Python 3.9. - if session.python == UNIT_TEST_PYTHON_VERSIONS[0]: + if session.python == UNIT_TEST_PYTHON_VERSIONS[-1]: session.install("pyarrow==1.0.0") default(session, install_extras=False) From d0e90456cc60c288172449e8099165a9e5b9b180 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 5 Aug 2022 13:12:34 -0700 Subject: [PATCH 24/47] remove bignum check --- google/cloud/bigquery/_pandas_helpers.py | 18 ++++---- noxfile.py | 2 +- tests/system/test_pandas.py | 35 +++++++------- tests/unit/test__helpers.py | 2 + tests/unit/test__pandas_helpers.py | 59 ++++++++++++------------ 5 files changed, 60 insertions(+), 56 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 74216c86a..d1562b626 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -204,19 +204,19 @@ def pyarrow_timestamp(): pyarrow.decimal128(38, scale=9).id: "NUMERIC", } - if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): - BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal256 instances. - ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" - _BIGNUMERIC_SUPPORT = True - else: - _BIGNUMERIC_SUPPORT = False + # if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): + BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. 
+ ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + # # _BIGNUMERIC_SUPPORT = True + # else: + # _BIGNUMERIC_SUPPORT = False else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER + # _BIGNUMERIC_SUPPORT = False # pragma: NO COVER BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { diff --git a/noxfile.py b/noxfile.py index a98050425..c6f7c76b1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -108,7 +108,7 @@ def unit_noextras(session): # Install optional dependencies that are out-of-date. # https://github.com/googleapis/python-bigquery/issues/933 # There is no pyarrow 1.0.0 package for Python 3.9. - if session.python == UNIT_TEST_PYTHON_VERSIONS[-1]: + if session.python == UNIT_TEST_PYTHON_VERSIONS[0]: session.install("pyarrow==1.0.0") default(session, install_extras=False) diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index e5dbe8719..7a1a1f65a 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -27,7 +27,8 @@ import pytest from google.cloud import bigquery -from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT + +# from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery import enums from . import helpers @@ -380,8 +381,8 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) - if _BIGNUMERIC_SUPPORT: - table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) + # if _BIGNUMERIC_SUPPORT: + table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) num_rows = 100 nulls = [None] * num_rows @@ -398,8 +399,8 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): ("time_col", nulls), ("ts_col", nulls), ] - if _BIGNUMERIC_SUPPORT: - df_data.append(("bignum_col", nulls)) + # if _BIGNUMERIC_SUPPORT: + df_data.append(("bignum_col", nulls)) df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) @@ -478,8 +479,8 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), ) - if _BIGNUMERIC_SUPPORT: - table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) + # if _BIGNUMERIC_SUPPORT: + table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) df_data = [ ("row_num", [1, 2, 3]), @@ -524,17 +525,17 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id ], ), ] - if _BIGNUMERIC_SUPPORT: - df_data.append( - ( - "bignum_col", - [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - None, - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ], - ) + # if _BIGNUMERIC_SUPPORT: + df_data.append( + ( + "bignum_col", + [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + None, + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], ) + ) df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 4fb86f665..515be0a2a 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -29,6 +29,8 @@ except ImportError: # pragma: NO COVER pyarrow = None +# pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) + @unittest.skipIf(bigquery_storage is None, 
"Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 9e175c091..3ccf0843e 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -42,7 +42,8 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema -from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT + +# from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT pyarrow = _helpers.PYARROW_VERSIONS.try_import() if pyarrow: @@ -67,10 +68,10 @@ # Set to less than MIN version. PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") -skip_if_no_bignumeric = pytest.mark.skipif( - not _BIGNUMERIC_SUPPORT, - reason="BIGNUMERIC support requires pyarrow>=3.0.0", -) +# skip_if_no_bignumeric = pytest.mark.skipif( +# not _BIGNUMERIC_SUPPORT, +# reason="BIGNUMERIC support requires pyarrow>=3.0.0", +# ) @pytest.fixture @@ -165,7 +166,7 @@ def test_all_(): "BIGNUMERIC", "NULLABLE", is_bignumeric, - marks=skip_if_no_bignumeric, + # marks=skip_if_no_bignumeric, ), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), @@ -249,7 +250,7 @@ def test_all_(): "BIGNUMERIC", "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), - marks=skip_if_no_bignumeric, + # marks=skip_if_no_bignumeric, ), ( "BOOLEAN", @@ -380,8 +381,8 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field14", "DATETIME"), schema.SchemaField("field15", "GEOGRAPHY"), ) - if _BIGNUMERIC_SUPPORT: - fields += (schema.SchemaField("field08", "BIGNUMERIC"),) + # if _BIGNUMERIC_SUPPORT: + fields += (schema.SchemaField("field08", "BIGNUMERIC"),) field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) @@ -403,8 +404,8 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field14", module_under_test.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) - if _BIGNUMERIC_SUPPORT: - expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) + # if _BIGNUMERIC_SUPPORT: + expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) expected_value_type = pyarrow.struct(expected) assert pyarrow.types.is_list(actual) @@ -458,7 +459,7 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), decimal.Decimal("3.141592653589793238462643383279"), ], - marks=skip_if_no_bignumeric, + # marks=skip_if_no_bignumeric, ), ("BOOLEAN", [True, None, False, None]), ("BOOL", [False, None, True, None]), @@ -1044,8 +1045,8 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field14", "DATETIME", mode="REQUIRED"), schema.SchemaField("field15", "GEOGRAPHY", mode="REQUIRED"), ) - if _BIGNUMERIC_SUPPORT: - bq_schema += (schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"),) + # if _BIGNUMERIC_SUPPORT: + bq_schema += (schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"),) data = { "field01": ["hello", "world"], @@ -1073,11 +1074,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): ], "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], } - if _BIGNUMERIC_SUPPORT: - data["field08"] = [ - 
decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ] + # if _BIGNUMERIC_SUPPORT: + data["field08"] = [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ] dataframe = pandas.DataFrame(data) arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema) @@ -1380,10 +1381,10 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), # schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), ) - if _BIGNUMERIC_SUPPORT: - current_schema += ( # type: ignore - schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), # type: ignore - ) # type: ignore + # if _BIGNUMERIC_SUPPORT: + current_schema += ( # type: ignore + schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), # type: ignore + ) # type: ignore with warnings.catch_warnings(record=True) as warned: augmented_schema = module_under_test.augment_schema(dataframe, current_schema) @@ -1409,12 +1410,12 @@ def test_augment_schema_type_detection_succeeds(module_under_test): # "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" # ), ) - if _BIGNUMERIC_SUPPORT: - expected_schema += ( - schema.SchemaField( - "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" - ), - ) + # if _BIGNUMERIC_SUPPORT: + expected_schema += ( + schema.SchemaField( + "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" + ), + ) by_name = operator.attrgetter("name") assert sorted(augmented_schema, key=by_name) == sorted(expected_schema, key=by_name) From 5045ead2cf13936e837df1abd7314b5d6a9bca3e Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 5 Aug 2022 15:54:45 -0700 Subject: [PATCH 25/47] remove comments --- google/cloud/bigquery/_pandas_helpers.py | 6 ---- tests/system/test_pandas.py | 16 +++------ tests/unit/test__pandas_helpers.py | 46 +++++------------------- 3 files changed, 13 insertions(+), 55 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index d1562b626..8333b8ec8 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -21,7 +21,6 @@ import logging import queue import warnings -from packaging import version from google.cloud.bigquery import _helpers @@ -204,19 +203,14 @@ def pyarrow_timestamp(): pyarrow.decimal128(38, scale=9).id: "NUMERIC", } - # if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric # The exact decimal's scale and precision are not important, as only # the type ID matters, and it's the same for all decimal256 instances. 
ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" - # # _BIGNUMERIC_SUPPORT = True - # else: - # _BIGNUMERIC_SUPPORT = False else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER - # _BIGNUMERIC_SUPPORT = False # pragma: NO COVER BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index 7a1a1f65a..aa0831e19 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -380,9 +380,8 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): bigquery.SchemaField("str_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), ) - # if _BIGNUMERIC_SUPPORT: - table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) num_rows = 100 nulls = [None] * num_rows @@ -398,9 +397,8 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id): ("str_col", nulls), ("time_col", nulls), ("ts_col", nulls), + ("bignum_col", nulls), ] - # if _BIGNUMERIC_SUPPORT: - df_data.append(("bignum_col", nulls)) df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, columns=df_data.keys()) @@ -478,9 +476,8 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id bigquery.SchemaField("str_col", "STRING"), bigquery.SchemaField("time_col", "TIME"), bigquery.SchemaField("ts_col", "TIMESTAMP"), + bigquery.SchemaField("bignum_col", "BIGNUMERIC"), ) - # if _BIGNUMERIC_SUPPORT: - table_schema += (bigquery.SchemaField("bignum_col", "BIGNUMERIC"),) df_data = [ ("row_num", [1, 2, 3]), @@ -524,9 +521,6 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id ), ], ), - ] - # if _BIGNUMERIC_SUPPORT: - df_data.append( ( "bignum_col", [ @@ -534,8 +528,8 @@ def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id None, decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), ], - ) - ) + ), + ] df_data = collections.OrderedDict(df_data) dataframe = pandas.DataFrame(df_data, dtype="object", columns=df_data.keys()) diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 3ccf0843e..97bea492f 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -43,7 +43,6 @@ from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema -# from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT pyarrow = _helpers.PYARROW_VERSIONS.try_import() if pyarrow: @@ -68,11 +67,6 @@ # Set to less than MIN version. 
PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") -# skip_if_no_bignumeric = pytest.mark.skipif( -# not _BIGNUMERIC_SUPPORT, -# reason="BIGNUMERIC support requires pyarrow>=3.0.0", -# ) - @pytest.fixture def module_under_test(): @@ -166,7 +160,6 @@ def test_all_(): "BIGNUMERIC", "NULLABLE", is_bignumeric, - # marks=skip_if_no_bignumeric, ), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), @@ -250,7 +243,6 @@ def test_all_(): "BIGNUMERIC", "REPEATED", all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), - # marks=skip_if_no_bignumeric, ), ( "BOOLEAN", @@ -372,7 +364,7 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field05", "FLOAT"), schema.SchemaField("field06", "FLOAT64"), schema.SchemaField("field07", "NUMERIC"), - # schema.SchemaField("field08", "BIGNUMERIC"), + schema.SchemaField("field08", "BIGNUMERIC"), schema.SchemaField("field09", "BOOLEAN"), schema.SchemaField("field10", "BOOL"), schema.SchemaField("field11", "TIMESTAMP"), @@ -381,8 +373,6 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): schema.SchemaField("field14", "DATETIME"), schema.SchemaField("field15", "GEOGRAPHY"), ) - # if _BIGNUMERIC_SUPPORT: - fields += (schema.SchemaField("field08", "BIGNUMERIC"),) field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields) actual = module_under_test.bq_to_arrow_data_type(field) @@ -395,7 +385,7 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), pyarrow.field("field07", module_under_test.pyarrow_numeric()), - # pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), pyarrow.field("field11", module_under_test.pyarrow_timestamp()), @@ -404,8 +394,6 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field14", module_under_test.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) - # if _BIGNUMERIC_SUPPORT: - expected += (pyarrow.field("field08", module_under_test.pyarrow_bignumeric()),) expected_value_type = pyarrow.struct(expected) assert pyarrow.types.is_list(actual) @@ -459,7 +447,6 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), decimal.Decimal("3.141592653589793238462643383279"), ], - # marks=skip_if_no_bignumeric, ), ("BOOLEAN", [True, None, False, None]), ("BOOL", [False, None, True, None]), @@ -1036,7 +1023,7 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field05", "FLOAT", mode="REQUIRED"), schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"), schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"), - # schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"), + schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"), schema.SchemaField("field09", "BOOLEAN", mode="REQUIRED"), schema.SchemaField("field10", "BOOL", mode="REQUIRED"), schema.SchemaField("field11", "TIMESTAMP", mode="REQUIRED"), @@ -1045,8 +1032,6 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): schema.SchemaField("field14", "DATETIME", mode="REQUIRED"), schema.SchemaField("field15", "GEOGRAPHY", mode="REQUIRED"), ) - # if 
_BIGNUMERIC_SUPPORT: - bq_schema += (schema.SchemaField("field08", "BIGNUMERIC", mode="REQUIRED"),) data = { "field01": ["hello", "world"], @@ -1056,10 +1041,10 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): "field05": [1.25, 9.75], "field06": [-1.75, -3.5], "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")], - # "field08": [ - # decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - # decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - # ], + "field08": [ + decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), + decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), + ], "field09": [True, False], "field10": [False, True], "field11": [ @@ -1074,11 +1059,6 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test): ], "field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"], } - # if _BIGNUMERIC_SUPPORT: - data["field08"] = [ - decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)), - decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)), - ] dataframe = pandas.DataFrame(data) arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema) @@ -1379,12 +1359,8 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type=None, mode="NULLABLE"), schema.SchemaField("string_field", field_type=None, mode="NULLABLE"), schema.SchemaField("numeric_field", field_type=None, mode="NULLABLE"), - # schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), + schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), ) - # if _BIGNUMERIC_SUPPORT: - current_schema += ( # type: ignore - schema.SchemaField("bignumeric_field", field_type=None, mode="NULLABLE"), # type: ignore - ) # type: ignore with warnings.catch_warnings(record=True) as warned: augmented_schema = module_under_test.augment_schema(dataframe, current_schema) @@ -1406,12 +1382,6 @@ def test_augment_schema_type_detection_succeeds(module_under_test): schema.SchemaField("bytes_field", field_type="BYTES", mode="NULLABLE"), schema.SchemaField("string_field", field_type="STRING", mode="NULLABLE"), schema.SchemaField("numeric_field", field_type="NUMERIC", mode="NULLABLE"), - # schema.SchemaField( - # "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" - # ), - ) - # if _BIGNUMERIC_SUPPORT: - expected_schema += ( schema.SchemaField( "bignumeric_field", field_type="BIGNUMERIC", mode="NULLABLE" ), From 1eb5facbc64b31db04c70175f407b9ea218d0583 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 23 Sep 2022 16:54:57 -0700 Subject: [PATCH 26/47] add test importorskip --- setup.py | 8 ++++---- tests/system/test_pandas.py | 1 - tests/unit/test__pandas_helpers.py | 7 +++---- tests/unit/test_table_pandas.py | 14 ++++++-------- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index 74957a271..08cc44b10 100644 --- a/setup.py +++ b/setup.py @@ -57,15 +57,15 @@ # grpc.Channel.close() method isn't added until 1.32.0. 
# https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.38.1, < 2.0dev", - "pyarrow >= 3.0.0, < 9.0dev", + "pyarrow >= 1.0.1, < 9.0dev", ], "pandas": [ "pandas>=1.0.0", - "pyarrow >= 3.0.0, < 9.0dev", + "pyarrow >= 1.0.1, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": ["pyarrow >= 3.0.0, < 9.0dev"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "bignumeric_type": ["pyarrow >= 1.0.1, < 9.0dev"], + # "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py index aa0831e19..91305b450 100644 --- a/tests/system/test_pandas.py +++ b/tests/system/test_pandas.py @@ -28,7 +28,6 @@ from google.cloud import bigquery -# from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT from google.cloud.bigquery import enums from . import helpers diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 97bea492f..2611f1a9e 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -43,11 +43,10 @@ from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema - -pyarrow = _helpers.PYARROW_VERSIONS.try_import() -if pyarrow: +try: + pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") import pyarrow.types -else: # pragma: NO COVER +except ImportError: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. pyarrow = mock.Mock() diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index b10cfbce9..0e2ced495 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -16,19 +16,17 @@ import decimal from unittest import mock -from google.cloud.bigquery import _helpers -import pkg_resources - import pytest from google.cloud import bigquery -pyarrow = _helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("0.0.1") -if pyarrow: +try: + pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") import pyarrow.types - - PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) +except ImportError: # pragma: NO COVER + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. + pyarrow = mock.Mock() pandas = pytest.importorskip("pandas") From f23657b0ada9c745139d1f72ac38813bb2900d59 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 23 Sep 2022 17:27:49 -0700 Subject: [PATCH 27/47] update test --- tests/unit/test_table_pandas.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 0e2ced495..a0d973d57 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -16,17 +16,17 @@ import decimal from unittest import mock +from google.cloud.bigquery import _helpers +import pkg_resources + import pytest from google.cloud import bigquery -try: - pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +PYARROW_VERSION = pkg_resources.parse_version("1.0.1") +if pyarrow: import pyarrow.types -except ImportError: # pragma: NO COVER - # Mock out pyarrow when missing, because methods from pyarrow.types are - # used in test parameterization. 
- pyarrow = mock.Mock() pandas = pytest.importorskip("pandas") From 7138f1ed83ac9f15825110d13c9d25148b7d815a Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Fri, 23 Sep 2022 17:56:40 -0700 Subject: [PATCH 28/47] update test --- tests/unit/job/test_query_pandas.py | 9 +++++++-- tests/unit/test__helpers.py | 9 +++++---- tests/unit/test_table_pandas.py | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 31fa0dfa7..e8c41d1ea 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -49,9 +49,14 @@ from .helpers import _make_job_resource pandas = pytest.importorskip("pandas") -pyarrow = pytest.importorskip("pyarrow") -# pyarrow = _helpers.PYARROW_VERSIONS.try_import() +try: + pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") + import pyarrow.types +except ImportError: # pragma: NO COVER + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. + pyarrow = mock.Mock() @pytest.fixture def table_read_options_kwarg(): diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 515be0a2a..6f6c63770 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -16,6 +16,7 @@ import datetime import decimal import unittest +import pytest import mock @@ -25,11 +26,11 @@ bigquery_storage = None try: - import pyarrow + pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") except ImportError: # pragma: NO COVER - pyarrow = None - -# pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. + pyarrow = mock.Mock() @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index a0d973d57..ced341608 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -23,10 +23,13 @@ from google.cloud import bigquery -pyarrow = _helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("1.0.1") -if pyarrow: +try: + pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") import pyarrow.types +except ImportError: # pragma: NO COVER + # Mock out pyarrow when missing, because methods from pyarrow.types are + # used in test parameterization. 
+ pyarrow = mock.Mock() pandas = pytest.importorskip("pandas") From abb9b8c495c717f090383efd48db499465f5c314 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Sat, 24 Sep 2022 12:39:45 -0700 Subject: [PATCH 29/47] update dependency --- setup.py | 2 +- tests/unit/job/test_query_pandas.py | 1 + tests/unit/test__pandas_helpers.py | 6 +++--- tests/unit/test_table.py | 3 ++- tests/unit/test_table_pandas.py | 3 --- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 08cc44b10..466fcb0d6 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ "db-dtypes>=0.3.0,<2.0.0dev", ], "bignumeric_type": ["pyarrow >= 1.0.1, < 9.0dev"], - # "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index e8c41d1ea..a37797a45 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -58,6 +58,7 @@ # used in test parameterization. pyarrow = mock.Mock() + @pytest.fixture def table_read_options_kwarg(): # Create a BigQuery Storage table read options object with pyarrow compression diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 2611f1a9e..898731f98 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -43,10 +43,10 @@ from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema -try: - pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") +pyarrow = _helpers.PYARROW_VERSIONS.try_import() +if pyarrow: import pyarrow.types -except ImportError: # pragma: NO COVER +else: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. 
pyarrow = mock.Mock() diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index d8cc64ec9..0fcdd2cd8 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -43,12 +43,13 @@ pyarrow = _helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("0.0.1") +PYARROW_VERSION = pkg_resources.parse_version("1.0.1") if pyarrow: import pyarrow.types PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) + try: import pandas except (ImportError, AttributeError): # pragma: NO COVER diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index ced341608..0e2ced495 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -16,9 +16,6 @@ import decimal from unittest import mock -from google.cloud.bigquery import _helpers -import pkg_resources - import pytest from google.cloud import bigquery From d69f8ade45c5fd3f4023610b1d0dad6db165b5a4 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Sat, 24 Sep 2022 14:35:05 -0700 Subject: [PATCH 30/47] change version --- google/cloud/bigquery/_pandas_helpers.py | 15 +++++++++++---- tests/unit/test__pandas_helpers.py | 7 ++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 8333b8ec8..280c66338 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -22,6 +22,8 @@ import queue import warnings +from packaging import version + from google.cloud.bigquery import _helpers try: @@ -203,14 +205,19 @@ def pyarrow_timestamp(): pyarrow.decimal128(38, scale=9).id: "NUMERIC", } - BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric - # The exact decimal's scale and precision are not important, as only - # the type ID matters, and it's the same for all decimal256 instances. - ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + if version.parse(pyarrow.__version__) >= version.parse("3.0.0"): + BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric + # The exact decimal's scale and precision are not important, as only + # the type ID matters, and it's the same for all decimal256 instances. 
+ ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" + _BIGNUMERIC_SUPPORT = True + else: + _BIGNUMERIC_SUPPORT = False else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 898731f98..9e5ce1f2b 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -42,6 +42,7 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import _helpers from google.cloud.bigquery import schema +from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT pyarrow = _helpers.PYARROW_VERSIONS.try_import() if pyarrow: @@ -67,6 +68,9 @@ PANDAS_INSTALLED_VERSION = pkg_resources.parse_version("0.0.0") +skip_if_no_bignumeric = pytest.mark.skipif( + not _BIGNUMERIC_SUPPORT, reason="BIGNUMERIC support requires pyarrow>=3.0.0", +) @pytest.fixture def module_under_test(): from google.cloud.bigquery import _pandas_helpers @@ -159,6 +163,7 @@ def test_all_(): "BIGNUMERIC", "NULLABLE", is_bignumeric, + marks=skip_if_no_bignumeric, ), ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), ("BOOL", "NULLABLE", pyarrow.types.is_boolean), @@ -241,7 +246,7 @@ def test_all_(): pytest.param( "BIGNUMERIC", "REPEATED", - all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), marks=skip_if_no_bignumeric, ), ( "BOOLEAN", From caa21cb810a703d6cea8d2ee53bc7bd8da9fa125 Mon Sep 17 00:00:00 2001 From: Steffany Brown <30247553+steffnay@users.noreply.github.com> Date: Sun, 25 Sep 2022 23:23:48 -0700 Subject: [PATCH 31/47] update imports --- google/cloud/bigquery/_pandas_helpers.py | 31 ++++++------------------ setup.py | 6 ++--- testing/constraints-3.6.txt | 27 --------------------- testing/constraints-3.7.txt | 2 +- tests/unit/test__pandas_helpers.py | 10 ++++++-- tests/unit/test_table.py | 10 +++----- tests/unit/test_table_pandas.py | 2 +- 7 files changed, 24 insertions(+), 64 deletions(-) delete mode 100644 testing/constraints-3.6.txt diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 280c66338..f98ef0850 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -25,6 +25,7 @@ from packaging import version from google.cloud.bigquery import _helpers +from google.cloud.bigquery import schema try: import pandas # type: ignore @@ -47,10 +48,7 @@ db_dtypes_import_exception = exc date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype -try: - import pyarrow # type: ignore -except ImportError: # pragma: NO COVER - pyarrow = None +pyarrow = _helpers.PYARROW_VERSIONS.try_import() try: # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array` @@ -88,10 +86,6 @@ def _to_wkb(v): # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too. _ARROW_COMPRESSION_SUPPORT = True -from google.cloud.bigquery import schema - -pyarrow = _helpers.PYARROW_VERSIONS.try_import() - _LOGGER = logging.getLogger(__name__) _PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds. @@ -100,12 +94,6 @@ def _to_wkb(v): _NO_PANDAS_ERROR = "Please install the 'pandas' package to use this function." _NO_DB_TYPES_ERROR = "Please install the 'db-dtypes' package to use this function." 
-_NO_PYARROW_ERROR = "Please install the 'pyarrow' package to use this function." - -_NO_BQSTORAGE_ERROR = ( - "The google-cloud-bigquery-storage library is not installed, " - "please install google-cloud-bigquery-storage to use bqstorage features." -) _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", @@ -137,20 +125,17 @@ def __init__(self): def pyarrow_datetime(): - if pyarrow: - return pyarrow.timestamp("us", tz=None) + return pyarrow.timestamp("us", tz=None) def pyarrow_numeric(): - if pyarrow: - return pyarrow.decimal128(38, 9) + return pyarrow.decimal128(38, 9) def pyarrow_bignumeric(): - if pyarrow: - # 77th digit is partial. - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types - return pyarrow.decimal256(76, 38) + # 77th digit is partial. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types + return pyarrow.decimal256(76, 38) def pyarrow_time(): @@ -212,7 +197,7 @@ def pyarrow_timestamp(): ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" _BIGNUMERIC_SUPPORT = True else: - _BIGNUMERIC_SUPPORT = False + _BIGNUMERIC_SUPPORT = False # pragma: NO COVER else: # pragma: NO COVER BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER diff --git a/setup.py b/setup.py index 466fcb0d6..74957a271 100644 --- a/setup.py +++ b/setup.py @@ -57,14 +57,14 @@ # grpc.Channel.close() method isn't added until 1.32.0. # https://github.com/grpc/grpc/pull/15254 "grpcio >= 1.38.1, < 2.0dev", - "pyarrow >= 1.0.1, < 9.0dev", + "pyarrow >= 3.0.0, < 9.0dev", ], "pandas": [ "pandas>=1.0.0", - "pyarrow >= 1.0.1, < 9.0dev", + "pyarrow >= 3.0.0, < 9.0dev", "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": ["pyarrow >= 1.0.1, < 9.0dev"], + "bignumeric_type": ["pyarrow >= 3.0.0, < 9.0dev"], "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt deleted file mode 100644 index 47b842a6d..000000000 --- a/testing/constraints-3.6.txt +++ /dev/null @@ -1,27 +0,0 @@ -# This constraints file is used to check that lower bounds -# are correct in setup.py -# List *all* library dependencies and extras in this file. -# Pin the version to the lower bound. 
-# -# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", -# Then this file should have foo==1.14.0 -db-dtypes==0.3.0 -geopandas==0.9.0 -google-api-core==1.31.5 -google-cloud-bigquery-storage==2.0.0 -google-cloud-core==1.4.1 -google-resumable-media==0.6.0 -grpcio==1.38.1 -ipython==7.0.1 -opentelemetry-api==1.1.0 -opentelemetry-instrumentation==0.20b0 -opentelemetry-sdk==1.1.0 -pandas==1.0.0 -proto-plus==1.15.0 -protobuf==3.12.0 -pyarrow==3.0.0 -python-dateutil==2.7.2 -requests==2.18.0 -Shapely==1.6.0 -six==1.13.0 -tqdm==4.7.4 diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 67313f6b8..870d27e74 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -22,6 +22,6 @@ protobuf==3.19.0 pyarrow==3.0.0 python-dateutil==2.7.3 requests==2.18.0 -Shapely==1.6.4.post2 +Shapely==1.8.4 six==1.13.0 tqdm==4.7.4 diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 9e5ce1f2b..885cd318c 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -45,7 +45,9 @@ from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT pyarrow = _helpers.PYARROW_VERSIONS.try_import() + if pyarrow: + import pyarrow.parquet import pyarrow.types else: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are @@ -69,8 +71,11 @@ skip_if_no_bignumeric = pytest.mark.skipif( - not _BIGNUMERIC_SUPPORT, reason="BIGNUMERIC support requires pyarrow>=3.0.0", + not _BIGNUMERIC_SUPPORT, + reason="BIGNUMERIC support requires pyarrow>=3.0.0", ) + + @pytest.fixture def module_under_test(): from google.cloud.bigquery import _pandas_helpers @@ -246,7 +251,8 @@ def test_all_(): pytest.param( "BIGNUMERIC", "REPEATED", - all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), marks=skip_if_no_bignumeric, + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + marks=skip_if_no_bignumeric, ), ( "BOOLEAN", diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 0fcdd2cd8..e4f554f6b 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -41,15 +41,14 @@ from google.cloud.bigquery import _helpers - pyarrow = _helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("1.0.1") +PYARROW_VERSION = pkg_resources.parse_version("0.0.1") + if pyarrow: import pyarrow.types PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) - try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -3212,10 +3211,7 @@ def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): df = row_iterator.to_dataframe(create_bqstorage_client=False) - tzinfo = None - if PYARROW_VERSION >= PYARROW_TIMESTAMP_VERSION: - tzinfo = datetime.timezone.utc - + tzinfo = datetime.timezone.utc self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(len(df), 2) # verify the number of rows self.assertEqual(list(df.columns), ["some_timestamp"]) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index 0e2ced495..f7e5471d8 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -21,7 +21,7 @@ from google.cloud import bigquery try: - pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") + pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") import pyarrow.types except ImportError: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are From 21ebf7d114ef94f4f91ee4e282fd9161434728b5 Mon Sep 17 
00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 10:58:46 -0600 Subject: [PATCH 32/47] adjust test expectations when google-cloud-bigquery-storage is not available --- tests/unit/test_table.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 9df1c3e90..58141863e 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2846,14 +2846,15 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): api_request = mock.Mock(return_value={"rows": rows}) mock_client = _mock_client() - mock_client._ensure_bqstorage_client.return_value = None row_iterator = self._make_one(mock_client, api_request, path, schema) - tbl = row_iterator.to_arrow(create_bqstorage_client=True) + def mock_verify_version(): + raise _helpers.LegacyBigQueryStorageError("no bqstorage") - # The client attempted to create a BQ Storage client, and even though - # that was not possible, results were still returned without errors. - mock_client._ensure_bqstorage_client.assert_called_once() + with mock.patch("google.cloud.bigquery._helpers.BQ_STORAGE_VERSIONS.verify_version", mock_verify_version): + tbl = row_iterator.to_arrow(create_bqstorage_client=True) + + mock_client._ensure_bqstorage_client.assert_not_called() self.assertIsInstance(tbl, pyarrow.Table) self.assertEqual(tbl.num_rows, 2) From 39b173a73dd433c3c1f525f2b75c55d6d79246a0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:20:06 -0600 Subject: [PATCH 33/47] export pyarrow exception --- google/cloud/bigquery/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 2ae30a081..ebd5b3109 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -43,6 +43,7 @@ from google.cloud.bigquery.enums import SqlTypeNames from google.cloud.bigquery.enums import StandardSqlTypeNames from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError +from google.cloud.bigquery.exceptions import LegacyPyarrowError from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -198,6 +199,7 @@ "EncryptionConfiguration", # Custom exceptions "LegacyBigQueryStorageError", + "LegacyPyarrowError", ] From 88fa115a1131ebbf86b55b490e07148ca3bae8d4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:22:00 -0600 Subject: [PATCH 34/47] whitespace in docstrings --- google/cloud/bigquery/_helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 4149c1ffc..14d0fc2f4 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -105,12 +105,15 @@ def is_read_session_optional(self) -> bool: def verify_version(self): """Verify that a recent enough version of BigQuery Storage extra is installed. + The function assumes that google-cloud-bigquery-storage extra is installed, and should thus be used in places where this assumption holds. + Because `pip` can install an outdated version of this extra despite the constraints in `setup.py`, the calling code can use this helper to verify the version compatibility at runtime. + Raises: LegacyBigQueryStorageError: If the google-cloud-bigquery-storage package is outdated. 
@@ -152,13 +155,17 @@ def use_compliant_nested_type(self) -> bool: def try_import(self, raise_if_error: bool = False) -> Any: """Verify that a recent enough version of pyarrow extra is installed. + The function assumes that pyarrow extra is installed, and should thus be used in places where this assumption holds. + Because `pip` can install an outdated version of this extra despite the constraints in `setup.py`, the calling code can use this helper to verify the version compatibility at runtime. + Returns: The ``pyarrow`` module or ``None``. + Raises: LegacyPyarrowError: If the pyarrow package is outdated and ``raise_if_error`` is ``True``. From 1b926aaaa863be817112ed50d221e85b5c285b82 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:22:55 -0600 Subject: [PATCH 35/47] format minimum bqstorage version string --- google/cloud/bigquery/_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 14d0fc2f4..014a721a8 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -121,7 +121,7 @@ def verify_version(self): if self.installed_version < _MIN_BQ_STORAGE_VERSION: msg = ( "Dependency google-cloud-bigquery-storage is outdated, please upgrade " - f"it to version >= 2.0.0 (version found: {self.installed_version})." + f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})." ) raise LegacyBigQueryStorageError(msg) From d71141d881f33f4bc6dfe99b362e061ce14e1ad7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:28:20 -0600 Subject: [PATCH 36/47] restore optional bqstorage_client --- google/cloud/bigquery/job/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 1d9ce9dfb..8a5719168 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1537,7 +1537,7 @@ def do_get_result(): def to_arrow( self, progress_bar_type: str = None, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, create_bqstorage_client: bool = True, max_results: Optional[int] = None, ) -> "pyarrow.Table": @@ -1611,7 +1611,7 @@ def to_arrow( # that should only exist here in the QueryJob method. 
def to_dataframe( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, From 51332d17dd08d698a7763984f9c0e65d74fe0de2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:32:02 -0600 Subject: [PATCH 37/47] restore optional bqstorage_client (in table.py) --- google/cloud/bigquery/table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 987ab5fb7..4d7870094 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1839,7 +1839,7 @@ def to_arrow( def to_dataframe_iterable( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Dict[str, Any] = None, max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore ) -> "pandas.DataFrame": @@ -1915,7 +1915,7 @@ def to_dataframe_iterable( # changes to job.QueryJob.to_dataframe() def to_dataframe( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, @@ -2276,7 +2276,7 @@ def to_geodataframe( def to_dataframe_iterable( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Optional[Dict[str, Any]] = None, max_queue_size: Optional[int] = None, ) -> Iterator["pandas.DataFrame"]: @@ -2306,7 +2306,7 @@ def to_dataframe_iterable( def to_arrow_iterable( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, max_queue_size: Optional[int] = None, ) -> Iterator["pyarrow.RecordBatch"]: """Create an iterable of pandas DataFrames, to process the table as a stream. From 4c296ae0f55d11d8f49d3940d9432f1e97b6a9e7 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:38:02 -0600 Subject: [PATCH 38/47] synchronize constraints and setup.py --- setup.py | 10 +++++----- testing/constraints-3.7.txt | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index accb733b4..689069a94 100644 --- a/setup.py +++ b/setup.py @@ -43,9 +43,9 @@ "packaging >= 14.3, <22.0.0dev", "protobuf>=3.19.5,<5.0.0dev,!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5", # For the legacy proto-based types. "python-dateutil >= 2.7.2, <3.0dev", - "pyarrow >= 3.0.0, < 11.0dev", "requests >= 2.21.0, < 3.0.0dev", ] +pyarrow_dependency = "pyarrow >= 3.0.0" extras = { # Keep the no-op bqstorage extra for backward compatibility. # See: https://github.com/googleapis/python-bigquery/issues/757 @@ -57,15 +57,15 @@ # See: https://github.com/googleapis/python-bigquery/issues/83 The # grpc.Channel.close() method isn't added until 1.32.0. 
# https://github.com/grpc/grpc/pull/15254 - "grpcio >= 1.38.1, < 2.0dev", - "pyarrow >= 3.0.0, < 9.0dev", + "grpcio >= 1.47.0, < 2.0dev", + pyarrow_dependency, ], "pandas": [ "pandas>=1.0.0", - "pyarrow >= 3.0.0, < 9.0dev", + pyarrow_dependency, "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": ["pyarrow >= 3.0.0, < 9.0dev"], + "bignumeric_type": [pyarrow_dependency], "ipywidgets": ["ipywidgets==7.7.1"], "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index e6768c276..f5ea2475f 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -17,7 +17,7 @@ ipython==7.0.1 opentelemetry-api==1.1.0 opentelemetry-instrumentation==0.20b0 opentelemetry-sdk==1.1.0 -pandas==1.1.0 +pandas==1.0.0 proto-plus==1.22.0 protobuf==3.19.5 pyarrow==3.0.0 @@ -25,4 +25,4 @@ python-dateutil==2.7.3 requests==2.21.0 Shapely==1.8.4 six==1.13.0 -tqdm==4.7.4 \ No newline at end of file +tqdm==4.7.4 From 6067f90a2eca1c798eec83552c84d85d75a0738d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:41:32 -0600 Subject: [PATCH 39/47] synchronize signatures --- google/cloud/bigquery/job/query.py | 2 +- google/cloud/bigquery/table.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/google/cloud/bigquery/job/query.py b/google/cloud/bigquery/job/query.py index 8a5719168..e32e74129 100644 --- a/google/cloud/bigquery/job/query.py +++ b/google/cloud/bigquery/job/query.py @@ -1698,7 +1698,7 @@ def to_dataframe( # that should only exist here in the QueryJob method. def to_geodataframe( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 4d7870094..a2110a9fb 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1674,7 +1674,7 @@ def _to_page_iterable( def to_arrow_iterable( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore ) -> Iterator["pyarrow.RecordBatch"]: """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream. 
@@ -1731,7 +1731,7 @@ def to_arrow_iterable( def to_arrow( self, progress_bar_type: str = None, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, create_bqstorage_client: bool = True, ) -> "pyarrow.Table": """[Beta] Create a class:`pyarrow.Table` by loading all pages of a @@ -2068,7 +2068,7 @@ def __can_cast_timestamp_ns(column): # changes to job.QueryJob.to_geodataframe() def to_geodataframe( self, - bqstorage_client: "bigquery_storage.BigQueryReadClient" = None, + bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None, dtypes: Dict[str, Any] = None, progress_bar_type: str = None, create_bqstorage_client: bool = True, From 6c2b8a540b992e0ffabaf9dcf7e7f92481d75ebe Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:43:10 -0600 Subject: [PATCH 40/47] remove unnecessary bignumeric_type extra --- setup.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.py b/setup.py index 689069a94..d5f7b39cf 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,6 @@ pyarrow_dependency, "db-dtypes>=0.3.0,<2.0.0dev", ], - "bignumeric_type": [pyarrow_dependency], "ipywidgets": ["ipywidgets==7.7.1"], "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], @@ -80,11 +79,6 @@ all_extras = [] for extra in extras: - # Exclude this extra from all to avoid overly strict dependencies on core - # libraries such as pyarrow. - # https://github.com/googleapis/python-bigquery/issues/563 - if extra in {"bignumeric_type"}: - continue all_extras.extend(extras[extra]) extras["all"] = all_extras From 8196a15dbff392c3dda28f0c71f707010d4eec1c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:51:18 -0600 Subject: [PATCH 41/47] more constraints sync --- setup.py | 4 ++-- testing/constraints-3.7.txt | 2 +- tests/unit/test__helpers.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index d5f7b39cf..9e1bfbbce 100644 --- a/setup.py +++ b/setup.py @@ -61,12 +61,12 @@ pyarrow_dependency, ], "pandas": [ - "pandas>=1.0.0", + "pandas>=1.1.0", pyarrow_dependency, "db-dtypes>=0.3.0,<2.0.0dev", ], "ipywidgets": ["ipywidgets==7.7.1"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.0, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.8.4, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index f5ea2475f..149d6c496 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -17,7 +17,7 @@ ipython==7.0.1 opentelemetry-api==1.1.0 opentelemetry-instrumentation==0.20b0 opentelemetry-sdk==1.1.0 -pandas==1.0.0 +pandas==1.1.0 proto-plus==1.22.0 protobuf==3.19.5 pyarrow==3.0.0 diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 6f6c63770..2de60664a 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -16,7 +16,6 @@ import datetime import decimal import unittest -import pytest import mock @@ -26,7 +25,7 @@ bigquery_storage = None try: - pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") + import pyarrow except ImportError: # pragma: NO COVER # Mock out pyarrow when missing, because methods from pyarrow.types are # used in test parameterization. 
From 5bac0836a55da7344f5fd832dcac3845c3d8f1b3 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 11:56:36 -0600 Subject: [PATCH 42/47] remove unnecessary mock --- tests/unit/test__helpers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 2de60664a..4fb86f665 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -27,9 +27,7 @@ try: import pyarrow except ImportError: # pragma: NO COVER - # Mock out pyarrow when missing, because methods from pyarrow.types are - # used in test parameterization. - pyarrow = mock.Mock() + pyarrow = None @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") From dafdb64b0a2569cdf60804748283ec0e6306c763 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 12:20:05 -0600 Subject: [PATCH 43/47] fix unittest skip --- tests/unit/job/test_query_pandas.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index 0811844f8..a2444efdd 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -51,12 +51,10 @@ pandas = pytest.importorskip("pandas") try: - pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1") + import pyarrow import pyarrow.types except ImportError: # pragma: NO COVER - # Mock out pyarrow when missing, because methods from pyarrow.types are - # used in test parameterization. - pyarrow = mock.Mock() + pyarrow = None @pytest.fixture From 805f5d3d6fe0d1bae048d40ad19736f97087607c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 12:23:26 -0600 Subject: [PATCH 44/47] synchronize constraints --- setup.py | 2 +- testing/constraints-3.7.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9e1bfbbce..666874053 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ "db-dtypes>=0.3.0,<2.0.0dev", ], "ipywidgets": ["ipywidgets==7.7.1"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.8.4, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.4.post2, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 149d6c496..57928714f 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -23,6 +23,6 @@ protobuf==3.19.5 pyarrow==3.0.0 python-dateutil==2.7.3 requests==2.21.0 -Shapely==1.8.4 +Shapely==1.6.4.post2 six==1.13.0 tqdm==4.7.4 From b85dcf351b9df0375c9a3c85e8d683e1865a9737 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 12:26:49 -0600 Subject: [PATCH 45/47] adjust shapely --- setup.py | 2 +- testing/constraints-3.7.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 666874053..9e1bfbbce 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ "db-dtypes>=0.3.0,<2.0.0dev", ], "ipywidgets": ["ipywidgets==7.7.1"], - "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.6.4.post2, <2.0dev"], + "geopandas": ["geopandas>=0.9.0, <1.0dev", "Shapely>=1.8.4, <2.0dev"], "ipython": ["ipython>=7.0.1,!=8.1.0"], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], "opentelemetry": [ diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 57928714f..149d6c496 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -23,6 +23,6 @@ protobuf==3.19.5 pyarrow==3.0.0 
python-dateutil==2.7.3 requests==2.21.0 -Shapely==1.6.4.post2 +Shapely==1.8.4 six==1.13.0 tqdm==4.7.4 From bf4f2184d8a5a256a009da333462c57fbac9a396 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 12:28:00 -0600 Subject: [PATCH 46/47] simplify with importorskip --- tests/unit/test_table_pandas.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py index f7e5471d8..5778467a5 100644 --- a/tests/unit/test_table_pandas.py +++ b/tests/unit/test_table_pandas.py @@ -20,15 +20,8 @@ from google.cloud import bigquery -try: - pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") - import pyarrow.types -except ImportError: # pragma: NO COVER - # Mock out pyarrow when missing, because methods from pyarrow.types are - # used in test parameterization. - pyarrow = mock.Mock() - pandas = pytest.importorskip("pandas") +pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0") TEST_PATH = "/v1/project/test-proj/dataset/test-dset/table/test-tbl/data" From 794f70c6623c6759d4d661e3d3047b3a6a010cad Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 8 Dec 2022 13:03:10 -0600 Subject: [PATCH 47/47] blacken --- tests/unit/test_table.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 58141863e..a79b98881 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2851,7 +2851,10 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): def mock_verify_version(): raise _helpers.LegacyBigQueryStorageError("no bqstorage") - with mock.patch("google.cloud.bigquery._helpers.BQ_STORAGE_VERSIONS.verify_version", mock_verify_version): + with mock.patch( + "google.cloud.bigquery._helpers.BQ_STORAGE_VERSIONS.verify_version", + mock_verify_version, + ): tbl = row_iterator.to_arrow(create_bqstorage_client=True) mock_client._ensure_bqstorage_client.assert_not_called()
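
Taken together, the patches above route every optional pyarrow import through _helpers.PYARROW_VERSIONS.try_import() and surface an unusable installation as LegacyPyarrowError. Below is a minimal sketch of that calling pattern under the assumptions these diffs establish (pyarrow >= 3.0.0 as the supported floor); the else branch and its printed message are illustrative additions, not library code:

    from google.cloud.bigquery import _helpers
    from google.cloud.bigquery.exceptions import LegacyPyarrowError

    # try_import() returns the pyarrow module when a compatible version is
    # available; otherwise it returns None (or raises when raise_if_error=True).
    pyarrow = _helpers.PYARROW_VERSIONS.try_import()

    if pyarrow:
        # Submodules are imported only after pyarrow is known to be usable.
        import pyarrow.types
    else:
        # Illustrative fallback: code that cannot continue without pyarrow can
        # ask the helper to raise instead of returning None.
        try:
            _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
        except LegacyPyarrowError as exc:
            print(f"pyarrow extra is missing or outdated: {exc}")

This mirrors how google/cloud/bigquery/_pandas_helpers.py and tests/unit/test__pandas_helpers.py consume the helper after this series: call try_import() once, then import pyarrow submodules only when the returned module is not None.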