Skip to content

Commit cdb4cc4

Browse files
tswast authored and plamut committed
Raise with extra or missing columns in load_table_from_dataframe schema. (#9096)
I found it difficult to debug typos in column/index names in the schema, so I have hardened the error messages to indicate when unknown field names are found.
1 parent 0f6fa6b commit cdb4cc4

File tree

3 files changed: 50 additions and 27 deletions.

bigquery/google/cloud/bigquery/_pandas_helpers.py

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -211,15 +211,18 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
211211
"https://github.com/googleapis/google-cloud-python/issues/8191"
212212
)
213213
bq_schema_index = {field.name: field for field in bq_schema}
214+
bq_schema_unused = set(bq_schema_index.keys())
214215
else:
215216
bq_schema_index = {}
217+
bq_schema_unused = set()
216218

217219
bq_schema_out = []
218220
for column, dtype in zip(dataframe.columns, dataframe.dtypes):
219221
# Use provided type from schema, if present.
220222
bq_field = bq_schema_index.get(column)
221223
if bq_field:
222224
bq_schema_out.append(bq_field)
225+
bq_schema_unused.discard(bq_field.name)
223226
continue
224227

225228
# Otherwise, try to automatically determine the type based on the
@@ -230,6 +233,15 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
230233
return None
231234
bq_field = schema.SchemaField(column, bq_type)
232235
bq_schema_out.append(bq_field)
236+
237+
# Catch any schema mismatch. The developer explicitly asked to serialize a
238+
# column, but it was not found.
239+
if bq_schema_unused:
240+
raise ValueError(
241+
"bq_schema contains fields not present in dataframe: {}".format(
242+
bq_schema_unused
243+
)
244+
)
233245
return tuple(bq_schema_out)
234246

235247

@@ -248,9 +260,21 @@ def dataframe_to_arrow(dataframe, bq_schema):
248260
Table containing dataframe data, with schema derived from
249261
BigQuery schema.
250262
"""
251-
if len(bq_schema) != len(dataframe.columns):
263+
column_names = set(dataframe.columns)
264+
bq_field_names = set(field.name for field in bq_schema)
265+
266+
extra_fields = bq_field_names - column_names
267+
if extra_fields:
268+
raise ValueError(
269+
"bq_schema contains fields not present in dataframe: {}".format(
270+
extra_fields
271+
)
272+
)
273+
274+
missing_fields = column_names - bq_field_names
275+
if missing_fields:
252276
raise ValueError(
253-
"Number of columns in schema must match number of columns in dataframe."
277+
"bq_schema is missing fields from dataframe: {}".format(missing_fields)
254278
)
255279

256280
arrow_arrays = []

bigquery/tests/unit/test__pandas_helpers.py

Lines changed: 17 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -609,12 +609,26 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch):
609609

610610
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
611611
@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
612-
def test_dataframe_to_parquet_w_missing_columns(module_under_test, monkeypatch):
612+
def test_dataframe_to_parquet_w_extra_fields(module_under_test, monkeypatch):
613613
with pytest.raises(ValueError) as exc_context:
614614
module_under_test.dataframe_to_parquet(
615-
pandas.DataFrame(), (schema.SchemaField("not_found", "STRING"),), None
615+
pandas.DataFrame(), (schema.SchemaField("not_in_df", "STRING"),), None
616616
)
617-
assert "columns in schema must match" in str(exc_context.value)
617+
message = str(exc_context.value)
618+
assert "bq_schema contains fields not present in dataframe" in message
619+
assert "not_in_df" in message
620+
621+
622+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
623+
@pytest.mark.skipif(pyarrow is None, reason="Requires `pyarrow`")
624+
def test_dataframe_to_parquet_w_missing_fields(module_under_test, monkeypatch):
625+
with pytest.raises(ValueError) as exc_context:
626+
module_under_test.dataframe_to_parquet(
627+
pandas.DataFrame({"not_in_bq": [1, 2, 3]}), (), None
628+
)
629+
message = str(exc_context.value)
630+
assert "bq_schema is missing fields from dataframe" in message
631+
assert "not_in_bq" in message
618632

619633

620634
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")

bigquery/tests/unit/test_client.py

Lines changed: 7 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -5517,7 +5517,6 @@ def test_load_table_from_dataframe_w_partial_schema(self):
55175517
@unittest.skipIf(pandas is None, "Requires `pandas`")
55185518
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
55195519
def test_load_table_from_dataframe_w_partial_schema_extra_types(self):
5520-
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
55215520
from google.cloud.bigquery import job
55225521
from google.cloud.bigquery.schema import SchemaField
55235522

@@ -5540,31 +5539,17 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self):
55405539
SchemaField("unknown_col", "BYTES"),
55415540
)
55425541
job_config = job.LoadJobConfig(schema=schema)
5543-
with load_patch as load_table_from_file:
5542+
with load_patch as load_table_from_file, pytest.raises(
5543+
ValueError
5544+
) as exc_context:
55445545
client.load_table_from_dataframe(
55455546
dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION
55465547
)
55475548

5548-
load_table_from_file.assert_called_once_with(
5549-
client,
5550-
mock.ANY,
5551-
self.TABLE_REF,
5552-
num_retries=_DEFAULT_NUM_RETRIES,
5553-
rewind=True,
5554-
job_id=mock.ANY,
5555-
job_id_prefix=None,
5556-
location=self.LOCATION,
5557-
project=None,
5558-
job_config=mock.ANY,
5559-
)
5560-
5561-
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
5562-
assert sent_config.source_format == job.SourceFormat.PARQUET
5563-
assert tuple(sent_config.schema) == (
5564-
SchemaField("int_col", "INTEGER"),
5565-
SchemaField("int_as_float_col", "INTEGER"),
5566-
SchemaField("string_col", "STRING"),
5567-
)
5549+
load_table_from_file.assert_not_called()
5550+
message = str(exc_context.value)
5551+
assert "bq_schema contains fields not present in dataframe" in message
5552+
assert "unknown_col" in message
55685553

55695554
@unittest.skipIf(pandas is None, "Requires `pandas`")
55705555
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")

0 commit comments

Comments
 (0)