diff --git a/bigframes/core/explode.py b/bigframes/core/explode.py new file mode 100644 index 0000000000..142536a931 --- /dev/null +++ b/bigframes/core/explode.py @@ -0,0 +1,36 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for implementing 'explode' functions.""" + +from typing import cast, Sequence, Union + +import bigframes.core.blocks as blocks +import bigframes.core.utils as utils + + +def check_column( + column: Union[blocks.Label, Sequence[blocks.Label]], +) -> Sequence[blocks.Label]: + if not utils.is_list_like(column): + column_labels = cast(Sequence[blocks.Label], (column,)) + else: + column_labels = cast(Sequence[blocks.Label], tuple(column)) + + if not column_labels: + raise ValueError("column must be nonempty") + if len(column_labels) > len(set(column_labels)): + raise ValueError("column must be unique") + + return column_labels diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a174ef0b0f..cb9c904121 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -55,6 +55,7 @@ import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.convert +import bigframes.core.explode import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.guid @@ -71,6 +72,7 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.operations.plotting as plotting +import bigframes.operations.structs import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery @@ -2875,15 +2877,7 @@ def explode( *, ignore_index: Optional[bool] = False, ) -> DataFrame: - if not utils.is_list_like(column): - column_labels = typing.cast(typing.Sequence[blocks.Label], (column,)) - else: - column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column)) - - if not column_labels: - raise ValueError("column must be nonempty") - if len(column_labels) > len(set(column_labels)): - raise ValueError("column must be unique") + column_labels = bigframes.core.explode.check_column(column) column_ids = [self._resolve_label_exact(label) for label in column_labels] missing = [ @@ -3751,6 +3745,10 @@ def __matmul__(self, other) -> DataFrame: __matmul__.__doc__ = inspect.getdoc(vendored_pandas_frame.DataFrame.__matmul__) + @property + def struct(self): + return bigframes.operations.structs.StructFrameAccessor(self) + def _throw_if_null_index(self, opname: str): if not self._has_index: raise bigframes.exceptions.NullIndexError( diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index d222f0993b..051023c299 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -57,3 +57,26 @@ def dtypes(self) -> pd.Series: ], index=[pa_type.field(i).name for i in range(pa_type.num_fields)], ) + + +@log_adapter.class_logger +class StructFrameAccessor(vendoracessors.StructFrameAccessor): + __doc__ = vendoracessors.StructAccessor.__doc__ + + def __init__(self, data: bigframes.dataframe.DataFrame) -> None: + self._parent = data + + def explode(self, column, *, separator: str = ".") -> bigframes.dataframe.DataFrame: + df = self._parent + column_labels = bigframes.core.explode.check_column(column) + + for label in column_labels: + position = df.columns.to_list().index(label) + df = df.drop(columns=label) + subfields = self._parent[label].struct.explode() + for subfield in reversed(subfields.columns): + df.insert( + position, f"{label}{separator}{subfield}", subfields[subfield] + ) + + return df diff --git a/tests/data/nested.jsonl b/tests/data/nested.jsonl index a71e9b1db1..751ad0df78 100644 --- a/tests/data/nested.jsonl +++ b/tests/data/nested.jsonl @@ -1,4 +1,4 @@ -{"rowindex":0,"customer_id":"jkl","day":"2023-12-18","flag":1,"event_sequence":[{"category":"B","timestamp":"2023-12-18 03:43:58","data":[{"key":"x","value":20.2533015856},{"key":"y","value":42.8363462389}]},{"category":"D","timestamp":"2023-12-18 07:15:37","data":[{"key":"x","value":62.0762664928},{"key":"z","value":83.6655402432}]}]} +{"rowindex":0,"customer_id":"jkl","day":"2023-12-18","flag":1,"label":{"key": "my-key","value":"my-value"},"event_sequence":[{"category":"B","timestamp":"2023-12-18 03:43:58","data":[{"key":"x","value":20.2533015856},{"key":"y","value":42.8363462389}]},{"category":"D","timestamp":"2023-12-18 07:15:37","data":[{"key":"x","value":62.0762664928},{"key":"z","value":83.6655402432}]}],"address":{"street":"123 Test Lane","city":"Testerchon"}} {"rowindex":1,"customer_id":"def","day":"2023-12-18","flag":2,"event_sequence":[{"category":"D","timestamp":"2023-12-18 23:11:11","data":[{"key":"w","value":36.1388065179}]},{"category":"B","timestamp":"2023-12-18 07:12:50","data":[{"key":"z","value":68.7673488304}]},{"category":"D","timestamp":"2023-12-18 09:09:03","data":[{"key":"x","value":57.4139647019}]},{"category":"C","timestamp":"2023-12-18 13:05:30","data":[{"key":"z","value":36.087871201}]}]} {"rowindex":2,"customer_id":"abc","day":"2023-12-6","flag":0,"event_sequence":[{"category":"C","timestamp":"2023-12-06 10:37:11","data":[]},{"category":"A","timestamp":"2023-12-06 03:35:44","data":[]},{"category":"D","timestamp":"2023-12-06 13:10:57","data":[{"key":"z","value":21.8487807658}]},{"category":"B","timestamp":"2023-12-06 01:39:16","data":[{"key":"y","value":1.6380505139}]}]} {"rowindex":3,"customer_id":"mno","day":"2023-12-16","flag":2,"event_sequence":[]} diff --git a/tests/data/nested_schema.json b/tests/data/nested_schema.json index c3fa39b36d..2b843bb395 100644 --- a/tests/data/nested_schema.json +++ b/tests/data/nested_schema.json @@ -19,6 +19,20 @@ "name": "flag", "type": "INTEGER" }, + { + "fields": [ + { + "name": "key", + "type": "STRING" + }, + { + "name": "value", + "type": "STRING" + } + ], + "name": "label", + "type": "RECORD" + }, { "fields": [ { @@ -52,5 +66,19 @@ "mode": "REPEATED", "name": "event_sequence", "type": "RECORD" + }, + { + "fields": [ + { + "name": "street", + "type": "STRING" + }, + { + "name": "city", + "type": "STRING" + } + ], + "name": "address", + "type": "RECORD" } ] diff --git a/tests/system/small/operations/test_struct.py b/tests/system/small/operations/test_struct.py new file mode 100644 index 0000000000..ddb65248d0 --- /dev/null +++ b/tests/system/small/operations/test_struct.py @@ -0,0 +1,40 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_dataframe_struct_explode_multiple_columns(nested_df): + got = nested_df.struct.explode(["label", "address"]) + assert got.columns.to_list() == [ + "customer_id", + "day", + "flag", + "label.key", + "label.value", + "event_sequence", + "address.street", + "address.city", + ] + + +def test_dataframe_struct_explode_separator(nested_df): + got = nested_df.struct.explode("label", separator="__sep__") + assert got.columns.to_list() == [ + "customer_id", + "day", + "flag", + "label__sep__key", + "label__sep__value", + "event_sequence", + "address", + ] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e1644c20b4..ddcf044911 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1436,6 +1436,14 @@ def test_get_dtypes_array_struct_table(nested_df): "customer_id": pd.StringDtype(storage="pyarrow"), "day": pd.ArrowDtype(pa.date32()), "flag": pd.Int64Dtype(), + "label": pd.ArrowDtype( + pa.struct( + [ + ("key", pa.string()), + ("value", pa.string()), + ] + ), + ), "event_sequence": pd.ArrowDtype( pa.list_( pa.struct( @@ -1457,6 +1465,14 @@ def test_get_dtypes_array_struct_table(nested_df): ), ), ), + "address": pd.ArrowDtype( + pa.struct( + [ + ("street", pa.string()), + ("city", pa.string()), + ] + ), + ), } ), ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index bd6e50d096..4cba928bb6 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -121,3 +121,56 @@ def dtypes(self): A *pandas* Series with the data type of all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +class StructFrameAccessor: + """ + Accessor object for structured data properties of the DataFrame values. + """ + + def explode(self, column, *, separator: str = "."): + """ + Extract all child fields of struct column(s) and add to the DataFrame. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> countries = bpd.Series(["cn", "es", "us"]) + >>> files = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> downloads = bpd.Series([100, 200, 300]) + >>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads}) + >>> df.struct.explode("file") + country file.version file.project download_count + 0 cn 1 pandas 100 + 1 es 2 pandas 200 + 2 us 1 numpy 300 + + [3 rows x 4 columns] + + Args: + column: + Column(s) to explode. For multiple columns, specify a non-empty + list with each element be str or tuple, and all specified + columns their list-like data on same row of the frame must + have matching length. + separator: + Separator/delimiter to use to separate the original column name + from the sub-field column name. + + + Returns: + DataFrame: + Original DataFrame with exploded struct column(s). + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)